### Implementing Adversarial Validation for Data Drift
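Description: Create and train a classifier that distinguishes between train and test datasets, using the classifier’s performance to infer data drift. A validation AUC close to 0.5 means the classifier cannot tell the two sets apart (little or no drift); an AUC well above 0.5 means the feature distributions differ.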

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Adversarial validation: tag every row with its origin (train = 0, test = 1),
# fit a classifier to tell the two origins apart, and read its hold-out ROC AUC
# as a drift signal. An AUC near 0.5 means the sets are hard to distinguish.

# Tunable settings, kept together instead of scattered inline literals
SEED = 42                      # shared by data generation, the split and the forest
N_SAMPLES = 1000               # rows generated for each of the two datasets
FEATURE_COLS = ['feature1', 'feature2']
# Heuristic cutoff above which drift is reported.
# NOTE(review): with the shift simulated below, the recorded run printed an AUC
# of about 0.59, i.e. under this cutoff, so the simulated drift is not flagged.
# Confirm whether 0.7 is the intended sensitivity or whether a lower cutoff /
# significance test is wanted.
DRIFT_AUC_THRESHOLD = 0.7

# Step 1: Create synthetic train and test datasets
np.random.seed(SEED)

# Train data: normal distribution
train_data = pd.DataFrame({
    'feature1': np.random.normal(0, 1, N_SAMPLES),
    'feature2': np.random.normal(5, 2, N_SAMPLES)
})

# Test data: slightly shifted distribution (simulate drift)
test_data = pd.DataFrame({
    'feature1': np.random.normal(0.5, 1, N_SAMPLES),
    'feature2': np.random.normal(4.8, 2.2, N_SAMPLES)
})

# Step 2: Label the data
# assign() returns labelled copies, so train_data / test_data stay pure feature
# frames and the origin label cannot leak into any later use of them.
train_labeled = train_data.assign(is_test=0)
test_labeled = test_data.assign(is_test=1)

# Step 3: Combine (row shuffling is done by train_test_split in Step 4)
combined = pd.concat([train_labeled, test_labeled], ignore_index=True)
X = combined[FEATURE_COLS]
y = combined['is_test']

# Step 4: Split for validation (70% fit / 30% hold-out, shuffled by default)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Step 5: Train classifier to distinguish train vs test
clf = RandomForestClassifier(n_estimators=100, random_state=SEED)
clf.fit(X_train, y_train)

# Step 6: Evaluate classifier performance
# Column 1 of predict_proba holds the predicted probability of is_test == 1.
y_pred_proba = clf.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred_proba)

print(f"Adversarial Validation AUC: {auc_score:.4f}")

# Step 7: Interpret result
if auc_score > DRIFT_AUC_THRESHOLD:
    print("Warning: Significant data drift detected between train and test sets.")
else:
    print("No significant data drift detected.")


Adversarial Validation AUC: 0.5893
No significant data drift detected.
