### Implementing Adversarial Validation for Data Drift
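Description: Train a classifier to distinguish training samples from test samples, then use its hold-out ROC AUC to infer data drift: an AUC near 0.5 means the two sets are indistinguishable, while an AUC approaching 1.0 means their feature distributions differ.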

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# --- Configuration -----------------------------------------------------------
# Every tunable value lives here so the experiment can be adjusted in one place.
N_SAMPLES = 1000            # rows generated for each of the train and test sets
N_FEATURES = 10             # total feature columns
N_INFORMATIVE = 8           # informative columns passed to make_classification
TRAIN_DATA_SEED = 42        # seed for the simulated training set
TEST_DATA_SEED = 24         # seed for the simulated test set
TEST_SHIFT = 0.5            # constant offset added to every test feature
ADV_VAL_FRACTION = 0.3      # share of the combined data held out for scoring
ADV_SEED = 42               # seed for the adversarial split and the forest
N_ESTIMATORS = 100          # number of trees in the adversarial classifier
DRIFT_AUC_THRESHOLD = 0.75  # hold-out AUC above this is reported as drift

# Arguments shared by both simulated datasets.
sim_params = dict(
    n_samples=N_SAMPLES, n_features=N_FEATURES, n_informative=N_INFORMATIVE
)

# --- Simulate data -----------------------------------------------------------
# The original class labels (y_train / y_test) are not used by adversarial
# validation; only the feature matrices matter below.
X_train, y_train = make_classification(**sim_params, random_state=TRAIN_DATA_SEED)

# NOTE(review): the test set differs from the train set in two ways: the
# explicit `shift`, and a different random_state. A different seed makes
# make_classification redraw the whole random generating process, so the
# resulting AUC reflects more than the 0.5 shift alone. To isolate the effect
# of the shift, reuse the training seed here.
X_test, y_test = make_classification(
    **sim_params, shift=TEST_SHIFT, random_state=TEST_DATA_SEED
)

# --- Build the adversarial dataset -------------------------------------------
# Origin label: 0 = row came from the train set, 1 = row came from the test set.
X_combined = np.vstack([X_train, X_test])
y_combined = np.hstack([np.zeros(len(X_train)), np.ones(len(X_test))])
assert X_combined.shape[0] == y_combined.shape[0], "features/labels misaligned"

# Stratify so both origins are equally represented in the hold-out split.
X_adv_train, X_adv_val, y_adv_train, y_adv_val = train_test_split(
    X_combined,
    y_combined,
    test_size=ADV_VAL_FRACTION,
    random_state=ADV_SEED,
    stratify=y_combined,
)

# --- Train and score the adversarial classifier ------------------------------
clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=ADV_SEED)
clf.fit(X_adv_train, y_adv_train)

# Column 1 of predict_proba is the probability of the "test set" origin (label 1).
y_pred_proba = clf.predict_proba(X_adv_val)[:, 1]

# AUC near 0.5 means the classifier cannot tell the sets apart; near 1.0 means
# they are easily separable.
auc_score = roc_auc_score(y_adv_val, y_pred_proba)

# --- Report ------------------------------------------------------------------
print(f"Adversarial Validation ROC AUC: {auc_score:.4f}")

if auc_score > DRIFT_AUC_THRESHOLD:
    print("⚠️ High classifier performance → Data drift likely present.")
else:
    print("✅ Low classifier performance → Data distributions are similar.")

Adversarial Validation ROC AUC: 0.9961
⚠️ High classifier performance → Data drift likely present.
