In [None]:
# -------------------------------------------
# Notebook 3: Model Monitoring & Drift Detection
# -------------------------------------------

import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, chi2_contingency

# Create the input and report directories up front; ones that already exist
# are left untouched.
for folder in ("data", "drift_reports"):
    os.makedirs(folder, exist_ok=True)


In [None]:
# --- Trained model ----------------------------------------------------------
# Fail early with a pointer to Notebook 1 when the serialized model is missing.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model_path = "models/random_forest_model.pkl"
if not os.path.exists(model_path):
    raise FileNotFoundError("Trained model not found. Please run Notebook 1 first.")

model = joblib.load(model_path)
print("Model loaded successfully.")

# --- Reference (training) data ----------------------------------------------
# '?' cells are treated as missing and every row containing one is discarded;
# the income label becomes 1 when it contains '>50K' and 0 otherwise.
train_df = (
    pd.read_csv("data/adult.csv")
    .replace('?', np.nan)
    .dropna()
    .assign(income=lambda frame: frame['income'].map(lambda label: int('>50K' in label)))
)

# One-hot encode every column that is still text after the label conversion.
categorical_cols = train_df.select_dtypes(include='object').columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)

# Feature matrix: every column except the target.
X_train = train_df.drop(columns="income")


In [None]:
# Simulated incoming batch: 100 rows drawn (seeded) from the same CSV the
# training frame was built from, then cleaned and label-encoded with the same
# steps. Rows containing '?' are dropped *after* sampling, so the batch may end
# up with fewer than 100 rows.
batch_df = (
    pd.read_csv("data/adult.csv")
    .sample(n=100, random_state=101)
    .replace('?', np.nan)
    .dropna()
    .assign(income=lambda frame: frame['income'].map(lambda label: int('>50K' in label)))
)

# One-hot encode the text columns that remain in the batch.
categorical_cols = batch_df.select_dtypes(include='object').columns
batch_df = pd.get_dummies(batch_df, columns=categorical_cols)

# Force the training column layout: indicator columns the batch never produced
# are added and filled with 0, and the column order follows train_df.
batch_df = batch_df.reindex(columns=train_df.columns, fill_value=0)

# Feature matrix: every column except the target.
X_batch = batch_df.drop(columns="income")


In [None]:
# Kolmogorov–Smirnov test for drift in the numerical features.
#
# For every numerical feature the training sample is compared with the batch
# sample using a two-sample KS test; a feature is flagged as drifted when the
# p-value falls below KS_ALPHA. No multiple-testing correction is applied.
KS_ALPHA = 0.05  # single source of truth for the drift threshold used below

# Keep the feature set independent of the installed pandas version:
# pd.get_dummies produced uint8 indicator columns before pandas 2.0 (bool since),
# and uint8 counts as np.number. Columns parsed by read_csv are never uint8, so
# dropping that dtype removes only one-hot indicators, which belong to the
# chi-squared check rather than to a test designed for continuous data.
numeric_dtypes = X_train.select_dtypes(include=np.number).dtypes
numerical_cols = pd.Index(
    [col for col, dtype in numeric_dtypes.items() if dtype != np.uint8]
)

# (feature, p_value) pairs, one per tested feature.
ks_results = []
for col in numerical_cols:
    p = ks_2samp(X_train[col], X_batch[col]).pvalue
    ks_results.append((col, p))

# Flagging drift where p < KS_ALPHA
ks_drift = [(col, p) for col, p in ks_results if p < KS_ALPHA]
print(f"KS Drift Detected in {len(ks_drift)} of {len(numerical_cols)} features")

# Save KS drift report
ks_df = pd.DataFrame(ks_results, columns=['feature', 'p_value'])
ks_df['drift_detected'] = ks_df['p_value'] < KS_ALPHA
ks_df.to_csv("drift_reports/ks_drift_report.csv", index=False)


In [None]:
# Detect drift in categorical variables.
#
# Each one-hot indicator column belonging to the selected variables is compared
# between training and batch with a chi-squared test on a 2x2 table:
#     rows    -> (train, batch)
#     columns -> (indicator absent, indicator present)
# A feature is flagged when the p-value falls below CHI2_ALPHA. No
# multiple-testing correction is applied.
CHI2_ALPHA = 0.05  # single source of truth for the drift threshold used below
CATEGORICAL_PREFIXES = ('native-country_', 'sex_', 'race_')

# Prefix match (rather than substring match) so that only the indicator columns
# generated for these variables are selected.
cat_columns = [col for col in X_train.columns if col.startswith(CATEGORICAL_PREFIXES)]

# (feature, p_value) pairs, one per tested indicator column.
chi2_results = []

for col in cat_columns:
    # Count presences explicitly instead of aligning two value_counts() results:
    # training indicators may be bool while batch columns added by
    # reindex(fill_value=0) hold integer zeros, and the explicit counts do not
    # depend on how those differently-typed indexes would line up.
    train_present = int(X_train[col].astype(bool).sum())
    batch_present = int(X_batch[col].astype(bool).sum())
    obs = np.array([
        [len(X_train) - train_present, train_present],
        [len(X_batch) - batch_present, batch_present],
    ])

    # A level (absent/present) seen in neither sample would give zero expected
    # frequencies, which chi2_contingency rejects; drop it. The remaining
    # single-column table has zero degrees of freedom and yields p = 1.0.
    obs = obs[:, obs.sum(axis=0) > 0]

    if (obs.sum(axis=1) == 0).any():
        # One of the samples has no rows at all: the test is undefined, so
        # record NaN (never flagged as drift) instead of raising.
        p = np.nan
    else:
        p = chi2_contingency(obs)[1]
    chi2_results.append((col, p))

# Flag drift
chi2_drift = [(col, p) for col, p in chi2_results if p < CHI2_ALPHA]
print(f"Chi2 Drift Detected in {len(chi2_drift)} of {len(cat_columns)} features")

# Save Chi-squared report
chi_df = pd.DataFrame(chi2_results, columns=['feature', 'p_value'])
chi_df['drift_detected'] = chi_df['p_value'] < CHI2_ALPHA
chi_df.to_csv("drift_reports/chi2_drift_report.csv", index=False)


In [None]:
# Plot the train-vs-batch density of the first feature flagged by the KS test
# and save the figure next to the drift reports.
if not ks_drift:
    print("No drifted numerical features to visualize.")
else:
    feature = ks_drift[0][0]  # ks_drift holds (feature, p_value) pairs
    fig, ax = plt.subplots(figsize=(8, 5))
    for sample, tag in ((X_train, 'Train'), (X_batch, 'Batch')):
        sns.kdeplot(sample[feature], label=tag, ax=ax)
    ax.set_title(f"Drift Detected: {feature}")
    ax.legend()
    fig.tight_layout()
    fig.savefig(f"drift_reports/{feature}_drift.png")
    plt.show()