### Detect Data Drift in ML Models
**Objective**: Monitor and detect changes in data distributions that impact ML model performance.

**Task**: Categorical Feature Drift

**Steps**:
1. Load the baseline distribution for a categorical feature (e.g., `gender`) from your training dataset.
2. Load the same feature from your current production data.
3. Use chi-squared tests to compare the distributions of the categorical feature.
4. If significant drift is detected, investigate the cause and update the model as needed.

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# ---------------------- Drift Detection Function ----------------------

def detect_categorical_drift(baseline_df, production_df, column_name, alpha=0.05):
    """Test whether a categorical column is distributed differently in two frames.

    Per-category frequencies of ``column_name`` are counted in both frames,
    aligned over the union of observed categories, and compared with a
    chi-squared test of independence (``scipy.stats.chi2_contingency``).
    Missing values are ignored because ``value_counts`` drops them by default.

    Args:
        baseline_df: Reference DataFrame (e.g. the training data).
        production_df: DataFrame to compare against the baseline.
        column_name: Column to compare; must exist in both frames and hold
            object, string, or pandas ``category`` dtype data.
        alpha: Significance level; drift is reported when ``p_value < alpha``.

    Returns:
        dict with keys ``chi2_statistic`` (float), ``p_value`` (float),
        ``drift_detected`` (bool), ``baseline_distribution`` and
        ``production_distribution`` (category -> int count, sharing one
        sorted category order).

    Raises:
        ValueError: If either frame is empty, the column is missing from
            either frame, the column is not categorical-like, or the column
            has no non-null values in one of the frames.
    """
    # Check if dataframes are empty
    if baseline_df.empty:
        raise ValueError("Baseline DataFrame is empty.")
    if production_df.empty:
        raise ValueError("Production DataFrame is empty.")

    # Check for column presence
    if column_name not in baseline_df.columns:
        raise ValueError(f"'{column_name}' column not found in baseline data.")
    if column_name not in production_df.columns:
        raise ValueError(f"'{column_name}' column not found in production data.")

    def _is_categorical_like(series):
        # Object columns were the only accepted kind originally; dedicated
        # string dtypes and pandas Categoricals are just as categorical.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def _observed_counts(series):
        # Casting to object first means a Categorical reports only the
        # categories that actually occur; an unused level would otherwise
        # contribute an all-zero column and make the chi-squared test fail.
        counts = series.astype(object).value_counts()
        return {category: int(count) for category, count in counts.items()}

    baseline_col = baseline_df[column_name]
    production_col = production_df[column_name]

    # Ensure column contains categorical/string data
    if not (_is_categorical_like(baseline_col) and _is_categorical_like(production_col)):
        raise ValueError(f"'{column_name}' column must contain categorical values (object/string dtype).")

    # Frequency counts
    baseline_counts = _observed_counts(baseline_col)
    production_counts = _observed_counts(production_col)

    # An all-missing column would yield an empty/zero row, which scipy
    # rejects with a less helpful message; fail early and clearly instead.
    if not baseline_counts:
        raise ValueError(f"'{column_name}' column has no non-null values in baseline data.")
    if not production_counts:
        raise ValueError(f"'{column_name}' column has no non-null values in production data.")

    # Align categories
    categories = set(baseline_counts) | set(production_counts)
    try:
        all_categories = sorted(categories)
    except TypeError:
        # Mixed-type labels (e.g. str and int) cannot be ordered directly.
        all_categories = sorted(categories, key=str)
    baseline_freq = [baseline_counts.get(cat, 0) for cat in all_categories]
    production_freq = [production_counts.get(cat, 0) for cat in all_categories]

    # Chi-squared test (scipy applies Yates' continuity correction by
    # default when the table has one degree of freedom, i.e. 2x2).
    chi2, p_value, _, _ = chi2_contingency([baseline_freq, production_freq])

    # Native Python scalars keep the result printable and serializable.
    return {
        "chi2_statistic": float(chi2),
        "p_value": float(p_value),
        "drift_detected": bool(p_value < alpha),
        "baseline_distribution": dict(zip(all_categories, baseline_freq)),
        "production_distribution": dict(zip(all_categories, production_freq)),
    }

# ---------------------- Test Cases ----------------------

def run_tests():
    """Run detect_categorical_drift on fixed inputs and verify each outcome.

    Inputs are built from exact category counts rather than random draws, so
    every run produces the same statistics. Each scenario is checked before
    its status line is printed.

    Raises:
        AssertionError: If a scenario does not produce the expected result,
            or an expected ValueError is not raised.
    """
    print("Running test cases...\n")

    def gender_frame(n_male, n_female):
        # Explicit object dtype so the input dtype does not depend on how
        # the installed pandas version infers string columns.
        labels = ['Male'] * n_male + ['Female'] * n_female
        return pd.DataFrame({'gender': pd.Series(labels, dtype=object)})

    # Test 1: No Drift (60/40 baseline vs. 59/41 production)
    baseline = gender_frame(600, 400)
    production1 = gender_frame(590, 410)
    result1 = detect_categorical_drift(baseline, production1, 'gender')
    assert not result1["drift_detected"], "Test 1 - drift reported for near-identical distributions"
    print("✅ Test 1 - No Significant Drift")
    print(result1, "\n")

    # Test 2: Drift Detected (60/40 baseline vs. roughly 30/70 production)
    production2 = gender_frame(305, 695)
    result2 = detect_categorical_drift(baseline, production2, 'gender')
    assert result2["drift_detected"], "Test 2 - drift not reported for a large distribution shift"
    print("✅ Test 2 - Drift Detected")
    print(result2, "\n")

    # Test 3: Missing Column
    try:
        detect_categorical_drift(baseline, production2.rename(columns={'gender': 'sex'}), 'gender')
    except ValueError as e:
        print("⚠️ Test 3 - Missing Column Error Caught:", e, "\n")
    else:
        raise AssertionError("Test 3 - expected ValueError for a missing column")

    # Test 4: Non-Categorical Data
    numeric_baseline = pd.DataFrame({'gender': np.linspace(0.0, 1.0, 100)})
    numeric_production = pd.DataFrame({'gender': np.linspace(0.0, 1.0, 100)})
    try:
        detect_categorical_drift(numeric_baseline, numeric_production, 'gender')
    except ValueError as e:
        print("⚠️ Test 4 - Non-Categorical Column Error Caught:", e, "\n")
    else:
        raise AssertionError("Test 4 - expected ValueError for a numeric column")

    # Test 5: Empty DataFrame
    try:
        detect_categorical_drift(pd.DataFrame(), production1, 'gender')
    except ValueError as e:
        print("⚠️ Test 5 - Empty DataFrame Error Caught:", e, "\n")
    else:
        raise AssertionError("Test 5 - expected ValueError for an empty DataFrame")

# Run the scenarios immediately when this cell/script executes.
run_tests()

Running test cases...

✅ Test 1 - No Significant Drift
{'chi2_statistic': 0.16806722689075632, 'p_value': 0.6818352676908217, 'drift_detected': False, 'baseline_distribution': {'Female': 400, 'Male': 600}, 'production_distribution': {'Female': 410, 'Male': 590}} 

✅ Test 2 - Drift Detected
{'chi2_statistic': 174.44637856656323, 'p_value': 7.908687810724053e-40, 'drift_detected': True, 'baseline_distribution': {'Female': 400, 'Male': 600}, 'production_distribution': {'Female': 695, 'Male': 305}} 

⚠️ Test 3 - Missing Column Error Caught: 'gender' column not found in production data. 

⚠️ Test 4 - Non-Categorical Column Error Caught: 'gender' column must contain categorical values (object/string dtype). 

⚠️ Test 5 - Empty DataFrame Error Caught: Baseline DataFrame is empty. 

