## 10. Model Monitoring

### Import Necessary Libraries

In [5]:
# !pip install 'alibi-detect[tensorflow]'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import chi2_contingency
from alibi_detect.cd import KSDrift

In [None]:
# Load Training and Production Data from GitHub
train_url = "https://raw.githubusercontent.com/PatelNisarg28/MLOps_Adult_Income/main/datasets/adult_income_train.parquet"
prod_url = "https://raw.githubusercontent.com/PatelNisarg28/MLOps_Adult_Income/main/datasets/adult_income_prod.parquet"

# Read the datasets
train_df = pd.read_parquet(train_url)
prod_df = pd.read_parquet(prod_url)

# Ensure both datasets have the same structure
assert train_df.columns.equals(prod_df.columns), "Mismatch in dataset columns!"

# Select Numerical Features and Standardize Data
numerical_features = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_df.select_dtypes(include=['object', 'category']).columns

# Check if numerical features exist
if numerical_features.empty:
    raise ValueError("No numerical features found in the dataset!")

# Standardize the numerical data
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[numerical_features])
prod_scaled = scaler.transform(prod_df[numerical_features])

# Initialize and Run the KSDrift Detection Test for Numerical Features
cd = KSDrift(train_scaled, p_val=0.05)  # Set significance level to 5%

# Perform drift detection
preds = cd.predict(prod_scaled)

# Print drift detection result for numerical features
drift_detected = preds['data']['is_drift']
print(f"Numerical Feature Drift detected: {'Yes' if drift_detected else 'No'}")

# Create a DataFrame to display Feature-Level Drift Results
numerical_drift_results = pd.DataFrame({
    "Feature": numerical_features,
    "p-value": preds["data"]["p_val"],
    "Drift Detected": np.array(preds["data"]["p_val"]) < 0.05
})

print("\nFeature-wise Numerical Drift Detection Results:")
print(numerical_drift_results)

# Chi-Square Test for Categorical Feature Drift
categorical_drift_results = []
for feature in categorical_features:
    train_counts = train_df[feature].value_counts(normalize=True)
    prod_counts = prod_df[feature].value_counts(normalize=True)
    
    # Align both distributions
    all_categories = set(train_counts.index).union(set(prod_counts.index))
    train_counts = train_counts.reindex(all_categories, fill_value=0)
    prod_counts = prod_counts.reindex(all_categories, fill_value=0)
    
    # Create contingency table
    contingency_table = np.array([train_counts, prod_counts]) * len(train_df)
    
    # Perform Chi-Square Test
    chi2_stat, p_val, _, _ = chi2_contingency(contingency_table)
    categorical_drift_results.append((feature, p_val, p_val < 0.05))

# Convert results to DataFrame
categorical_drift_results_df = pd.DataFrame(categorical_drift_results, columns=["Feature", "p-value", "Drift Detected"])

print("\nFeature-wise Categorical Drift Detection Results:")
print(categorical_drift_results_df)

# Final Interpretation
if drift_detected or (categorical_drift_results_df["Drift Detected"].any()):
    print("\nWarning: Data drift detected! Consider retraining the model.")
else:
    print("\nNo significant drift detected. Model is stable.")

Numerical Feature Drift detected: No

Feature-wise Numerical Drift Detection Results:
          Feature   p-value  Drift Detected
0             age  0.543973           False
1          fnlwgt  0.970623           False
2   education_num  0.786682           False
3    capital_gain  1.000000           False
4    capital_loss  1.000000           False
5  hours_per_week  0.972583           False

Feature-wise Categorical Drift Detection Results:
          Feature   p-value  Drift Detected
0       workclass  0.014682            True
1       education  0.171550           False
2  marital_status  0.756408           False
3      occupation  0.000132            True
4    relationship  0.054910           False
5            race  0.533803           False
6             sex  0.526395           False
7  native_country  0.007372            True
8          income  1.000000           False

