In [1]:
# Data Drift Impact on Model
# Question: Use a simple linear regression model to demonstrate how data drift affects model predictions.

# 1. Train a model on the original data:
# 2. Evaluate on the drifted data:
# 3. Compare errors:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import ks_2samp

# 1. Create Original Dataset
# Seed NumPy's global RNG so every draw below is reproducible. The four
# np.random.normal calls consume that single stream in order, so reordering
# them changes all generated values.
np.random.seed(42)
size = 100
# Single feature drawn from N(50, 10), reshaped to a (size, 1) column because
# scikit-learn estimators expect a 2-D feature matrix.
X_orig = np.random.normal(loc=50, scale=10, size=size).reshape(-1, 1)
# Target is a linear function of the feature (slope 3) plus N(0, 5) noise.
y_orig = 3 * X_orig.flatten() + np.random.normal(loc=0, scale=5, size=size)

# 2. Create Drifted Dataset (shift mean and scale)
# Only the feature distribution changes (N(60, 15) instead of N(50, 10)).
X_drifted = np.random.normal(loc=60, scale=15, size=size).reshape(-1, 1)
# The feature-to-target relationship is the same as for the original data
# (slope 3, N(0, 5) noise).
y_drifted = 3 * X_drifted.flatten() + np.random.normal(loc=0, scale=5, size=size)

# 3. Train Linear Regression on original data
model = LinearRegression()
model.fit(X_orig, y_orig)

# Predict on original and drifted data
# Note that the "original" predictions are made on the same rows the model was
# fitted on, so mse_orig below is a training (in-sample) error.
y_pred_orig = model.predict(X_orig)
y_pred_drifted = model.predict(X_drifted)

# Calculate errors
# NOTE(review): because the drifted targets follow the same linear relationship
# as the training targets, the two MSE values are expected to be close (both
# are dominated by the noise term). To show a visible degradation the drifted
# data would presumably need a changed relationship as well -- confirm intent.
mse_orig = mean_squared_error(y_orig, y_pred_orig)
mse_drifted = mean_squared_error(y_drifted, y_pred_drifted)

print(f"Mean Squared Error on Original Data: {mse_orig:.2f}")
print(f"Mean Squared Error on Drifted Data: {mse_drifted:.2f}")

# --- Monitoring Data Distribution Changes ---

def feature_stats(data):
    """Summarise a one-dimensional sample by its centre and spread.

    Parameters
    ----------
    data : array-like
        Numeric values to summarise.

    Returns
    -------
    dict
        ``{'mean': ..., 'std': ...}`` as computed by ``np.mean`` and
        ``np.std`` (NumPy's default ``ddof=0``, i.e. the population
        standard deviation).
    """
    centre = np.mean(data)
    spread = np.std(data)
    return dict(mean=centre, std=spread)

# Summarise the feature column of each dataset with the same helper.
orig_stats, drifted_stats = (
    feature_stats(sample.flatten()) for sample in (X_orig, X_drifted)
)

print("\nOriginal Feature Stats:", orig_stats)
print("Drifted Feature Stats:", drifted_stats)

# Simple example rule: flag drift when the two means are more than two
# original standard deviations apart. The rule looks at the mean only, so a
# change in spread by itself cannot trigger it.
threshold = 2 * orig_stats['std']
mean_diff = abs(orig_stats['mean'] - drifted_stats['mean'])

print(f"\nMean difference: {mean_diff:.2f}")
print(f"Threshold for drift detection: {threshold:.2f}")
print(
    "Significant data drift detected based on mean difference!"
    if mean_diff > threshold
    else "No significant drift detected based on mean difference."
)

# --- Basic Data Validation ---

# Small example table seeded with two deliberate problems for the validator:
# a repeated 'id' (primary key violation) and a None in 'value' (missing value).
data = dict(
    id=[1, 2, 3, 4, 5, 5],
    value=[10, 20, 30, None, 50, 60],
)
df = pd.DataFrame(data)

def validate_data(df, id_column='id'):
    """Run basic structural checks on a DataFrame and collect any failures.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to validate.
    id_column : str, default 'id'
        Name of the column that must act as a primary key (unique per row).

    Returns
    -------
    list of str
        One human-readable message per failed check, in check order.
        An empty list means every check passed.
    """
    errors = []
    # Check for missing values anywhere in the frame
    if df.isnull().any().any():
        errors.append("Missing values detected.")
    # A missing key column is itself a structural failure; report it instead
    # of letting the column lookup raise KeyError and abort the other checks.
    if id_column not in df.columns:
        errors.append(f"Primary key column '{id_column}' not found.")
    # Check for duplicate primary keys in the key column
    elif df[id_column].duplicated().any():
        errors.append(f"Duplicate primary keys detected in '{id_column}' column.")
    return errors

# Run the checks on the sample frame and print one bullet per failure.
validation_errors = validate_data(df)
if not validation_errors:
    print("\nData validation passed with no errors.")
else:
    print("\nData Validation Errors:")
    print("\n".join(f"- {err}" for err in validation_errors))

# --- Kolmogorov-Smirnov Test for Data Drift Detection ---

# Significance level for the two-sample test.
alpha = 0.05

# Two-sample Kolmogorov-Smirnov test on the flattened feature columns; the
# null hypothesis is that both samples come from the same distribution.
ks_stat, ks_pvalue = ks_2samp(X_orig.flatten(), X_drifted.flatten())

print(f"\nKS Test Statistic: {ks_stat:.4f}")
# With four decimal places, any p-value below 0.00005 is displayed as 0.0000.
print(f"KS Test p-value: {ks_pvalue:.4f}")

print(
    "Reject null hypothesis: Significant data drift detected by KS test!"
    if ks_pvalue < alpha
    else "Fail to reject null hypothesis: No significant data drift detected by KS test."
)


Mean Squared Error on Original Data: 22.09
Mean Squared Error on Drifted Data: 22.33

Original Feature Stats: {'mean': 48.96153482605907, 'std': 9.036161766446297}
Drifted Feature Stats: {'mean': 60.97344379650673, 'std': 16.182717693681113}

Mean difference: 12.01
Threshold for drift detection: 18.07
No significant drift detected based on mean difference.

Data Validation Errors:
- Missing values detected.
- Duplicate primary keys detected in 'id' column.

KS Test Statistic: 0.4600
KS Test p-value: 0.0000
Reject null hypothesis: Significant data drift detected by KS test!


In [4]:
# Monitoring Data Distribution Changes
# Question: Use Python to monitor distribution changes in features to detect potential data drift.

# 1. Calculate feature statistics (mean and standard deviation) for both original and drifted data:
# 2. Compare statistics:
# 3. Set thresholds to detect significant drift:




In [5]:
# Automating Data Quality Checks with Python
# Question: Automate a basic data validation process using Python to ensure the dataset's
# structural integrity.

# 1. Define validation checks:
# 2. Apply validation:




In [6]:
# Introducing Great Expectations for Data Validation
# Question: Use Great Expectations to set up data validation checks for a dataset.

# 1. Install Great Expectations:
# 2. Create a new expectations suite:
# 3. Load data and generate expectations:




In [7]:
# Automating Constraint Checks with Python
# Question: Automate primary key and foreign key constraint checks using Python to ensure dataset compliance.


# 1. Assume the pandas DataFrames employees_df and departments_df exist and are linked by a primary key / foreign key relationship:




In [8]:
# Advanced Data Drift Detection using Statistical Tests
# Question: Implement Kolmogorov-Smirnov test using Python to detect data drift at a more sophisticated level.

# 1. Use SciPy to perform KS test:


