In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp

# Pin NumPy's global RNG so a fresh Restart & Run All reproduces the same draws.
np.random.seed(42)


def _as_labelled_frame(X, y, source):
    """Pack (n, 1) feature/target arrays into a long-form frame tagged with `source`."""
    return pd.DataFrame({
        'feature': X.ravel(),
        'target': y.ravel(),
        'data_source': source,
    })


n_samples = 100

# --- Historical (training) data ------------------------------------------
# Feature is uniform on [0, 10); target = 2 + 3 * feature + Gaussian noise (sd 2).
X_historical = np.random.rand(n_samples, 1) * 10
y_historical = 2 + 3 * X_historical + np.random.randn(n_samples, 1) * 2
historical_df = _as_labelled_frame(X_historical, y_historical, 'historical')

# --- Recent data, feature drift --------------------------------------------
# Same linear relationship, but the feature range is shifted up by 2.
X_recent_drift_feature = np.random.rand(n_samples, 1) * 10 + 2
y_recent_drift_feature = 2 + 3 * X_recent_drift_feature + np.random.randn(n_samples, 1) * 2
recent_df_drift_feature = _as_labelled_frame(
    X_recent_drift_feature, y_recent_drift_feature, 'recent_feature_drift'
)

# --- Recent data, relationship drift ---------------------------------------
# Same feature range as historical, but a new intercept (5), slope (1.5)
# and a larger noise scale (3).
X_recent_drift_relation = np.random.rand(n_samples, 1) * 10
y_recent_drift_relation = 5 + 1.5 * X_recent_drift_relation + np.random.randn(n_samples, 1) * 3
recent_df_drift_relation = _as_labelled_frame(
    X_recent_drift_relation, y_recent_drift_relation, 'recent_relation_drift'
)

# Stack the three sources into one frame; `data_source` tells them apart.
combined_df = pd.concat(
    [historical_df, recent_df_drift_feature, recent_df_drift_relation],
    ignore_index=True,
)

print("Combined DataFrame with Historical and Recent Data:")
print(combined_df.head())

Combined DataFrame with Historical and Recent Data:
    feature     target data_source
0  3.745401  13.410298  historical
1  9.507143  29.923414  historical
2  7.319939  24.143340  historical
3  5.986585  15.984617  historical
4  1.560186   6.241215  historical


In [2]:
import pandas as pd

def check_column_presence(df, columns):
    """Verify that every name in `columns` is a column of `df`.

    Prints a one-line pass/fail message. On failure the message lists the
    missing names in the order they were requested.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names that must be present.

    Returns:
        True if no requested column is missing, otherwise False.
    """
    absent = []
    for name in columns:
        if name not in df.columns:
            absent.append(name)

    if not absent:
        print("Validation Passed: All required columns are present.")
        return True

    print(f"Validation Error: Missing columns - {absent}")
    return False

def check_column_dtype(df, column_dtype_map):
    """Compare each listed column's dtype against the expected dtype.

    Dtypes are compared with `pd.api.types.is_dtype_equal`. A column named
    in the map but absent from `df` is reported as "Column not found" and
    counts as a failure.

    Args:
        df: DataFrame to inspect.
        column_dtype_map: Mapping of column name -> expected dtype
            (e.g. 'float64', 'Int64').

    Returns:
        True if every listed column exists with the expected dtype,
        otherwise False. A summary (and one line per problem) is printed.
    """
    def _problem_for(col, expected):
        # None means "column is fine"; otherwise a human-readable reason.
        if col not in df.columns:
            return "Column not found"
        actual = df[col].dtype
        if pd.api.types.is_dtype_equal(actual, expected):
            return None
        return f"Expected {expected}, got {actual}"

    problems = {}
    for col, expected in column_dtype_map.items():
        reason = _problem_for(col, expected)
        if reason is not None:
            problems[col] = reason

    if not problems:
        print("Validation Passed: All columns have the expected data types.")
        return True

    print("Validation Error: Incorrect data types -")
    for col, reason in problems.items():
        print(f"  - Column '{col}': {reason}")
    return False

def check_column_values_non_null(df, columns):
    """Verify that none of the listed columns contains a null value.

    A column named in `columns` but absent from `df` is reported as
    "Column not found" and counts as a failure.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names that must be free of nulls.

    Returns:
        True if every listed column exists and has no nulls, otherwise
        False. A summary (and one line per problem) is printed.
    """
    issues = {}
    for col in columns:
        if col not in df.columns:
            issues[col] = "Column not found"
        elif df[col].isnull().any():
            issues[col] = "Null values found"

    if not issues:
        print("Validation Passed: No null values found in the specified columns.")
        return True

    print("Validation Error: Null values found -")
    for col, reason in issues.items():
        print(f"  - Column '{col}': {reason}")
    return False

# Deliberately flawed sample: the None in 'id' and the string 'error' in
# 'value' are there so the dtype and null checks below have something to catch
# (pandas infers float64 for 'id' and object for 'value').
data = {
    'id': [1, 2, 3, None],
    'name': ['A', 'B', 'C', 'D'],
    'value': [10.5, 20.0, 30.0, 'error'],
}
df = pd.DataFrame(data)

# Validation rules: which columns must exist, their dtypes, and which may not be null
required_columns = ['id', 'name', 'value']
expected_dtypes = {
    'id': 'Int64',
    'name': 'object',
    'value': 'float64',
}
not_null_columns = ['id', 'name']

# Run each check in turn; every check prints its own report. The final call
# is the cell's last expression, so its boolean result is also displayed.
check_column_presence(df, required_columns)
check_column_dtype(df, expected_dtypes)
check_column_values_non_null(df, not_null_columns)

Validation Passed: All required columns are present.
Validation Error: Incorrect data types -
  - Column 'id': Expected Int64, got float64
  - Column 'value': Expected float64, got object
Validation Error: Null values found -
  - Column 'id': Null values found


False