# In [2]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.
from scipy.stats import ks_2samp
import pandas as pd

def detect_data_drift(data1, data2, alpha=0.05):
    """
    Detect data drift between two numeric samples using the two-sample
    Kolmogorov-Smirnov (KS) test.

    Parameters:
    - data1: pd.Series or any 1-D array-like – baseline data
    - data2: pd.Series or any 1-D array-like – current data
    - alpha: significance level (default: 0.05)

    Returns:
    - drift_detected: bool – True if the unrounded p-value is below alpha
    - p_value: float – p-value of the test, rounded to 4 decimals
    - statistic: float – KS statistic, rounded to 4 decimals

    Raises:
    - ValueError: if either sample has no values left after dropping NaNs
    """
    # Wrap in a Series so plain lists and NumPy arrays are accepted too,
    # then drop NaNs so missing values do not take part in the comparison.
    baseline = pd.Series(data1).dropna()
    current = pd.Series(data2).dropna()

    # Fail early with a message that names the offending argument instead of
    # relying on whatever the underlying test does with an empty sample.
    if baseline.empty:
        raise ValueError("data1 contains no non-NaN values to compare")
    if current.empty:
        raise ValueError("data2 contains no non-NaN values to compare")

    statistic, p_value = ks_2samp(baseline, current)

    # Decide on the unrounded p-value; rounding below is for reporting only.
    # bool()/float() convert NumPy scalars into the plain Python types that
    # the docstring promises (so `result is True` works for callers).
    drift_detected = bool(p_value < alpha)

    return drift_detected, float(round(p_value, 4)), float(round(statistic, 4))
