In [5]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp

# Load datasets for two time periods
df_old = pd.read_csv('data_old.csv')
df_new = pd.read_csv('data_new.csv')

# Column to check for drift
col = 'Age'

# Convert to numeric if needed
df_old[col] = pd.to_numeric(df_old[col], errors='coerce')
df_new[col] = pd.to_numeric(df_new[col], errors='coerce')

# Drop NaNs
old_data = df_old[col].dropna()
new_data = df_new[col].dropna()

# Step 1: Print basic stats
print("Old data stats:")
print(old_data.describe())
print("\nNew data stats:")
print(new_data.describe())

# Step 2: Visualize distributions
plt.figure(figsize=(12,5))

plt.subplot(1, 2, 1)
sns.histplot(old_data, color='blue', kde=True, stat='density', label='Old Data')
sns.histplot(new_data, color='red', kde=True, stat='density', label='New Data')
plt.legend()
plt.title(f'Distribution of {col} (Overlaid)')

plt.subplot(1, 2, 2)
sns.boxplot(data=[old_data, new_data], palette=['blue', 'red'])
plt.xticks([0,1], ['Old Data', 'New Data'])
plt.title(f'Boxplot Comparison of {col}')

plt.show()

# Step 3: Statistical test (Kolmogorov-Smirnov test)
ks_stat, p_value = ks_2samp(old_data, new_data)
print(f"KS test statistic: {ks_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Significant difference detected — potential data drift!")
else:
    print("No significant difference detected — data distribution is similar.")


FileNotFoundError: [Errno 2] No such file or directory: 'data_old.csv'

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

# Simulate old data with mean 40, std 10
df_old = pd.DataFrame({
    'Age': np.random.normal(40, 10, 1000)
})

# Simulate new data with mean 45, std 10 (shifted distribution)
df_new = pd.DataFrame({
    'Age': np.random.normal(45, 10, 1000)
})

# Detect drift using Kolmogorov-Smirnov test
stat, p_value = ks_2samp(df_old['Age'], df_new['Age'])
print(f"KS statistic: {stat}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("Data drift detected in 'Age' distribution.")
else:
    print("No significant data drift detected.")


In [None]:
import pandas as pd
import numpy as np

# Simulate old data
df_old = pd.DataFrame({
    'Age': np.random.normal(40, 10, 1000)  # mean 40, std 10, 1000 samples
})

# Simulate new data (shifted mean to 45)
df_new = pd.DataFrame({
    'Age': np.random.normal(45, 10, 1000)
})
