In [1]:
#!/usr/bin/env python3
"""
ML Bytes: Data Leakage (Simple Example)

Story:
A student accidentally sees the exam answers while studying.

Goal:
Show how data leakage can produce unrealistically high accuracy.

Run:
  pip install scikit-learn numpy
  python ml_bytes_data_leakage.py
"""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

print("\nML Bytes — Data Leakage Demo\n")

# =====================================================================
# Step 1 - Build a synthetic dataset
# =====================================================================
# Two candidate inputs are prepared for the same label vector:
#   * study_hours    - the honest predictor
#   * leaked_feature - the label itself plus a tiny amount of noise
rng = np.random.default_rng(7)  # seeded so every run draws the same sample

n = 200
study_hours = rng.normal(loc=5, scale=2, size=n)

# Ground truth: a student passes (1) exactly when study_hours exceeds 5.
# NOTE: the label is a deterministic threshold on study_hours, so the
# honest model is not facing a hard problem here either.
passed = study_hours > 5
y = passed.astype(int)

# The leak: a column computed from the target. The noise scale (0.01) is
# far smaller than the 0/1 gap between classes, so the column still
# separates the two classes almost perfectly.
label_noise = rng.normal(loc=0, scale=0.01, size=n)
leaked_feature = y + label_noise

# Single-column matrix for the honest model; two columns for the leaky one.
X_clean = study_hours[:, np.newaxis]
X_leaky = np.column_stack((study_hours, leaked_feature))

# =====================================================================
# Step 2 - Hold out a test set
# =====================================================================
# Both feature matrices are split with identical options (30% held out,
# seed 7) so the two models are compared like-for-like.
split_options = {"test_size": 0.3, "random_state": 7}

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clean, y, **split_options
)
Xl_train, Xl_test, yl_train, yl_test = train_test_split(
    X_leaky, y, **split_options
)

# =====================================================================
# Step 3 - Fit one logistic regression per feature set
# =====================================================================
# fit() hands back the estimator, so construction and training chain.
model_clean = LogisticRegression().fit(Xc_train, yc_train)
model_leaky = LogisticRegression().fit(Xl_train, yl_train)

# =====================================================================
# Step 4 - Score on the held-out rows
# =====================================================================
clean_predictions = model_clean.predict(Xc_test)
leaky_predictions = model_leaky.predict(Xl_test)

acc_clean = accuracy_score(yc_test, clean_predictions)
acc_leaky = accuracy_score(yl_test, leaky_predictions)

print("Results:")
print(f"Accuracy WITHOUT leakage : {acc_clean:.2f}")
print(f"Accuracy WITH leakage    : {acc_leaky:.2f}")

# =====================================================================
# Step 5 - Spell out the takeaway
# =====================================================================
print("\nInterpretation:")
for takeaway in (
    "- The clean model learns from study_hours only.",
    "- The leaky model indirectly sees the answer.",
    "- Very high accuracy here is fake and misleading.\n",
):
    print(takeaway)



ML Bytes — Data Leakage Demo

Results:
Accuracy WITHOUT leakage : 0.98
Accuracy WITH leakage    : 1.00

Interpretation:
- The clean model learns from study_hours only.
- The leaky model indirectly sees the answer.
- Very high accuracy here is fake and misleading.

