# Week 7 — Supervised Machine Learning

**Goals**
- Train/test split, cross-validation
- Regression and classification models
- Evaluate with R²/RMSE (regression) and ROC AUC/F1 (classification)

## 0) Setup

In [None]:
# !pip -q install pandas numpy scikit-learn matplotlib seaborn
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, classification_report, roc_auc_score, RocCurveDisplay
# One shared, seeded generator: every synthetic dataset below draws from it,
# so a fresh Restart & Run All reproduces the same numbers.
rng = np.random.default_rng(seed=0)

## 1) Regression example (synthetic housing)

In [None]:
# Synthetic housing data: three features drawn from the shared generator.
# The draw order (rooms, size_sqft, age, then noise) is kept fixed so the
# generated values stay reproducible.
n = 300
X = pd.DataFrame(dict(
    rooms=rng.integers(2, 7, size=n),
    size_sqft=np.clip(rng.normal(1200, 350, size=n), 400, 3000),
    age=rng.integers(0, 50, size=n),
))

# Price is a linear function of the features plus Gaussian noise.
y = (
    50000
    + 15000 * X['rooms']
    + 120 * X['size_sqft']
    - 500 * X['age']
    + rng.normal(0, 20000, size=n)
)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares fit on the training rows, scored on the hold-out.
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

r2 = r2_score(y_test, pred)
# Take the square root of the MSE by hand so this also works on
# scikit-learn versions without the 'squared' kwarg.
rmse = mean_squared_error(y_test, pred) ** 0.5
(r2, rmse)

## 2) Classification example (synthetic fraud)

In [None]:
# Synthetic card transactions: amount (clipped normal), an online flag,
# and the hour of day.
n = 2000
Xc = pd.DataFrame({
    'amount': rng.normal(70, 30, size=n).clip(1, 400),
    'online': rng.integers(0,2,size=n),
    'hour': rng.integers(0,24,size=n)
})
# Label rule: a transaction is fraud only when it is high-amount (>120),
# at night (hour 0-5 inclusive) AND online. All three conditions together
# are rare, so positives are well under 1% of rows by construction.
# NOTE: this reuses the name `y` and overwrites the regression target above.
y = ((Xc['amount']>120) & (Xc['hour'].between(0,5)) & (Xc['online']==1)).astype(int)

# Stratify on the label: with so few positives, a plain random split can
# leave the test set with almost no fraud rows (or none), which makes the
# AUC unstable or undefined. Stratifying keeps the fraud rate the same in
# both splits.
Xtr, Xte, ytr, yte = train_test_split(
    Xc, y, test_size=0.3, random_state=42, stratify=y
)

# Scale features for logistic regression. with_mean=False divides by the
# standard deviation without centering. The scaler is fitted on the
# training rows only, so no test-set statistics leak into training.
scaler = StandardScaler(with_mean=False)
Xtr_s, Xte_s = scaler.fit_transform(Xtr), scaler.transform(Xte)

# Logistic regression uses the scaled features; the random forest is
# trained on the raw features.
logreg = LogisticRegression(max_iter=200).fit(Xtr_s, ytr)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(Xtr, ytr)

# Probability of the positive (fraud) class for each test row.
proba_lr = logreg.predict_proba(Xte_s)[:,1]
proba_rf = rf.predict_proba(Xte)[:,1]

auc_lr = roc_auc_score(yte, proba_lr)
auc_rf = roc_auc_score(yte, proba_rf)
auc_lr, auc_rf

### Classification report & ROC (RandomForest)

In [None]:
# Turn the forest's fraud probabilities into hard 0/1 labels at the default
# 0.5 cut-off, then print per-class precision / recall / F1.
pred_rf = np.greater(proba_rf, 0.5).astype(int)
print(classification_report(yte, pred_rf))

# The ROC curve is drawn from the raw probabilities, not the thresholded labels.
RocCurveDisplay.from_predictions(yte, proba_rf)
plt.show()

## 3) Mini-project tasks

- Try StratifiedKFold cross-validation and compare models.

- Handle class imbalance via threshold tuning; report Precision/Recall/F1.

- Engineer a new feature (e.g., `is_night = Xc['hour'].between(0, 5)`) and evaluate impact.