# DDS8555 A6 — CV Table & Calibration Helper

**Purpose.** Generate the exact numbers for the one‑pager results table (CV Accuracy ± SD, Macro‑F1 ± SD, PR‑AUC macro, OOB for Bagging, and Brier before→after for calibration).

You can use **Option A** to recompute from your prepared `X, y`, or **Option B** to load precomputed CV means/SDs from `model_cv_summary.csv`.

In [None]:
# Imports
import numpy as np, pandas as pd, os, json
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import label_binarize

# Shared splitter for every CV helper below: stratified, shuffled 5-fold
# with a fixed seed so repeated runs use the same folds.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)

def cv_acc_f1(model, X, y):
    """Return cross-validated accuracy and macro-F1 as mean/SD pairs.

    Both metrics are scored in one ``cross_validate`` pass over the
    module-level ``cv`` splitter, so each fold is fitted once rather than
    once per metric.

    Args:
        model: scikit-learn classifier (it is cloned per fold, not mutated).
        X: Feature matrix accepted by ``model.fit``.
        y: Class labels aligned with ``X``.

    Returns:
        Tuple ``(acc_mean, acc_sd, f1_mean, f1_sd)``. The SDs are NumPy's
        default population standard deviation (``ddof=0``) across folds.
    """
    # Local import: cross_validate is not part of the file-level import list.
    from sklearn.model_selection import cross_validate

    scores = cross_validate(model, X, y, cv=cv, scoring=["accuracy", "f1_macro"])
    acc = scores["test_accuracy"]
    f1 = scores["test_f1_macro"]
    return acc.mean(), acc.std(), f1.mean(), f1.std()

def cv_prauc_macro(model, X, y):
    """Return the macro-averaged PR-AUC computed from out-of-fold probabilities.

    Probabilities come from ``cross_val_predict`` over the module-level
    ``cv`` splitter; each class is scored one-vs-rest with
    ``average_precision_score`` and the per-class scores are averaged.

    Args:
        model: scikit-learn classifier exposing ``predict_proba``.
        X: Feature matrix accepted by ``model.fit``.
        y: Class labels aligned with ``X``.

    Returns:
        Macro-averaged average precision as a float.
    """
    prob = cross_val_predict(model, X, y, cv=cv, method="predict_proba")
    classes = np.unique(y)
    Y = label_binarize(y, classes=classes)
    if Y.shape[1] == 1 and prob.shape[1] == 2:
        # For two classes label_binarize emits a single column (the second
        # class) while predict_proba emits one column per class; expand the
        # indicator so both matrices have matching columns.
        Y = np.hstack([1 - Y, Y])
    return average_precision_score(Y, prob, average="macro")


## Option A — Recompute from your prepared `X, y`
Make sure `X` and `y` are defined in memory (same preprocessing as you used for Kaggle). Then run the next cell.

In [None]:
# Define models (match your Kaggle settings)
dt = DecisionTreeClassifier(criterion="gini", min_samples_leaf=3, random_state=123)
bg = BaggingClassifier(estimator=DecisionTreeClassifier(min_samples_leaf=3, random_state=123),
                       n_estimators=60, bootstrap=True, oob_score=True, random_state=123)
rf = RandomForestClassifier(n_estimators=120, max_features="sqrt", min_samples_leaf=2,
                            random_state=123, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=80, learning_rate=0.05, max_depth=2, random_state=123)

# Keys are the display names used as row labels in the results table.
models = {'Decision Tree (leaf=3)': dt, 'Bagging (60 trees)': bg,
          'Random Forest (120, sqrt)': rf, 'Gradient Boosting (depth=2, lr=0.05)': gb}

# Option A always starts from an empty results dict.
results = {}
try:
    # Probe only for the presence of X and y. Keeping the try body this small
    # means a NameError raised by the computation below is not misreported
    # as "X and y not found".
    _ = X, y
except NameError:
    print('Option A skipped — X and y not found. Use Option B or define X,y and re-run.')
else:
    for name, mdl in models.items():
        acc_m, acc_sd, f1_m, f1_sd = cv_acc_f1(mdl, X, y)
        pr_auc = cv_prauc_macro(mdl, X, y)
        results[name] = {'cv_acc_mean': float(acc_m), 'cv_acc_sd': float(acc_sd),
                         'cv_f1_mean': float(f1_m), 'cv_f1_sd': float(f1_sd),
                         'pr_auc': float(pr_auc)}

    # OOB accuracy for Bagging (requires a fit on the full data)
    bg.fit(X, y)
    results['Bagging (60 trees)']['oob'] = float(bg.oob_score_)

    # Brier before/after calibration for RF.
    # Both scores use out-of-fold probabilities: scoring a forest on the rows
    # it was fitted on gives an optimistic "before" value and makes the
    # before -> after comparison meaningless.
    onehot = label_binarize(y, classes=np.unique(y))
    if onehot.shape[1] == 1:
        # Two-class case: label_binarize returns one column, predict_proba two.
        onehot = np.hstack([1 - onehot, onehot])
    cal = CalibratedClassifierCV(rf, method='isotonic', cv=5)
    proba_raw = cross_val_predict(rf, X, y, cv=cv, method="predict_proba")
    proba_cal = cross_val_predict(cal, X, y, cv=cv, method="predict_proba")
    # Flattening the one-hot matrix and the probability matrix scores every
    # (sample, class) cell as a binary outcome, i.e. the mean squared error
    # over all cells.
    brier_before = brier_score_loss(onehot.ravel(), proba_raw.ravel())
    brier_after = brier_score_loss(onehot.ravel(), proba_cal.ravel())
    results['__brier__'] = {'before': float(brier_before), 'after': float(brier_after)}

    # As before, leave rf and cal fitted on the full data.
    rf.fit(X, y)
    cal.fit(X, y)
    print('Option A succeeded.')

## Option B — Load from `/mnt/data/model_cv_summary.csv`
If you exported CV means/SDs earlier, place them in this CSV with columns:
`model, accuracy_mean, accuracy_sd, macro_f1_mean, macro_f1_sd`.
This cell also merges the Kaggle private/public scores. PR‑AUC and OOB stay blank unless Option A ran successfully before this cell (Option A resets `results`, so re-run this cell after re-running Option A).

In [None]:
import pandas as pd, os, json, math

# Reuse Option A's metrics when that cell has run in this session; otherwise
# start from an empty dict so this cell also works on its own.
try:
    results
except NameError:
    results = {}

csv_path = '/mnt/data/model_cv_summary.csv'
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)

    # Fail early with a clear message if the CSV lacks an expected column.
    required = ['model', 'accuracy_mean', 'accuracy_sd', 'macro_f1_mean', 'macro_f1_sd']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required columns: {missing}")

    # map names to the display names used in the table
    alias = {'Model1_DecisionTree':'Decision Tree (leaf=3)',
             'Model2_Bagging':'Bagging (60 trees)',
             'Model3_RandomForest':'Random Forest (120, sqrt)',
             'Model4_Boosting':'Gradient Boosting (depth=2, lr=0.05)'}
    # Unmapped model names pass through unchanged.
    df['display'] = df['model'].map(alias).fillna(df['model'])

    rows = zip(df['display'], df['accuracy_mean'], df['accuracy_sd'],
               df['macro_f1_mean'], df['macro_f1_sd'])
    for name, acc_mean, acc_sd, f1_mean, f1_sd in rows:
        # update() overwrites only the four CV fields, so PR-AUC / OOB values
        # already stored under the same name are kept.
        results.setdefault(name, {}).update({'cv_acc_mean': float(acc_mean),
                                             'cv_acc_sd': float(acc_sd),
                                             'cv_f1_mean': float(f1_mean),
                                             'cv_f1_sd': float(f1_sd)})
    print('Loaded CV summary from CSV.')
else:
    print('CSV not found at', csv_path)

# Kaggle scores (fill from your screenshot)
kaggle = {
    'Decision Tree (leaf=3)':  {'private':0.85585, 'public':0.86669},
    'Bagging (60 trees)':      {'private':0.89333, 'public':0.89306},
    'Random Forest (120, sqrt)': {'private':0.89721, 'public':0.89270},
    'Gradient Boosting (depth=2, lr=0.05)': {'private':0.86750, 'public':0.86452},
}
for k, v in kaggle.items():
    results.setdefault(k, {}).update({'k_private': v['private'], 'k_public': v['public']})

# Persist intermediate results
out_json = '/mnt/data/results_values.json'
with open(out_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print('Saved intermediate metrics →', out_json)

## Build the one‑pager table text
This rounds every value to **3 decimals**, writes a Markdown table you can paste into Word, and writes the Brier before → after line to a separate text file.

In [None]:
import json, numpy as np, os

# Row order of the results table; entries are the display names used as
# keys in the saved metrics.
order = [
    'Decision Tree (leaf=3)',
    'Bagging (60 trees)',
    'Random Forest (120, sqrt)',
    'Gradient Boosting (depth=2, lr=0.05)',
]

def fmt(x):
    """Format a metric for the table: 3 decimals, or an em dash when missing.

    Args:
        x: A number, or ``None`` / NaN for a value that is not available.

    Returns:
        ``'—'`` when ``x`` is ``None`` or NaN (Python float or NumPy scalar),
        otherwise ``x`` formatted with three decimal places.
    """
    if x is None:
        return '—'
    try:
        # np.isnan also recognises NumPy scalar NaNs (e.g. float32), which an
        # isinstance(x, float) check would miss.
        if np.isnan(x):
            return '—'
    except TypeError:
        # Not something NaN-testable; let the format spec below decide.
        pass
    return f"{x:.3f}"

# Load the metrics saved by the previous cell.
with open('/mnt/data/results_values.json', encoding='utf-8') as f:
    R = json.load(f)

lines = [
    '| Model | CV Accuracy (±SD) | Macro-F1 (±SD) | PR-AUC (macro) | OOB Accuracy | Kaggle Private | Kaggle Public |',
    '| --- | --- | --- | --- | --- | --- | --- |',
]

for name in order:
    # Models absent from the JSON yield an empty dict, so every cell falls
    # back to the placeholder that fmt() returns for None.
    r = R.get(name, {})
    acc = fmt(r.get('cv_acc_mean')) + ' ± ' + fmt(r.get('cv_acc_sd'))
    f1 = fmt(r.get('cv_f1_mean')) + ' ± ' + fmt(r.get('cv_f1_sd'))
    pr = fmt(r.get('pr_auc'))
    oob = fmt(r.get('oob'))
    kp = fmt(r.get('k_private'))
    ku = fmt(r.get('k_public'))
    lines.append(f"| {name} | {acc} | {f1} | {pr} | {oob} | {kp} | {ku} |")

table_md = '\n'.join(lines)
md_path = '/mnt/data/onepager_table_filled.md'
# The output contains non-ASCII characters (±, —, →); write UTF-8 explicitly
# instead of relying on the platform default encoding.
with open(md_path, 'w', encoding='utf-8') as f:
    f.write(table_md)

# Brier before→after line (placeholders are used when the values are missing)
brier = R.get('__brier__', {})
brier_text = f"Brier (before → after): {fmt(brier.get('before'))} → {fmt(brier.get('after'))}"
brier_path = '/mnt/data/brier_line.txt'
with open(brier_path, 'w', encoding='utf-8') as f:
    f.write(brier_text)

print('Wrote:')
print(' -', md_path)
print(' -', brier_path)
print('\nPreview:')
print(table_md)
print('\n' + brier_text)