# DDS8555 — Assignment 6: Obesity Risk — Four Tree Models

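This notebook trains four tree-based classifiers (a single decision tree, bagged trees, a random forest, and gradient boosting), reports 5-fold cross-validated accuracy and macro-F1 for each, and writes one Kaggle-ready submission CSV per model to `../submissions/`.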

In [None]:

import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

# --- Configuration ----------------------------------------------------------
# One seed shared by every estimator and by the CV splitter further down.
RANDOM_STATE = 123
DATA_DIR = Path('../data/raw')
SUBMIT_DIR = Path('../submissions')
# Make sure the output folder exists; a pre-existing folder is not an error.
SUBMIT_DIR.mkdir(parents=True, exist_ok=True)

# --- Load the competition files ---------------------------------------------
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
sample = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# The sample submission supplies the column names: its first column is used
# as the row identifier and its second column as the label to predict.
ID_COL = sample.columns[0]
TARGET_COL = sample.columns[1]

# --- Split into features and target -----------------------------------------
# errors='ignore' lets the drop succeed even if a listed column is absent.
X = train.drop(columns=[TARGET_COL, ID_COL], errors='ignore')
X = X.copy()
# Labels are handled as strings throughout.
y = train[TARGET_COL].astype(str)
X_test = test.drop(columns=[ID_COL], errors='ignore')
X_test = X_test.copy()

# Deterministic categorical encoding using combined levels.
#
# Each text/categorical feature is replaced by integer codes taken from ONE
# category list built from train and test together, so a given level maps to
# the same integer in both frames. Missing values get the code -1 (the
# sentinel pd.Categorical uses for "not in categories"), so encoded columns
# contain no NaN afterwards and are skipped by any later NaN-based imputation.
for col in X.columns:
    col_dtype = X[col].dtype
    # A plain `dtype == 'object'` comparison misses pandas' dedicated string
    # dtypes (the default for text columns in pandas 3), which would leave raw
    # strings in X and make the estimators fail. Check via the dtype API.
    needs_encoding = (
        isinstance(col_dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
    )
    if not needs_encoding:
        continue

    # Union of the levels seen in either frame.
    levels = pd.concat([X[col], X_test[col]], axis=0).astype('category').cat.categories
    X[col] = pd.Categorical(X[col], categories=levels).codes
    X_test[col] = pd.Categorical(X_test[col], categories=levels).codes

# Median impute.
#
# Fill remaining NaNs column by column. Two deliberate choices:
#   * The fill value is always the TRAINING median, so train and test rows go
#     through the same transformation and test-set statistics do not shape the
#     features.
#   * The filled column is assigned back (`df[c] = ...`) instead of calling
#     `df[c].fillna(..., inplace=True)`. That chained in-place form acts on a
#     temporary object: it warns in pandas 2.x and silently leaves the frame
#     unchanged under Copy-on-Write / pandas 3.
for c in X.columns:
    if not (X[c].isna().any() or X_test[c].isna().any()):
        continue  # nothing to fill in either frame
    fill_value = X[c].median()
    X[c] = X[c].fillna(fill_value)
    X_test[c] = X_test[c].fillna(fill_value)

# The four tree-based classifiers to compare. Each key doubles as the row
# label in the CV report and as the stem of the submission file name.
models = {
    # Single CART tree.
    'Model1_DecisionTree': DecisionTreeClassifier(
        criterion='gini', min_samples_leaf=3, random_state=RANDOM_STATE),
    # Bootstrap-aggregated trees. The base learner is passed positionally:
    # the keyword was `base_estimator` before scikit-learn 1.2 and is
    # `estimator` from 1.2 on (the old name was removed in 1.4), but it is the
    # first positional parameter under both names.
    'Model2_Bagging': BaggingClassifier(
        DecisionTreeClassifier(min_samples_leaf=3, random_state=RANDOM_STATE),
        n_estimators=60, bootstrap=True, random_state=RANDOM_STATE),
    # Random forest: each split considers sqrt(n_features) candidates.
    'Model3_RandomForest': RandomForestClassifier(
        n_estimators=120, max_features='sqrt', min_samples_leaf=2,
        random_state=RANDOM_STATE),
    # Gradient boosting with shallow (depth-2) trees and a small learning rate.
    'Model4_Boosting': GradientBoostingClassifier(
        n_estimators=80, learning_rate=0.05, max_depth=2,
        random_state=RANDOM_STATE),
}

# Metrics computed on every CV fold: plain accuracy and macro-averaged F1.
# The keys become the 'test_<key>' entries in cross_validate's result.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    macro_f1=make_scorer(f1_score, average='macro'),
)

# Five shuffled, stratified folds with a fixed seed; the same splitter object
# is handed to every model's cross-validation.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Mean cross-validation metrics per model, filled in by the loop below.
cv_report = {}

for name, model in models.items():
    # 1) Cross-validate with the shared splitter and keep the fold averages.
    scores = cross_validate(
        model, X, y,
        cv=cv, scoring=scoring, return_train_score=False,
    )
    cv_report[name] = {
        'accuracy_mean': float(np.mean(scores['test_accuracy'])),
        'macro_f1_mean': float(np.mean(scores['test_macro_f1'])),
    }

    # 2) Refit on all training rows, then predict the test set as strings.
    model.fit(X, y)
    preds = pd.Series(model.predict(X_test)).astype(str)

    # 3) Assemble the submission frame.
    if ID_COL in test.columns:
        # Attach predictions to the sample file's ids with a left join, which
        # keeps the sample file's rows and order.
        predicted = pd.DataFrame({ID_COL: test[ID_COL], TARGET_COL: preds})
        sub = (
            sample.copy()
            .drop(columns=[TARGET_COL], errors='ignore')
            .merge(predicted, on=ID_COL, how='left')
        )
    else:
        # No id column in the test file: fall back to positional assignment.
        sub = sample.copy()
        sub[TARGET_COL] = preds.values[: len(sub)]

    # 4) One CSV per model, named after the model key.
    sub.to_csv(SUBMIT_DIR / f"{name}.csv", index=False)

# Final expression of the cell: one row per model, one column per metric.
pd.DataFrame(cv_report).T
