# C1: CatBoost (Final Model)

This notebook implements the final selected CatBoost model using AQ-10 and demographic features.

**Purpose:**  
To develop a robust, high-performing, and explainable screening model suitable for SHAP-based interpretation.


In [13]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.utils import shuffle
from IPython.display import display

from catboost import CatBoostClassifier

In [15]:
# PATHS
# The project layout is resolved relative to the kernel's working directory,
# i.e. this assumes the notebook is run from a folder that sits directly
# under the project root (next to data/ and src/).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

DATA_DIR = PROJECT_ROOT / "data"
SRC_DIR = PROJECT_ROOT / "src"
RESULTS_DIR = PROJECT_ROOT / "results" / "C1"

# Fail fast when the working directory is not where we expect it to be.
# Without this check, mkdir(parents=True) below would quietly create a stray
# results/C1 tree in the wrong place before the import or data load fails.
for _required_dir in (DATA_DIR, SRC_DIR):
    if not _required_dir.is_dir():
        raise FileNotFoundError(
            f"Expected project directory not found: {_required_dir} "
            f"(current working directory: {NOTEBOOK_DIR})"
        )

RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Make the project's helper module importable. Guarded so that re-running
# this cell does not keep prepending duplicate entries to sys.path.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from common_setup import load_arff, prepare_df_AQ_DEMO

ADULT_ARFF_PATH = DATA_DIR / "Autism-Adult-Data.arff"
CHILD_ARFF_PATH = DATA_DIR / "Autism-Child-Data.arff"

In [17]:
# Load the two raw ARFF files with the project helper imported from common_setup.
adult_raw = load_arff(ADULT_ARFF_PATH)
child_raw = load_arff(CHILD_ARFF_PATH)

print("\nLoaded adult shape:", adult_raw.shape)
print("Loaded child shape:", child_raw.shape)

# prepare_df_AQ_DEMO (from common_setup, not shown in this notebook) returns a
# 5-tuple. X* / y* are the feature table and labels passed to the CV routine
# below, and cat_* are the column names handed to CatBoost as categorical.
# num_* and feat_* are presumably the numeric column names and the full
# feature list -- TODO confirm against common_setup.
Xa, ya, num_adult, cat_adult, feat_adult = prepare_df_AQ_DEMO(adult_raw, "Adult")
Xc, yc, num_child, cat_child, feat_child = prepare_df_AQ_DEMO(child_raw, "Child")

print("\nAdult cat cols:", cat_adult)
print("Child cat cols:", cat_child)



Loaded adult shape: (704, 21)
Loaded child shape: (292, 21)

[Adult] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] Dropping leaky column 'result'
[Adult] Using label column: 'Class/ASD'
[Adult] X shape: (704, 19)
[Adult] y counts: [515 189]

[Child] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Child] Dropping leaky column 'result'
[Child] Using label column: 'Class/ASD'
[Child] X shape: (292, 19)
[Child] y counts: [151 141]

Adult cat cols: ['gender', 'ethnicity', 'contry_of_res', 'used_app_before', 'age_desc', 'relation']
Child cat cols: ['gender', 'ethnicity', 'contry_of_res', 'used_app_before', 'age_desc', 'relation']


In [19]:
def run_catboost_cv(X, y, cat_cols, exp_name, n_splits=5, random_state=42):
    """Cross-validate a fixed-hyperparameter CatBoost classifier.

    Runs shuffled stratified K-fold CV, fits a fresh ``CatBoostClassifier`` on
    each training split, scores it on the held-out split, and prints per-fold
    and mean metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Rows are selected positionally via ``.iloc``.
    y : array-like
        Binary labels aligned row-for-row with ``X``. Converted to a NumPy
        array internally so fold indices are always applied by position, even
        when a pandas Series with a non-default index is passed.
    cat_cols : list of str
        Names of the columns of ``X`` that CatBoost should treat as categorical.
    exp_name : str
        Label used as a prefix in the printed progress lines.
    n_splits : int, default 5
        Number of CV folds.
    random_state : int, default 42
        Seed used for both the fold shuffling and the CatBoost model.

    Returns
    -------
    folds_df : pandas.DataFrame
        One row per fold with columns ``fold``, ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``roc_auc``, ``pr_auc``.
    summary_df : pandas.DataFrame
        Single-row frame holding the mean of each metric across folds.
    """
    metric_names = ["accuracy", "precision", "recall", "f1", "roc_auc", "pr_auc"]

    # The splitter yields positional indices. Indexing a pandas Series with
    # them via ``y[idx]`` is label-based and can return the wrong rows (or
    # raise KeyError) when the index is not 0..n-1, so work on an ndarray.
    y_arr = np.asarray(y)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # CatBoost needs categorical feature indices (positions in X)
    cat_idx = [X.columns.get_loc(c) for c in cat_cols]
    print(f"\n[{exp_name}] CatBoost categorical indices:", cat_idx)

    fold_rows = []

    for fold_idx, (tr, te) in enumerate(cv.split(X, y_arr), 1):
        Xtr, Xte = X.iloc[tr].copy(), X.iloc[te].copy()
        ytr, yte = y_arr[tr], y_arr[te]

        model = CatBoostClassifier(
            loss_function="Logloss",
            depth=5,
            learning_rate=0.1,
            iterations=400,
            random_seed=random_state,
            verbose=False
        )

        # NOTE: Keep CV "clean" by NOT using eval_set on the test fold.
        # Verbosity is already disabled in the constructor above.
        model.fit(Xtr, ytr, cat_features=cat_idx)

        y_pred = model.predict(Xte).astype(int).ravel()
        y_proba = model.predict_proba(Xte)[:, 1]

        # Threshold metrics use the hard predictions; ranking metrics use the
        # positive-class probability.
        acc = accuracy_score(yte, y_pred)
        prec = precision_score(yte, y_pred, zero_division=0)
        rec = recall_score(yte, y_pred, zero_division=0)
        f1 = f1_score(yte, y_pred, zero_division=0)
        roc = roc_auc_score(yte, y_proba)
        pr = average_precision_score(yte, y_proba)

        fold_row = {"fold": fold_idx}
        fold_row.update(zip(metric_names, [acc, prec, rec, f1, roc, pr]))
        fold_rows.append(fold_row)

        print(f"[{exp_name}] Fold {fold_idx}: "
              f"acc={acc:.4f}, prec={prec:.4f}, rec={rec:.4f}, "
              f"f1={f1:.4f}, roc_auc={roc:.4f}, pr_auc={pr:.4f}")

    # Mean of each metric over the folds, in the same order as metric_names.
    summary = {
        m: float(np.mean([row[m] for row in fold_rows]))
        for m in metric_names
    }

    print(f"\n[{exp_name}] {n_splits}-fold mean metrics:")
    for k, v in summary.items():
        print(f"  {k}: {v:.4f}")

    folds_df = pd.DataFrame(fold_rows)
    summary_df = pd.DataFrame([summary])
    return folds_df, summary_df


In [21]:
# Cross-validate the C1 CatBoost model on each cohort with identical settings.
adult_folds, adult_summary = run_catboost_cv(
    Xa, ya, cat_adult,
    exp_name="C1 Adult (CatBoost AQ+Demo)", n_splits=5, random_state=42,
)
child_folds, child_summary = run_catboost_cv(
    Xc, yc, cat_child,
    exp_name="C1 Child (CatBoost AQ+Demo)", n_splits=5, random_state=42,
)

# Persist per-fold and mean metrics; the dict's insertion order is the write order.
_c1_outputs = {
    "C1_adult_folds.csv": adult_folds,
    "C1_adult_summary.csv": adult_summary,
    "C1_child_folds.csv": child_folds,
    "C1_child_summary.csv": child_summary,
}
for _csv_name, _frame in _c1_outputs.items():
    _frame.to_csv(RESULTS_DIR / _csv_name, index=False)

print("\nSaved C1 results to:", RESULTS_DIR)
for _summary_table in (adult_summary, child_summary):
    display(_summary_table)



[C1 Adult (CatBoost AQ+Demo)] CatBoost categorical indices: [13, 14, 15, 16, 17, 18]
[C1 Adult (CatBoost AQ+Demo)] Fold 1: acc=0.9929, prec=0.9744, rec=1.0000, f1=0.9870, roc_auc=0.9995, pr_auc=0.9986
[C1 Adult (CatBoost AQ+Demo)] Fold 2: acc=0.9787, prec=1.0000, rec=0.9211, f1=0.9589, roc_auc=1.0000, pr_auc=1.0000
[C1 Adult (CatBoost AQ+Demo)] Fold 3: acc=0.9574, prec=0.8810, rec=0.9737, f1=0.9250, roc_auc=0.9969, pr_auc=0.9917
[C1 Adult (CatBoost AQ+Demo)] Fold 4: acc=0.9645, prec=0.9459, rec=0.9211, f1=0.9333, roc_auc=0.9985, pr_auc=0.9959
[C1 Adult (CatBoost AQ+Demo)] Fold 5: acc=0.9929, prec=0.9737, rec=1.0000, f1=0.9867, roc_auc=1.0000, pr_auc=1.0000

[C1 Adult (CatBoost AQ+Demo)] 5-fold mean metrics:
  accuracy: 0.9773
  precision: 0.9550
  recall: 0.9632
  f1: 0.9582
  roc_auc: 0.9990
  pr_auc: 0.9972

[C1 Child (CatBoost AQ+Demo)] CatBoost categorical indices: [13, 14, 15, 16, 17, 18]
[C1 Child (CatBoost AQ+Demo)] Fold 1: acc=0.8644, prec=0.7941, rec=0.9643, f1=0.8710, roc_au

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.977295,0.954988,0.963158,0.958183,0.998978,0.997248


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.952309,0.938594,0.971675,0.953128,0.996045,0.996068


In [22]:
def shuffled_label_summary(X, y, cat_cols, name, random_state=42):
    """Run the CatBoost CV on randomly permuted labels as a sanity check.

    With the feature/label pairing destroyed, the reported metrics are
    expected to be close to chance level.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, passed through unchanged.
    y : array-like
        Labels aligned row-for-row with ``X``; a permuted copy is evaluated.
    cat_cols : list of str
        Categorical column names forwarded to ``run_catboost_cv``.
    name : str
        Experiment label; ``" (SHUFFLED LABELS)"`` is appended for the logs.
    random_state : int, default 42
        Seed for both the label permutation and the cross-validation.

    Returns
    -------
    (folds_df, summary_df)
        Exactly what ``run_catboost_cv`` returns for the permuted labels.
    """
    # Permute a plain ndarray rather than the caller's object. sklearn's
    # shuffle keeps a pandas Series' index labels attached to the permuted
    # values, so any label-based lookup downstream (e.g. ``y[idx]``) would
    # silently restore the original pairing and defeat the check.
    y_shuf = shuffle(np.asarray(y), random_state=random_state)
    folds_df, summary_df = run_catboost_cv(
        X, y_shuf, cat_cols,
        exp_name=f"{name} (SHUFFLED LABELS)",
        n_splits=5,
        random_state=random_state
    )
    return folds_df, summary_df

# Shuffled-label control runs for both cohorts, using the same seed as the main CV.
adult_shuf_folds, adult_shuf_summary = shuffled_label_summary(
    Xa, ya, cat_adult, "C1 Adult", random_state=42,
)
child_shuf_folds, child_shuf_summary = shuffled_label_summary(
    Xc, yc, cat_child, "C1 Child", random_state=42,
)

# Write fold tables first, then summaries; the dict's insertion order is the write order.
_shuffled_outputs = {
    "C1_adult_shuffled_folds.csv": adult_shuf_folds,
    "C1_child_shuffled_folds.csv": child_shuf_folds,
    "C1_adult_shuffled_summary.csv": adult_shuf_summary,
    "C1_child_shuffled_summary.csv": child_shuf_summary,
}
for _csv_name, _frame in _shuffled_outputs.items():
    _frame.to_csv(RESULTS_DIR / _csv_name, index=False)

print("\nSaved shuffled-label results to:", RESULTS_DIR)
for _summary_table in (adult_shuf_summary, child_shuf_summary):
    display(_summary_table)



[C1 Adult (SHUFFLED LABELS)] CatBoost categorical indices: [13, 14, 15, 16, 17, 18]
[C1 Adult (SHUFFLED LABELS)] Fold 1: acc=0.7021, prec=0.2500, rec=0.0526, f1=0.0870, roc_auc=0.4959, pr_auc=0.2801
[C1 Adult (SHUFFLED LABELS)] Fold 2: acc=0.7021, prec=0.3571, rec=0.1316, f1=0.1923, roc_auc=0.5547, pr_auc=0.3055
[C1 Adult (SHUFFLED LABELS)] Fold 3: acc=0.7021, prec=0.2500, rec=0.0526, f1=0.0870, roc_auc=0.5386, pr_auc=0.2884
[C1 Adult (SHUFFLED LABELS)] Fold 4: acc=0.6879, prec=0.2857, rec=0.1053, f1=0.1538, roc_auc=0.4852, pr_auc=0.3158
[C1 Adult (SHUFFLED LABELS)] Fold 5: acc=0.7143, prec=0.3636, rec=0.1081, f1=0.1667, roc_auc=0.5293, pr_auc=0.3107

[C1 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.7017
  precision: 0.3013
  recall: 0.0900
  f1: 0.1373
  roc_auc: 0.5207
  pr_auc: 0.3001

[C1 Child (SHUFFLED LABELS)] CatBoost categorical indices: [13, 14, 15, 16, 17, 18]
[C1 Child (SHUFFLED LABELS)] Fold 1: acc=0.5085, prec=0.4828, rec=0.5000, f1=0.4912, roc_auc=0.5415,

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.701722,0.301299,0.090043,0.137347,0.520721,0.300118


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.5173,0.499229,0.48867,0.490807,0.535737,0.537455
