# Ensemble Model: C1 + A2

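This notebook implements a probability-averaged (soft-voting) ensemble of the CatBoost (C1) and Logistic Regression (A2) models: the two models' predicted positive-class probabilities are averaged with equal weight and thresholded at 0.5.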

**Purpose:**  
To explore whether ensembling improves numerical performance, while recognising reduced interpretability compared to single models.


In [3]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from IPython.display import display

from catboost import CatBoostClassifier

In [4]:
# PATHS
# Every location is derived from the kernel's working directory, so nothing
# here is tied to an absolute, machine-specific path.
# NOTE(review): this assumes the notebook is launched from a folder that sits
# directly under the project root (PROJECT_ROOT is simply the parent of cwd).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

DATA_DIR = PROJECT_ROOT / "data"
SRC_DIR = PROJECT_ROOT / "src"
RESULTS_DIR = PROJECT_ROOT / "results" / "ENSEMBLE_C1_A2"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Make the project's `src` modules importable. The membership guard keeps the
# cell idempotent: re-running it no longer prepends a duplicate entry each time.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Project-local helpers; this import must come after the sys.path change above.
from common_setup import (
    load_arff,
    prepare_df_AQ_DEMO,
    build_preprocessor
)

ADULT_ARFF_PATH = DATA_DIR / "Autism-Adult-Data.arff"
CHILD_ARFF_PATH = DATA_DIR / "Autism-Child-Data.arff"



In [5]:
# LOAD DATA

# Read both ARFF files with the project loader (adult first, then child).
adult_raw, child_raw = (
    load_arff(arff_path) for arff_path in (ADULT_ARFF_PATH, CHILD_ARFF_PATH)
)

# Quick shape report so a wrong or truncated file is obvious straight away.
print("\nLoaded adult shape:", adult_raw.shape)
print("Loaded child shape:", child_raw.shape)


Loaded adult shape: (704, 21)
Loaded child shape: (292, 21)


In [6]:
# PREP FEATURES (AQ + DEMO)

# prepare_df_AQ_DEMO returns, per dataset: the feature frame, the label vector,
# the numeric and categorical column-name lists, and a fifth value (feat_*)
# that is not used further in this notebook.
Xa, ya, num_adult, cat_adult, feat_adult = prepare_df_AQ_DEMO(adult_raw, "Adult")
Xc, yc, num_child, cat_child, feat_child = prepare_df_AQ_DEMO(child_raw, "Child")

# Reference point for the accuracy figures: the share of the more frequent of
# the two label values (0 / 1), i.e. what "always predict the majority" scores.
print("\nMajority-class baseline accuracy:")
adult_baseline = max(np.mean(ya == 0), np.mean(ya == 1))
print("  Adult:", adult_baseline)
child_baseline = max(np.mean(yc == 0), np.mean(yc == 1))
print("  Child:", child_baseline)

# CatBoost categorical feature indices (based on X column order)
cat_idx_adult = list(map(Xa.columns.get_loc, cat_adult))
cat_idx_child = list(map(Xc.columns.get_loc, cat_child))

print("\nAdult CatBoost cat idx:", cat_idx_adult)
print("Child CatBoost cat idx:", cat_idx_child)


[Adult] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] Dropping leaky column 'result'
[Adult] Using label column: 'Class/ASD'
[Adult] X shape: (704, 19)
[Adult] y counts: [515 189]

[Child] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Child] Dropping leaky column 'result'
[Child] Using label column: 'Class/ASD'
[Child] X shape: (292, 19)
[Child] y counts: [151 141]

Majority-class baseline accuracy:
  Adult: 0.7315340909090909
  Child: 0.5171232876712328

Adult CatBoost cat idx: [13, 14, 15, 16, 17, 18]
Child CatBoost cat idx: [13, 14, 15, 16, 17, 18]


In [7]:
# DEFINE MODELS (A2 and C1)


def _make_a2_pipeline(preprocessor):
    """Return the A2 model: the given preprocessor followed by Logistic Regression."""
    return Pipeline(steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=3000, solver="lbfgs")),
    ])


# A2: Logistic Regression pipeline (OneHot+Scaler from common_setup)
# One preprocessor per dataset, built from that dataset's own column lists.
pre_adult = build_preprocessor(num_adult, cat_adult)
pre_child = build_preprocessor(num_child, cat_child)

a2_adult_pipe = _make_a2_pipeline(pre_adult)
a2_child_pipe = _make_a2_pipeline(pre_child)

# C1: CatBoost (fixed hyperparameters)
C1_PARAMS = {
    "loss_function": "Logloss",
    "depth": 5,
    "learning_rate": 0.1,
    "iterations": 400,
    "random_seed": 42,
    "verbose": False,
}

In [8]:
# CV RUNNER FOR ENSEMBLE

def run_ensemble_cv(X, y, lr_pipe, cat_idx, dataset_label, n_splits=5, random_state=42,
                    cb_params=None):
    """Cross-validate the soft-voting ensemble of A2 (Logistic Regression) and C1 (CatBoost).

    For every stratified fold, both models are trained on the training split,
    their positive-class probabilities on the test split are averaged with
    equal weight, and the average is thresholded at 0.5 to obtain class
    predictions. Per-fold and mean metrics are printed as they are computed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; rows are selected positionally with ``.iloc``.
    y : array-like
        Binary labels aligned row-by-row with ``X``. Converted to a NumPy
        array internally so that fold indices are always applied by position.
    lr_pipe : estimator
        The A2 pipeline. It is refit **in place** on each fold's training
        split, so after the call it holds the fit from the last fold.
    cat_idx : list of int
        Column positions passed to CatBoost as ``cat_features``.
    dataset_label : str
        Label used only in the printed progress lines.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling only; CatBoost's seed comes from
        ``cb_params``.
    cb_params : dict or None, default None
        Keyword arguments for ``CatBoostClassifier``. ``None`` falls back to
        the notebook-level ``C1_PARAMS``.

    Returns
    -------
    (folds_df, summary_df) : tuple of pandas.DataFrame
        ``folds_df`` has one row per fold (``fold`` plus the six metrics);
        ``summary_df`` is a single row with the mean of each metric.
    """
    metric_names = ("accuracy", "precision", "recall", "f1", "roc_auc", "pr_auc")

    if cb_params is None:
        cb_params = C1_PARAMS

    # Fold indices are positions. Indexing a pandas Series with them would be a
    # label lookup whenever the Series has an integer index, so work on a plain
    # array instead (a no-op when y already is one).
    y_arr = np.asarray(y)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rows = []
    for fold_idx, (tr, te) in enumerate(cv.split(X, y_arr), 1):
        Xtr, Xte = X.iloc[tr].copy(), X.iloc[te].copy()
        ytr, yte = y_arr[tr], y_arr[te]

        # 1) Train A2 (LR pipeline)
        lr_pipe.fit(Xtr, ytr)
        lr_proba = lr_pipe.predict_proba(Xte)[:, 1]

        # 2) Train C1 (CatBoost) - a fresh model per fold
        cb = CatBoostClassifier(**cb_params)
        cb.fit(Xtr, ytr, cat_features=cat_idx)
        cb_proba = cb.predict_proba(Xte)[:, 1]

        # 3) Soft voting ensemble: equal-weight probability average, 0.5 cut-off
        ens_proba = (lr_proba + cb_proba) / 2.0
        ens_pred = (ens_proba >= 0.5).astype(int)

        # Single source of truth for this fold's scores; threshold metrics use
        # the hard predictions, ranking metrics use the averaged probabilities.
        scores = {
            "accuracy": accuracy_score(yte, ens_pred),
            "precision": precision_score(yte, ens_pred, zero_division=0),
            "recall": recall_score(yte, ens_pred, zero_division=0),
            "f1": f1_score(yte, ens_pred, zero_division=0),
            "roc_auc": roc_auc_score(yte, ens_proba),
            "pr_auc": average_precision_score(yte, ens_proba),
        }
        fold_rows.append({"fold": fold_idx, **scores})

        print(
            f"[Ensemble C1+A2 {dataset_label}] Fold {fold_idx}: "
            f"acc={scores['accuracy']:.4f}, prec={scores['precision']:.4f}, "
            f"rec={scores['recall']:.4f}, f1={scores['f1']:.4f}, "
            f"roc_auc={scores['roc_auc']:.4f}, pr_auc={scores['pr_auc']:.4f}"
        )

    # Mean over folds, computed from the same per-fold records that are returned.
    summary = {
        name: float(np.mean([row[name] for row in fold_rows]))
        for name in metric_names
    }
    print(f"\n[Ensemble C1+A2 {dataset_label}] {n_splits}-fold mean metrics:")
    for k, v in summary.items():
        print(f"  {k}: {v:.4f}")

    folds_df = pd.DataFrame(fold_rows)
    summary_df = pd.DataFrame([summary])
    return folds_df, summary_df


In [9]:
# RUN ENSEMBLE CV (Adult + Child)

# Same fold count and seed for both datasets so the two runs are comparable.
adult_folds, adult_summary = run_ensemble_cv(
    X=Xa,
    y=ya,
    lr_pipe=a2_adult_pipe,
    cat_idx=cat_idx_adult,
    dataset_label="Adult",
    n_splits=5,
    random_state=42,
)

child_folds, child_summary = run_ensemble_cv(
    X=Xc,
    y=yc,
    lr_pipe=a2_child_pipe,
    cat_idx=cat_idx_child,
    dataset_label="Child",
    n_splits=5,
    random_state=42,
)

[Ensemble C1+A2 Adult] Fold 1: acc=0.9929, prec=0.9744, rec=1.0000, f1=0.9870, roc_auc=1.0000, pr_auc=1.0000
[Ensemble C1+A2 Adult] Fold 2: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[Ensemble C1+A2 Adult] Fold 3: acc=0.9858, prec=0.9500, rec=1.0000, f1=0.9744, roc_auc=0.9997, pr_auc=0.9993
[Ensemble C1+A2 Adult] Fold 4: acc=0.9929, prec=1.0000, rec=0.9737, f1=0.9867, roc_auc=1.0000, pr_auc=1.0000
[Ensemble C1+A2 Adult] Fold 5: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000

[Ensemble C1+A2 Adult] 5-fold mean metrics:
  accuracy: 0.9943
  precision: 0.9849
  recall: 0.9947
  f1: 0.9896
  roc_auc: 0.9999
  pr_auc: 0.9999
[Ensemble C1+A2 Child] Fold 1: acc=0.9492, prec=0.9032, rec=1.0000, f1=0.9492, roc_auc=0.9965, pr_auc=0.9965
[Ensemble C1+A2 Child] Fold 2: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[Ensemble C1+A2 Child] Fold 3: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1

In [10]:
# SAVE OUTPUTS

# Per-fold and summary tables for each dataset, written without the index column.
adult_folds.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_adult_folds.csv"), index=False)
adult_summary.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_adult_summary.csv"), index=False)

child_folds.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_child_folds.csv"), index=False)
child_summary.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_child_summary.csv"), index=False)

print("\nSaved ENSEMBLE results to:", RESULTS_DIR)

# Show the mean-metric rows inline as rendered tables.
print("\nAdult ensemble summary:")
display(adult_summary)

print("\nChild ensemble summary:")
display(child_summary)


Saved ENSEMBLE results to: C:\Users\14ush\Desktop\asc-screening-xai-dissertation\results\ENSEMBLE_C1_A2

Adult ensemble summary:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.994326,0.984872,0.994737,0.989608,0.999949,0.999865



Child ensemble summary:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.986382,0.973749,1.0,0.986322,0.999309,0.999309


In [11]:
# SHUFFLED-LABEL SANITY CHECK (save folds + summary)

def shuffled_label_ensemble(X, y, lr_pipe, cat_idx, dataset_label, n_splits=5, random_state=42):
    """Run the ensemble cross-validation on permuted labels as a sanity check.

    The labels are shuffled (seeded with ``random_state``) while ``X`` is left
    untouched, then evaluated with ``run_ensemble_cv`` using the same fold
    count and seed. The printed progress lines carry a
    ``(SHUFFLED LABELS)`` suffix on ``dataset_label``.

    Returns the ``(folds_df, summary_df)`` pair produced by ``run_ensemble_cv``.
    """
    permuted_labels = shuffle(y, random_state=random_state)
    return run_ensemble_cv(
        X,
        permuted_labels,
        lr_pipe,
        cat_idx,
        dataset_label=f"{dataset_label} (SHUFFLED LABELS)",
        n_splits=n_splits,
        random_state=random_state,
    )

# Shuffled-label runs use the same settings as the real runs above.
adult_shuf_folds, adult_shuf_summary = shuffled_label_ensemble(
    X=Xa,
    y=ya,
    lr_pipe=a2_adult_pipe,
    cat_idx=cat_idx_adult,
    dataset_label="Adult",
    n_splits=5,
    random_state=42,
)

child_shuf_folds, child_shuf_summary = shuffled_label_ensemble(
    X=Xc,
    y=yc,
    lr_pipe=a2_child_pipe,
    cat_idx=cat_idx_child,
    dataset_label="Child",
    n_splits=5,
    random_state=42,
)

# Persist the sanity-check tables next to the real results, without the index column.
adult_shuf_folds.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_adult_shuffled_folds.csv"), index=False)
adult_shuf_summary.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_adult_shuffled_summary.csv"), index=False)

child_shuf_folds.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_child_shuffled_folds.csv"), index=False)
child_shuf_summary.to_csv(RESULTS_DIR.joinpath("Ensemble_C1_A2_child_shuffled_summary.csv"), index=False)

print("\nSaved shuffled-label folds + summaries to:", RESULTS_DIR)

print("\nAdult shuffled-label summary:")
display(adult_shuf_summary)


print("\nChild shuffled-label summary:")
display(child_shuf_summary)


[Ensemble C1+A2 Adult (SHUFFLED LABELS)] Fold 1: acc=0.7021, prec=0.1667, rec=0.0263, f1=0.0455, roc_auc=0.5243, pr_auc=0.2834
[Ensemble C1+A2 Adult (SHUFFLED LABELS)] Fold 2: acc=0.7305, prec=0.5000, rec=0.0789, f1=0.1364, roc_auc=0.5588, pr_auc=0.3558
[Ensemble C1+A2 Adult (SHUFFLED LABELS)] Fold 3: acc=0.7234, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.5194, pr_auc=0.2882
[Ensemble C1+A2 Adult (SHUFFLED LABELS)] Fold 4: acc=0.7234, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.5120, pr_auc=0.2994
[Ensemble C1+A2 Adult (SHUFFLED LABELS)] Fold 5: acc=0.7143, prec=0.2857, rec=0.0541, f1=0.0909, roc_auc=0.5117, pr_auc=0.3046

[Ensemble C1+A2 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.7187
  precision: 0.1905
  recall: 0.0319
  f1: 0.0545
  roc_auc: 0.5252
  pr_auc: 0.3063
[Ensemble C1+A2 Child (SHUFFLED LABELS)] Fold 1: acc=0.5254, prec=0.5000, rec=0.4286, f1=0.4615, roc_auc=0.4919, pr_auc=0.5078
[Ensemble C1+A2 Child (SHUFFLED LABELS)] Fold 2: acc=0.4915, prec=0.485

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.718744,0.190476,0.031863,0.054545,0.525228,0.3063



Child shuffled-label summary:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.499942,0.478541,0.48867,0.480951,0.500715,0.505055
