# C2: LightGBM (Comparative Boosted Model)

This notebook evaluates a LightGBM model as a comparative gradient boosting approach.

**Purpose:**  
To compare the performance and stability of an alternative boosting framework with the final CatBoost model.


In [1]:
from pathlib import Path
import sys

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from IPython.display import display

from lightgbm import LGBMClassifier


In [2]:
# PATHS
# The kernel's working directory is treated as the notebook folder, and its
# parent as the project root (assumes the notebook is launched from there).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

SRC_DIR = PROJECT_ROOT.joinpath("src")
DATA_DIR = PROJECT_ROOT.joinpath("data")
RESULTS_DIR = PROJECT_ROOT.joinpath("results", "C2")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# src/ goes to the front of the import path so the project helpers imported
# just below are resolved from this repository. This import has to stay after
# the sys.path change, which is why it is not in the top import cell.
sys.path.insert(0, str(SRC_DIR))

from common_setup import (
    build_preprocessor,
    load_arff,
    prepare_df_AQ_DEMO,
    run_experiment,
)

# Input datasets (ARFF files) for the two cohorts.
ADULT_ARFF_PATH = DATA_DIR.joinpath("Autism-Adult-Data.arff")
CHILD_ARFF_PATH = DATA_DIR.joinpath("Autism-Child-Data.arff")

In [3]:
# LOAD DATA
# Read both cohort files with the project loader, adult first, then child.
adult_raw, child_raw = (
    load_arff(arff_path) for arff_path in (ADULT_ARFF_PATH, CHILD_ARFF_PATH)
)

# Quick shape check so a wrong or truncated file is noticed immediately.
print("\nLoaded adult shape:", adult_raw.shape)
print("Loaded child shape:", child_raw.shape)



Loaded adult shape: (704, 21)
Loaded child shape: (292, 21)


In [4]:
# PREP FEATURES (AQ + DEMO)
# prepare_df_AQ_DEMO returns the feature matrix, the label vector and three
# column lists; the second argument is the cohort tag used in its log lines.
Xa, ya, num_adult, cat_adult, feat_adult = prepare_df_AQ_DEMO(adult_raw, "Adult")
Xc, yc, num_child, cat_child, feat_child = prepare_df_AQ_DEMO(child_raw, "Child")

# Majority-class baseline: the accuracy obtained by always predicting the more
# frequent of the two labels (0 / 1). Any useful model has to beat this.
print("\nMajority-class baseline accuracy:")
for cohort_label, labels in (("  Adult:", ya), ("  Child:", yc)):
    share_negative = np.mean(labels == 0)
    share_positive = np.mean(labels == 1)
    print(cohort_label, max(share_negative, share_positive))


[Adult] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] Dropping leaky column 'result'
[Adult] Using label column: 'Class/ASD'
[Adult] X shape: (704, 19)
[Adult] y counts: [515 189]

[Child] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Child] Dropping leaky column 'result'
[Child] Using label column: 'Class/ASD'
[Child] X shape: (292, 19)
[Child] y counts: [151 141]

Majority-class baseline accuracy:
  Adult: 0.7315340909090909
  Child: 0.5171232876712328


In [5]:
# BUILD PIPELINES (One-hot + LightGBM)

# Single source of truth for the LightGBM hyperparameters. Both cohorts are
# meant to use the same configuration, so it is defined once instead of being
# copy-pasted per model (which invites the two copies drifting apart).
LGBM_PARAMS = dict(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1,  # silence LightGBM's per-fit logging
)
# NOTE(review): per the LightGBM docs, row subsampling (`subsample`) is only
# applied when `subsample_freq` > 0, and that parameter is left at its default
# here. It is deliberately NOT changed so the recorded results stay valid --
# confirm whether bagging was actually intended.

# One preprocessor per cohort, built from that cohort's own column lists.
pre_adult = build_preprocessor(num_adult, cat_adult)
pre_child = build_preprocessor(num_child, cat_child)

# Separate estimator instances so the adult and child fits never share state.
c2_adult_model = LGBMClassifier(**LGBM_PARAMS)
c2_child_model = LGBMClassifier(**LGBM_PARAMS)

# Preprocessing lives inside the pipeline, so it is fitted together with the
# classifier whenever the pipeline is fitted.
c2_adult_pipe = Pipeline([
    ("prep", pre_adult),
    ("clf", c2_adult_model),
])

c2_child_pipe = Pipeline([
    ("prep", pre_child),
    ("clf", c2_child_model),
])


In [6]:
# 5-FOLD CV
# Both cohorts are evaluated with the same split count and seed.
cv_settings = {"n_splits": 5, "random_state": 42}

# run_experiment returns a per-fold table and a summary table for each cohort.
adult_folds, adult_summary = run_experiment(
    Xa,
    ya,
    c2_adult_pipe,
    exp_name="C2 Adult (LightGBM AQ+Demo)",
    **cv_settings,
)

child_folds, child_summary = run_experiment(
    Xc,
    yc,
    c2_child_pipe,
    exp_name="C2 Child (LightGBM AQ+Demo)",
    **cv_settings,
)

[C2 Adult (LightGBM AQ+Demo)] Fold 1: acc=0.9716, prec=0.9474, rec=0.9474, f1=0.9474, roc_auc=0.9962, pr_auc=0.9900
[C2 Adult (LightGBM AQ+Demo)] Fold 2: acc=0.9362, prec=1.0000, rec=0.7632, f1=0.8657, roc_auc=0.9934, pr_auc=0.9818
[C2 Adult (LightGBM AQ+Demo)] Fold 3: acc=0.9645, prec=0.9231, rec=0.9474, f1=0.9351, roc_auc=0.9931, pr_auc=0.9832
[C2 Adult (LightGBM AQ+Demo)] Fold 4: acc=0.9574, prec=0.9706, rec=0.8684, f1=0.9167, roc_auc=0.9972, pr_auc=0.9927
[C2 Adult (LightGBM AQ+Demo)] Fold 5: acc=0.9286, prec=0.8293, rec=0.9189, f1=0.8718, roc_auc=0.9890, pr_auc=0.9734

[C2 Adult (LightGBM AQ+Demo)] 5-fold mean metrics:
  accuracy: 0.9517
  precision: 0.9341
  recall: 0.8890
  f1: 0.9073
  roc_auc: 0.9938
  pr_auc: 0.9842
[C2 Child (LightGBM AQ+Demo)] Fold 1: acc=0.9661, prec=0.9643, rec=0.9643, f1=0.9643, roc_auc=0.9850, pr_auc=0.9880
[C2 Child (LightGBM AQ+Demo)] Fold 2: acc=0.9492, prec=1.0000, rec=0.8966, f1=0.9455, roc_auc=0.9862, pr_auc=0.9893
[C2 Child (LightGBM AQ+Demo)] Fo

In [7]:
# SAVE OUTPUTS
# Output file name -> table to persist. Written in this order, without the
# DataFrame index.
c2_outputs = {
    "C2_adult_folds.csv": adult_folds,
    "C2_adult_summary.csv": adult_summary,
    "C2_child_folds.csv": child_folds,
    "C2_child_summary.csv": child_summary,
}
for out_name, out_frame in c2_outputs.items():
    out_frame.to_csv(RESULTS_DIR / out_name, index=False)

# Report the location relative to the project root: printing the absolute path
# would bake a machine- and user-specific directory into the saved cell output.
print("\nSaved C2 results to:", RESULTS_DIR.relative_to(PROJECT_ROOT).as_posix())



Saved C2 results to: C:\Users\14ush\Desktop\asc-screening-xai-dissertation\results\C2


In [8]:
# SHUFFLED-LABEL SANITY CHECK

def shuffled_label_run(X, y, pipe, exp_name, random_state=42, n_splits=5):
    """Run the cross-validation experiment against randomly permuted labels.

    Sanity check: once the labels are shuffled, the features carry no real
    information about them, so the reported metrics should fall to roughly
    chance level. Clearly better-than-chance scores would point to leakage or
    a flaw in the evaluation procedure.

    Parameters
    ----------
    X : features, passed through to ``run_experiment`` unchanged.
    y : labels; a shuffled copy is evaluated, the argument itself is not
        modified.
    pipe : estimator/pipeline to evaluate. An unfitted clone is handed to
        ``run_experiment`` so the caller's object is never left fitted on
        permuted labels.
    exp_name : experiment name; ``" (SHUFFLED LABELS)"`` is appended to it.
    random_state : seed used both for the label shuffle and for
        ``run_experiment``.
    n_splits : number of CV splits forwarded to ``run_experiment``
        (defaults to 5, the value previously hard-coded).

    Returns
    -------
    (folds_df, summary_df) exactly as returned by ``run_experiment``.
    """
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    y_shuf = shuffle(y, random_state=random_state)
    folds_df, summary_df = run_experiment(
        X, y_shuf, clone(pipe),
        exp_name=f"{exp_name} (SHUFFLED LABELS)",
        n_splits=n_splits,
        random_state=random_state
    )
    return folds_df, summary_df

# Repeat the evaluation for both cohorts with permuted labels.
adult_shuf_folds, adult_shuf_summary = shuffled_label_run(
    Xa, ya, c2_adult_pipe, "C2 Adult", random_state=42
)
child_shuf_folds, child_shuf_summary = shuffled_label_run(
    Xc, yc, c2_child_pipe, "C2 Child", random_state=42
)

# Output file name -> table to persist. Written in this order, without the
# DataFrame index.
shuffled_outputs = {
    "C2_adult_shuffled_folds.csv": adult_shuf_folds,
    "C2_adult_shuffled_summary.csv": adult_shuf_summary,
    "C2_child_shuffled_folds.csv": child_shuf_folds,
    "C2_child_shuffled_summary.csv": child_shuf_summary,
}
for shuf_name, shuf_frame in shuffled_outputs.items():
    shuf_frame.to_csv(RESULTS_DIR / shuf_name, index=False)

# Report the location relative to the project root: printing the absolute path
# would bake a machine- and user-specific directory into the saved cell output.
print(
    "\nSaved shuffled-label folds + summaries to:",
    RESULTS_DIR.relative_to(PROJECT_ROOT).as_posix(),
)

# Show the real-label summaries next to the shuffled-label ones so the drop
# towards chance level can be read off directly.
summary_views = [
    ("\nC2 Adult mean metrics:", adult_summary),
    ("\nC2 Child mean metrics:", child_summary),
    ("\nC2 Adult shuffled-label mean metrics:", adult_shuf_summary),
    ("\nC2 Child shuffled-label mean metrics:", child_shuf_summary),
]
for view_heading, view_table in summary_views:
    print(view_heading)
    display(view_table)

[C2 Adult (SHUFFLED LABELS)] Fold 1: acc=0.6667, prec=0.2353, rec=0.1053, f1=0.1455, roc_auc=0.5082, pr_auc=0.2772
[C2 Adult (SHUFFLED LABELS)] Fold 2: acc=0.6170, prec=0.2143, rec=0.1579, f1=0.1818, roc_auc=0.4714, pr_auc=0.2670
[C2 Adult (SHUFFLED LABELS)] Fold 3: acc=0.6312, prec=0.2812, rec=0.2368, f1=0.2571, roc_auc=0.5061, pr_auc=0.2759
[C2 Adult (SHUFFLED LABELS)] Fold 4: acc=0.6241, prec=0.1429, rec=0.0789, f1=0.1017, roc_auc=0.4847, pr_auc=0.2637
[C2 Adult (SHUFFLED LABELS)] Fold 5: acc=0.6786, prec=0.3571, rec=0.2703, f1=0.3077, roc_auc=0.6119, pr_auc=0.3833

[C2 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.6435
  precision: 0.2462
  recall: 0.1698
  f1: 0.1988
  roc_auc: 0.5165
  pr_auc: 0.2934
[C2 Child (SHUFFLED LABELS)] Fold 1: acc=0.5593, prec=0.5357, rec=0.5357, f1=0.5357, roc_auc=0.4988, pr_auc=0.5115
[C2 Child (SHUFFLED LABELS)] Fold 2: acc=0.4746, prec=0.4615, rec=0.4138, f1=0.4364, roc_auc=0.4494, pr_auc=0.4594
[C2 Child (SHUFFLED LABELS)] Fold 3: acc

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.951672,0.93406,0.889047,0.907313,0.993759,0.984227



C2 Child mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.955465,0.971429,0.936453,0.953112,0.990675,0.991884



C2 Adult shuffled-label mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.643516,0.246166,0.169844,0.198761,0.516455,0.29342



C2 Child shuffled-label mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.527469,0.509087,0.489901,0.498332,0.522155,0.541312
