# B1: Class-weighted Logistic Regression

This notebook implements a class-weighted logistic regression model to address class imbalance in the autism screening datasets.

**Purpose:**  
To evaluate the impact of class weighting on screening performance and robustness under imbalanced class distributions.


In [1]:
from pathlib import Path
import sys
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from IPython.display import display


In [18]:
# PATHS
# Every project location is derived from the kernel's working directory.
# NOTE(review): this assumes the kernel was started in the notebook's own
# folder, one level below the project root — confirm when launching elsewhere.

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

DATA_DIR = PROJECT_ROOT / "data"
SRC_DIR = PROJECT_ROOT / "src"
RESULTS_DIR = PROJECT_ROOT / "results" / "B1"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Make sure we import src/common_setup.py.
# Strip any earlier copies of the entry before inserting, so re-running this
# cell keeps SRC_DIR at the front of sys.path without piling up duplicates.
_src_dir = str(SRC_DIR)
sys.path[:] = [entry for entry in sys.path if entry != _src_dir]
sys.path.insert(0, _src_dir)

from common_setup import (
    load_arff,
    prepare_df_AQ_DEMO,
    build_preprocessor,
    run_experiment,
)

ADULT_ARFF_PATH = DATA_DIR / "Autism-Adult-Data.arff"
CHILD_ARFF_PATH = DATA_DIR / "Autism-Child-Data.arff"

In [20]:
# LOAD DATA (ARFF)
# Read the adult file first, then the child file, and report the shape of
# each loaded object as a quick sanity check.

adult_raw, child_raw = map(load_arff, (ADULT_ARFF_PATH, CHILD_ARFF_PATH))

print("\nLoaded adult shape:", adult_raw.shape)
print("Loaded child shape:", child_raw.shape)


Loaded adult shape: (704, 21)
Loaded child shape: (292, 21)


In [22]:
# PREP FEATURES (AQ + DEMO)
# prepare_df_AQ_DEMO yields five values per cohort, unpacked below as
# features, labels, and three column collections.
# NOTE(review): num_* / cat_* are presumably the numeric and categorical
# column lists (they are later handed to build_preprocessor) — confirm in
# common_setup.

(
    Xa,
    ya,
    num_adult,
    cat_adult,
    feat_adult,
) = prepare_df_AQ_DEMO(adult_raw, "Adult")

(
    Xc,
    yc,
    num_child,
    cat_child,
    feat_child,
) = prepare_df_AQ_DEMO(child_raw, "Child")



[Adult] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] Dropping leaky column 'result'
[Adult] Using label column: 'Class/ASD'
[Adult] X shape: (704, 19)
[Adult] y counts: [515 189]

[Child] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Child] Dropping leaky column 'result'
[Child] Using label column: 'Class/ASD'
[Child] X shape: (292, 19)
[Child] y counts: [151 141]


In [24]:
# BUILD B1 PIPELINES (Class-weighted Logistic Regression)
# ------------------------------------------------
# Each cohort gets its own preprocessor, its own classifier instance and its
# own pipeline, so no object is shared between the adult and child models.
# Both classifiers use identical settings; class_weight="balanced" is the
# class weighting this notebook sets out to evaluate.

# --- Adult cohort ---
pre_adult = build_preprocessor(num_adult, cat_adult)
b1_adult_clf = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=3000,
)
b1_adult_pipe = Pipeline(
    steps=[
        ("prep", pre_adult),
        ("clf", b1_adult_clf),
    ]
)

# --- Child cohort ---
pre_child = build_preprocessor(num_child, cat_child)
b1_child_clf = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=3000,
)
b1_child_pipe = Pipeline(
    steps=[
        ("prep", pre_child),
        ("clf", b1_child_clf),
    ]
)


In [26]:
# RUN 5-FOLD CV
# Both cohorts are evaluated with the same settings (5 splits, seed 42).
# Each run_experiment call returns a pair, unpacked as (folds, summary).

adult_folds, adult_summary = run_experiment(
    Xa,
    ya,
    b1_adult_pipe,
    n_splits=5,
    random_state=42,
    exp_name="B1 Adult (Class-weighted LR AQ+Demo)",
)

child_folds, child_summary = run_experiment(
    Xc,
    yc,
    b1_child_pipe,
    n_splits=5,
    random_state=42,
    exp_name="B1 Child (Class-weighted LR AQ+Demo)",
)

[B1 Adult (Class-weighted LR AQ+Demo)] Fold 1: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[B1 Adult (Class-weighted LR AQ+Demo)] Fold 2: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[B1 Adult (Class-weighted LR AQ+Demo)] Fold 3: acc=0.9858, prec=0.9500, rec=1.0000, f1=0.9744, roc_auc=1.0000, pr_auc=1.0000
[B1 Adult (Class-weighted LR AQ+Demo)] Fold 4: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[B1 Adult (Class-weighted LR AQ+Demo)] Fold 5: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000

[B1 Adult (Class-weighted LR AQ+Demo)] 5-fold mean metrics:
  accuracy: 0.9972
  precision: 0.9900
  recall: 1.0000
  f1: 0.9949
  roc_auc: 1.0000
  pr_auc: 1.0000
[B1 Child (Class-weighted LR AQ+Demo)] Fold 1: acc=1.0000, prec=1.0000, rec=1.0000, f1=1.0000, roc_auc=1.0000, pr_auc=1.0000
[B1 Child (Class-weighted LR AQ+Demo)] Fold 2: acc=1.0000, prec=1.0000, rec=1.0000, f

In [27]:
# SAVE OUTPUTS
# Write the per-fold and summary results of both cohorts to CSV files in
# RESULTS_DIR (index column omitted).

adult_folds.to_csv(RESULTS_DIR / "B1_adult_folds.csv", index=False)
adult_summary.to_csv(RESULTS_DIR / "B1_adult_summary.csv", index=False)

child_folds.to_csv(RESULTS_DIR / "B1_child_folds.csv", index=False)
child_summary.to_csv(RESULTS_DIR / "B1_child_summary.csv", index=False)

# Report the location relative to the project root: printing the absolute
# path would bake a machine-specific directory (including the local user
# name) into the saved notebook output.
print("\nSaved B1 results to:", RESULTS_DIR.relative_to(PROJECT_ROOT).as_posix())



Saved B1 results to: C:\Users\14ush\Desktop\asc-screening-xai-dissertation\results\B1


In [30]:
# SHUFFLED-LABEL SANITY CHECK

def shuffled_label_run(X, y, pipe, exp_name, random_state=42, n_splits=5):
    """Re-run the CV experiment on permuted labels as a sanity check.

    The labels are shuffled while ``X`` is left as-is, which breaks the
    pairing between rows and labels; the scores of this run serve as a
    reference to compare the real run against.

    Parameters
    ----------
    X
        Features, forwarded unchanged to ``run_experiment``.
    y
        Labels to permute. ``shuffle`` returns a shuffled copy, so the
        caller's object is not modified.
    pipe
        Estimator/pipeline forwarded unchanged to ``run_experiment``.
        NOTE(review): the same object is passed through — if
        ``run_experiment`` fits it in place, it is left fitted on shuffled
        labels afterwards; confirm against common_setup.
    exp_name : str
        Base experiment name; ``" (SHUFFLED LABELS)"`` is appended to it.
    random_state : int, default 42
        Seed used for the label permutation and forwarded to
        ``run_experiment``.
    n_splits : int, default 5
        Number of CV splits forwarded to ``run_experiment``.

    Returns
    -------
    The second element of the pair returned by ``run_experiment``
    (the summary).
    """
    y_shuf = shuffle(y, random_state=random_state)

    # A pandas object keeps its original row labels through the shuffle, so
    # any label-based alignment downstream would silently undo the
    # permutation. Re-attaching the original index makes the values permuted
    # by position and by label alike. Non-pandas inputs are left untouched.
    if hasattr(y_shuf, "iloc"):
        y_shuf.index = y.index

    _, summary_df = run_experiment(
        X, y_shuf, pipe,
        exp_name=f"{exp_name} (SHUFFLED LABELS)",
        n_splits=n_splits,
        random_state=random_state
    )
    return summary_df

# Run the shuffled-label check for both cohorts with the same seed as the
# real runs.
adult_shuf_summary = shuffled_label_run(Xa, ya, b1_adult_pipe, "B1 Adult", random_state=42)
child_shuf_summary = shuffled_label_run(Xc, yc, b1_child_pipe, "B1 Child", random_state=42)

adult_shuf_summary.to_csv(RESULTS_DIR / "B1_adult_shuffled_summary.csv", index=False)
child_shuf_summary.to_csv(RESULTS_DIR / "B1_child_shuffled_summary.csv", index=False)

# Report the location relative to the project root: printing the absolute
# path would bake a machine-specific directory (including the local user
# name) into the saved notebook output.
print(
    "\nSaved shuffled-label summaries to:",
    RESULTS_DIR.relative_to(PROJECT_ROOT).as_posix(),
)

# Show the real-label summaries next to the shuffled-label ones so the two
# can be compared directly.
print("\nB1 Adult mean metrics:")
display(adult_summary)

print("\nB1 Child mean metrics:")
display(child_summary)

print("\nB1 Adult shuffled-label mean metrics:")
display(adult_shuf_summary)

print("\nB1 Child shuffled-label mean metrics:")
display(child_shuf_summary)


[B1 Adult (SHUFFLED LABELS)] Fold 1: acc=0.5674, prec=0.2909, rec=0.4211, f1=0.3441, roc_auc=0.5261, pr_auc=0.2854
[B1 Adult (SHUFFLED LABELS)] Fold 2: acc=0.5674, prec=0.3115, rec=0.5000, f1=0.3838, roc_auc=0.5353, pr_auc=0.3441
[B1 Adult (SHUFFLED LABELS)] Fold 3: acc=0.5532, prec=0.2881, rec=0.4474, f1=0.3505, roc_auc=0.5120, pr_auc=0.3031
[B1 Adult (SHUFFLED LABELS)] Fold 4: acc=0.5532, prec=0.2951, rec=0.4737, f1=0.3636, roc_auc=0.5243, pr_auc=0.2809
[B1 Adult (SHUFFLED LABELS)] Fold 5: acc=0.5286, prec=0.2456, rec=0.3784, f1=0.2979, roc_auc=0.4933, pr_auc=0.2807

[B1 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.5539
  precision: 0.2862
  recall: 0.4441
  f1: 0.3480
  roc_auc: 0.5182
  pr_auc: 0.2989
[B1 Child (SHUFFLED LABELS)] Fold 1: acc=0.5085, prec=0.4783, rec=0.3929, f1=0.4314, roc_auc=0.4067, pr_auc=0.4313
[B1 Child (SHUFFLED LABELS)] Fold 2: acc=0.4407, prec=0.4333, rec=0.4483, f1=0.4407, roc_auc=0.4471, pr_auc=0.4659
[B1 Child (SHUFFLED LABELS)] Fold 3: acc

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.997163,0.99,1.0,0.994872,1.0,1.0



B1 Child mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,1.0,1.0,1.0,1.0,1.0,1.0



B1 Adult shuffled-label mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.553941,0.286243,0.444097,0.34799,0.518181,0.298855



B1 Child shuffled-label mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.493279,0.475763,0.496798,0.483657,0.452547,0.459537
