# A3: Random Forest (AQ-10 + Demographic Features)

This notebook evaluates a random forest classifier trained on AQ-10 and demographic features using stratified cross-validation.

**Purpose:**  
To provide a non-linear baseline and assess whether tree-based models capture interactions not modelled by linear approaches.


In [1]:
from pathlib import Path
import sys

# Paths are resolved relative to the current working directory, so this assumes the
# kernel was started from the notebook's own folder (one level below the project root).
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent

DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results" / "A3"
# Create the output folder up front so later cells can write into it directly.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Make the shared helpers under src/ importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries onto sys.path.
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from common_setup import load_arff, prepare_df_AQ_DEMO, build_preprocessor, run_experiment

# Raw ARFF inputs for the two cohorts.
ADULT_ARFF_PATH = DATA_DIR / "Autism-Adult-Data.arff"
CHILD_ARFF_PATH = DATA_DIR / "Autism-Child-Data.arff"

In [2]:
# Read the raw ARFF tables, adult first and then child.
adult_raw, child_raw = (
    load_arff(str(arff_path)) for arff_path in (ADULT_ARFF_PATH, CHILD_ARFF_PATH)
)

# Each call yields a 5-tuple; the second argument is the cohort tag that prefixes the
# helper's log lines. Presumably (X, y, numeric cols, categorical cols, feature names)
# -- inferred from the variable names; confirm against common_setup.
(Xa, ya, num_a, cat_a, feat_a) = prepare_df_AQ_DEMO(adult_raw, "Adult")
(Xc, yc, num_c, cat_c, feat_c) = prepare_df_AQ_DEMO(child_raw, "Child")



[Adult] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] Dropping leaky column 'result'
[Adult] Using label column: 'Class/ASD'
[Adult] X shape: (704, 19)
[Adult] y counts: [515 189]

[Child] AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Child] Dropping leaky column 'result'
[Child] Using label column: 'Class/ASD'
[Child] X shape: (292, 19)
[Child] y counts: [151 141]


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# One preprocessor per cohort, built from that cohort's numeric / categorical columns
# (intended behaviour per the original note: scale numeric + one-hot encode categorical).
pre_a = build_preprocessor(num_a, cat_a)
pre_c = build_preprocessor(num_c, cat_c)

# A3 model: Random Forest (no class weighting)
rf_params = {
    "n_estimators": 500,
    "random_state": 42,
    "n_jobs": -1,
}

# Each cohort gets its own classifier instance so the two pipelines share no estimator.
a3_adult_pipe = Pipeline(steps=[
    ("prep", pre_a),
    ("clf", RandomForestClassifier(**rf_params)),
])

a3_child_pipe = Pipeline(steps=[
    ("prep", pre_c),
    ("clf", RandomForestClassifier(**rf_params)),
])

print("A3 pipeline ready.")


A3 pipeline ready.


In [6]:
# Cross-validate the A3 pipelines (5 splits, fixed seed) on each cohort.
adult_folds, adult_summary = run_experiment(
    Xa, ya, a3_adult_pipe, exp_name="A3 Adult (RF AQ+Demo)", n_splits=5, random_state=42
)

child_folds, child_summary = run_experiment(
    Xc, yc, a3_child_pipe, exp_name="A3 Child (RF AQ+Demo)", n_splits=5, random_state=42
)

# Persist the real-label results next to the shuffled-label sanity-check files so the
# headline metrics are not only available as notebook output.
adult_folds.to_csv(RESULTS_DIR / "A3_adult_folds.csv", index=False)
adult_summary.to_csv(RESULTS_DIR / "A3_adult_summary.csv", index=False)

child_folds.to_csv(RESULTS_DIR / "A3_child_folds.csv", index=False)
child_summary.to_csv(RESULTS_DIR / "A3_child_summary.csv", index=False)

print("Saved A3 results to:", RESULTS_DIR)

display(adult_summary)
display(child_summary)


[A3 Adult (RF AQ+Demo)] Fold 1: acc=0.9787, prec=0.9730, rec=0.9474, f1=0.9600, roc_auc=0.9974, pr_auc=0.9930
[A3 Adult (RF AQ+Demo)] Fold 2: acc=0.9645, prec=1.0000, rec=0.8684, f1=0.9296, roc_auc=0.9969, pr_auc=0.9924
[A3 Adult (RF AQ+Demo)] Fold 3: acc=0.9291, prec=0.8500, rec=0.8947, f1=0.8718, roc_auc=0.9859, pr_auc=0.9654
[A3 Adult (RF AQ+Demo)] Fold 4: acc=0.9504, prec=0.9697, rec=0.8421, f1=0.9014, roc_auc=0.9940, pr_auc=0.9841
[A3 Adult (RF AQ+Demo)] Fold 5: acc=0.9429, prec=0.9677, rec=0.8108, f1=0.8824, roc_auc=0.9930, pr_auc=0.9825

[A3 Adult (RF AQ+Demo)] 5-fold mean metrics:
  accuracy: 0.9531
  precision: 0.9521
  recall: 0.8727
  f1: 0.9090
  roc_auc: 0.9935
  pr_auc: 0.9835
[A3 Child (RF AQ+Demo)] Fold 1: acc=0.8983, prec=0.8929, rec=0.8929, f1=0.8929, roc_auc=0.9649, pr_auc=0.9659
[A3 Child (RF AQ+Demo)] Fold 2: acc=0.9322, prec=0.9630, rec=0.8966, f1=0.9286, roc_auc=0.9920, pr_auc=0.9916
[A3 Child (RF AQ+Demo)] Fold 3: acc=0.9655, prec=0.9643, rec=0.9643, f1=0.9643, 

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.95311,0.952082,0.872688,0.909027,0.993474,0.983472


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.928171,0.941735,0.907882,0.923989,0.982435,0.98286


In [8]:
import numpy as np
from sklearn.utils import shuffle

def shuffled_label_run(X, y, pipe, exp_name, random_state=42, n_splits=5):
    """Sanity-check a pipeline by cross-validating it against permuted labels.

    The labels are shuffled while the features are left untouched, which breaks
    any genuine feature/label relationship. Scores well above chance on such a
    run would point to leakage or a flaw in the evaluation procedure.

    Parameters
    ----------
    X : features, forwarded unchanged to ``run_experiment``.
    y : labels; permuted with ``sklearn.utils.shuffle`` before evaluation.
    pipe : estimator / pipeline forwarded to ``run_experiment``.
    exp_name : str
        Run label; ``" (SHUFFLED LABELS)"`` is appended to it.
    random_state : int, default 42
        Seeds both the label permutation and ``run_experiment``.
    n_splits : int, default 5
        Number of cross-validation splits forwarded to ``run_experiment``.

    Returns
    -------
    tuple
        ``(folds_df, summary_df)`` exactly as returned by ``run_experiment``.
    """
    import pandas as pd  # local import: only needed for the index re-alignment below

    y_shuf = shuffle(y, random_state=random_state)

    # shuffle() on a pandas object permutes the index labels together with the values.
    # Restore the original index so the permutation is purely positional; otherwise any
    # index-aligning operation downstream could silently undo the shuffle.
    if isinstance(y_shuf, (pd.Series, pd.DataFrame)):
        y_shuf.index = y.index

    folds_df, summary_df = run_experiment(
        X, y_shuf, pipe,
        exp_name=f"{exp_name} (SHUFFLED LABELS)",
        n_splits=n_splits,
        random_state=random_state
    )
    return folds_df, summary_df

# Shuffled-label sanity check for both cohorts.
# NOTE(review): the same a3_*_pipe objects used for the real-label runs are passed here;
# whether run_experiment clones them before fitting is not visible from this notebook -- confirm.
adult_shuf_folds, adult_shuf_summary = shuffled_label_run(
    Xa, ya, a3_adult_pipe, "A3 Adult"
)
child_shuf_folds, child_shuf_summary = shuffled_label_run(
    Xc, yc, a3_child_pipe, "A3 Child"
)

# Save shuffled outputs
for _csv_name, _frame in (
    ("A3_adult_shuffled_folds.csv", adult_shuf_folds),
    ("A3_adult_shuffled_summary.csv", adult_shuf_summary),
    ("A3_child_shuffled_folds.csv", child_shuf_folds),
    ("A3_child_shuffled_summary.csv", child_shuf_summary),
):
    _frame.to_csv(RESULTS_DIR / _csv_name, index=False)

print("Saved shuffled-label results to:", RESULTS_DIR)

display(adult_shuf_summary, child_shuf_summary)

[A3 Adult (SHUFFLED LABELS)] Fold 1: acc=0.6809, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.4879, pr_auc=0.2669
[A3 Adult (SHUFFLED LABELS)] Fold 2: acc=0.6950, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.4479, pr_auc=0.2440
[A3 Adult (SHUFFLED LABELS)] Fold 3: acc=0.7021, prec=0.1667, rec=0.0263, f1=0.0455, roc_auc=0.4808, pr_auc=0.2576
[A3 Adult (SHUFFLED LABELS)] Fold 4: acc=0.6809, prec=0.1111, rec=0.0263, f1=0.0426, roc_auc=0.4512, pr_auc=0.2423
[A3 Adult (SHUFFLED LABELS)] Fold 5: acc=0.7071, prec=0.2500, rec=0.0541, f1=0.0889, roc_auc=0.4974, pr_auc=0.2928

[A3 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.6932
  precision: 0.1056
  recall: 0.0213
  f1: 0.0354
  roc_auc: 0.4730
  pr_auc: 0.2607
[A3 Child (SHUFFLED LABELS)] Fold 1: acc=0.5254, prec=0.5000, rec=0.3929, f1=0.4400, roc_auc=0.4781, pr_auc=0.4937
[A3 Child (SHUFFLED LABELS)] Fold 2: acc=0.4915, prec=0.4815, rec=0.4483, f1=0.4643, roc_auc=0.4983, pr_auc=0.5249
[A3 Child (SHUFFLED LABELS)] Fold 3: acc

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.693202,0.105556,0.021337,0.035379,0.473032,0.260719


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.524079,0.507809,0.468227,0.482745,0.539682,0.548107


In [9]:
import numpy as np
from sklearn.utils import shuffle

def shuffled_label_run(X, y, pipe, exp_name, random_state=42):
    """Cross-validate ``pipe`` on permuted labels and return only the summary frame.

    The labels are shuffled with ``sklearn.utils.shuffle`` (seeded by
    ``random_state``) and the run is delegated to ``run_experiment`` with 5 splits.

    NOTE(review): this redefinition shadows the earlier ``shuffled_label_run`` in this
    notebook, which returns ``(folds_df, summary_df)``; after this cell runs, the name
    returns the summary only.
    """
    permuted_labels = shuffle(y, random_state=random_state)
    run_label = f"{exp_name} (SHUFFLED LABELS)"
    # Per-fold results are unpacked but intentionally not returned.
    _, summary = run_experiment(
        X,
        permuted_labels,
        pipe,
        exp_name=run_label,
        n_splits=5,
        random_state=random_state,
    )
    return summary

# NOTE(review): these calls repeat the shuffled-label runs from the previous cell with the
# same inputs and seed, and overwrite adult_shuf_summary / child_shuf_summary.
adult_shuf_summary = shuffled_label_run(
    Xa, ya, a3_adult_pipe, "A3 Adult"
)
child_shuf_summary = shuffled_label_run(
    Xc, yc, a3_child_pipe, "A3 Child"
)

display(adult_shuf_summary, child_shuf_summary)


[A3 Adult (SHUFFLED LABELS)] Fold 1: acc=0.6809, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.4879, pr_auc=0.2669
[A3 Adult (SHUFFLED LABELS)] Fold 2: acc=0.6950, prec=0.0000, rec=0.0000, f1=0.0000, roc_auc=0.4479, pr_auc=0.2440
[A3 Adult (SHUFFLED LABELS)] Fold 3: acc=0.7021, prec=0.1667, rec=0.0263, f1=0.0455, roc_auc=0.4808, pr_auc=0.2576
[A3 Adult (SHUFFLED LABELS)] Fold 4: acc=0.6809, prec=0.1111, rec=0.0263, f1=0.0426, roc_auc=0.4512, pr_auc=0.2423
[A3 Adult (SHUFFLED LABELS)] Fold 5: acc=0.7071, prec=0.2500, rec=0.0541, f1=0.0889, roc_auc=0.4974, pr_auc=0.2928

[A3 Adult (SHUFFLED LABELS)] 5-fold mean metrics:
  accuracy: 0.6932
  precision: 0.1056
  recall: 0.0213
  f1: 0.0354
  roc_auc: 0.4730
  pr_auc: 0.2607
[A3 Child (SHUFFLED LABELS)] Fold 1: acc=0.5254, prec=0.5000, rec=0.3929, f1=0.4400, roc_auc=0.4781, pr_auc=0.4937
[A3 Child (SHUFFLED LABELS)] Fold 2: acc=0.4915, prec=0.4815, rec=0.4483, f1=0.4643, roc_auc=0.4983, pr_auc=0.5249
[A3 Child (SHUFFLED LABELS)] Fold 3: acc

Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.693202,0.105556,0.021337,0.035379,0.473032,0.260719


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,0.524079,0.507809,0.468227,0.482745,0.539682,0.548107
