# A1: Logistic Regression (AQ-10 Only)

This notebook implements a baseline logistic regression model using only AQ-10 behavioural questionnaire features for autism screening.

**Purpose:**  
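To establish a simple baseline and illustrate the deterministic nature of AQ-based screening scores under standard linear modelling. If the screening label is itself derived from the ten AQ item scores, a linear model on those items can reproduce it almost exactly, so near-perfect cross-validated metrics here should be read as a property of how the label was constructed rather than as evidence of diagnostic accuracy.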


In [4]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

from scipy.io import arff

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.utils import shuffle

In [23]:
# PATHS
# Every location below is derived from the kernel's current working directory.
# NOTE(review): this presumably expects the notebook to be launched from a
# direct sub-folder of the project root (so that `.parent` is the root) — confirm.
NOTEBOOK_DIR = Path.cwd()             
PROJECT_ROOT = NOTEBOOK_DIR.parent     

DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results" / "A1"
# Side effect: creates results/A1 (and any missing parents) if it does not exist yet.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Input ARFF files read by the "RUN A1" cell.
ADULT_ARFF_PATH = DATA_DIR / "Autism-Adult-Data.arff"
CHILD_ARFF_PATH = DATA_DIR / "Autism-Child-Data.arff"

In [25]:
# LOAD ARFF -> DataFrame

def load_arff_to_df(path: Path) -> pd.DataFrame:
    """Read an ARFF file into a DataFrame with text cells decoded to ``str``.

    Parameters
    ----------
    path : Path
        Location of the ``.arff`` file (a plain string path also works).

    Returns
    -------
    pd.DataFrame
        The ARFF data with ``bytes`` cells in object columns decoded as UTF-8
        and surrounding whitespace removed from the column names.

    Notes
    -----
    Only the file *name* is echoed in the log line. Printing the full path
    would write the local user directory into the saved notebook output.
    """
    path = Path(path)
    data, _meta = arff.loadarff(str(path))  # metadata is not needed here
    df = pd.DataFrame(data)

    def _to_text(value):
        # Leave anything that is not raw bytes untouched.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    # Decode bytes to str for object columns
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].apply(_to_text)

    # Strip whitespace in column names (safe cleanup)
    df.columns = [c.strip() for c in df.columns]

    print(f"\nLoaded {path.name} with shape {df.shape}")
    print("Columns:", df.columns.tolist())
    return df

In [27]:
# Prepare dataframe for A1 (AQ-only)

def prepare_df_for_A1(df: pd.DataFrame, dataset_name: str):
    """Build the AQ-only feature matrix and binary target for experiment A1.

    Works on a copy of ``df``; the caller's frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset. Must contain columns named like ``A1_Score`` (matched
        case-insensitively) and a label column whose name contains
        ``class`` or ``asd``.
    dataset_name : str
        Tag used as a prefix in log lines and error messages.

    Returns
    -------
    X : pd.DataFrame
        The AQ item columns, ordered by item number, as integers.
    y : np.ndarray
        Labels encoded as 0/1 (``NO``/``YES`` or ``0``/``1`` in the source).
    aq_cols : list of str
        Names of the AQ columns in the order used for ``X``.
    target_col : str
        Name of the column that was used as the label.

    Raises
    ------
    ValueError
        If no AQ column is found, no label column is found, or the label
        values are neither YES/NO nor 0/1.
    """
    df = df.copy()

    # Detect AQ columns (works with A1_Score ... A10_Score OR a1_score ... a10_score)
    aq_pattern = re.compile(r"^a(\d+)_score$", re.IGNORECASE)
    aq_cols = []
    for c in df.columns:
        m = aq_pattern.match(c)
        if m:
            aq_cols.append((int(m.group(1)), c))

    if len(aq_cols) == 0:
        raise ValueError(
            f"[{dataset_name}] Could not find any AQ columns matching a*_score. "
            f"Columns: {df.columns.tolist()}"
        )

    # Sort AQ columns by item number (numeric, so A10 comes after A9)
    aq_cols = [name for (idx, name) in sorted(aq_cols, key=lambda t: t[0])]

    print(f"\n[{dataset_name}] Detected AQ columns:", aq_cols)

    # Clean AQ values: ensure numeric int. Unparseable answers are still
    # imputed as 0, but the count is reported instead of hidden.
    for c in aq_cols:
        parsed = pd.to_numeric(df[c].astype(str).str.strip(), errors="coerce")
        n_unparsed = int(parsed.isna().sum())
        if n_unparsed:
            print(
                f"[{dataset_name}] WARNING: {n_unparsed} value(s) in {c!r} "
                f"could not be parsed as numbers and were set to 0"
            )
        df[c] = parsed.fillna(0).astype(int)

    print(f"[{dataset_name}] AQ dtypes after cleaning:")
    print(df[aq_cols].dtypes)

    # Drop leaky derived AQ total if present
    if "result" in df.columns:
        print(f"[{dataset_name}] Dropping leaky column: 'result'")
        df = df.drop(columns=["result"])

    # Detect label/target column (e.g., "Class/ASD")
    label_candidates = [c for c in df.columns if "class" in c.lower() or "asd" in c.lower()]
    print(f"[{dataset_name}] Label candidates:", label_candidates)

    if len(label_candidates) == 0:
        raise ValueError(f"[{dataset_name}] Could not find a label column (no 'class' or 'asd' in names).")

    # Prefer "Class/ASD" if present, otherwise first "class*" etc.
    # sorted() is stable, so ties keep their original column order.
    preferred_order = sorted(
        label_candidates,
        key=lambda c: (
            0 if "class/asd" in c.lower() else
            1 if c.lower().startswith("class") else
            2 if "asd" in c.lower() else
            3
        )
    )
    target_col = preferred_order[0]
    print(f"[{dataset_name}] Using label column:", repr(target_col))

    # Encode y to 0/1
    y_raw = df[target_col].astype(str).str.strip().str.upper()
    observed = set(y_raw.unique())
    if observed <= {"YES", "NO"}:
        y = y_raw.map({"NO": 0, "YES": 1}).values
    elif observed <= {"0", "1"}:
        y = y_raw.astype(int).values
    else:
        print(f"[{dataset_name}] Unrecognised label values:", y_raw.unique())
        raise ValueError(f"[{dataset_name}] Unknown label encoding in column {target_col!r}")

    X = df[aq_cols].copy()

    print(f"[{dataset_name}] X shape: {X.shape}")
    print(f"[{dataset_name}] y value counts:", np.bincount(y))

    return X, y, aq_cols, target_col

In [29]:
# Run CV

def run_A1_experiment(
    X: pd.DataFrame,
    y: np.ndarray,
    dataset_name: str,
    folds: int = 5,
    random_state: int = 42
):
    """
    Cross-validate the A1 (AQ-only) baseline.

    A StandardScaler + LogisticRegression pipeline is refit on every split of
    a shuffled StratifiedKFold. Six scores are recorded per fold: accuracy,
    precision, recall and F1 from the hard predictions, ROC-AUC and average
    precision from the positive-class probability.

    Returns
    -------
    folds_df : pd.DataFrame
        One row per fold (``fold`` plus the six metric columns).
    summary_df : pd.DataFrame
        A single row holding the mean of each metric across folds.
    """
    metric_names = ("accuracy", "precision", "recall", "f1", "roc_auc", "pr_auc")

    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=3000, solver="lbfgs"))
    ])
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

    fold_rows = []
    for fold_idx, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        model.fit(X.iloc[train_idx], y[train_idx])

        X_test, y_test = X.iloc[test_idx], y[test_idx]
        hard_pred = model.predict(X_test)
        pos_proba = model.predict_proba(X_test)[:, 1]

        # Threshold-based scores use the hard predictions ...
        acc = accuracy_score(y_test, hard_pred)
        prec = precision_score(y_test, hard_pred, zero_division=0)
        rec = recall_score(y_test, hard_pred, zero_division=0)
        f1 = f1_score(y_test, hard_pred, zero_division=0)
        # ... ranking-based scores use the positive-class probability.
        roc = roc_auc_score(y_test, pos_proba)
        pr = average_precision_score(y_test, pos_proba)

        fold_rows.append(
            dict(zip(("fold",) + metric_names, (fold_idx, acc, prec, rec, f1, roc, pr)))
        )

        print(
            f"[A1 {dataset_name}] Fold {fold_idx}: "
            f"acc={acc:.4f}, prec={prec:.4f}, rec={rec:.4f}, f1={f1:.4f}, "
            f"roc_auc={roc:.4f}, pr_auc={pr:.4f}"
        )

    # Mean of each metric over the folds, in the same column order as above.
    summary = {
        name: float(np.mean([row[name] for row in fold_rows]))
        for name in metric_names
    }

    print(f"\n[A1 {dataset_name}] {folds}-fold mean metrics:")
    for k, v in summary.items():
        print(f"  {k}: {v:.4f}")

    return pd.DataFrame(fold_rows), pd.DataFrame([summary])


In [43]:
# SHUFFLED-LABEL sanity check

def shuffled_label_check(
    X: pd.DataFrame,
    y: np.ndarray,
    dataset_name: str,
    folds: int = 5,
    random_state: int = 42
):
    """
    Sanity check: cross-validate the same pipeline on permuted labels.

    The labels are shuffled once (seeded by ``random_state``), which breaks
    any link between features and target, and the StandardScaler +
    LogisticRegression pipeline is then evaluated with a shuffled
    StratifiedKFold.

    Returns
    -------
    (mean_acc, mean_roc) : tuple of float
        Mean accuracy and mean ROC-AUC across the folds.

    Note: ROC-AUC near 0.5 is the chance-level signal to look for; accuracy
    can sit well above 0.5 purely because of class imbalance.
    """
    y_permuted = shuffle(y, random_state=random_state)

    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=3000, solver="lbfgs"))
    ])
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

    per_fold = []
    for train_idx, test_idx in splitter.split(X, y_permuted):
        model.fit(X.iloc[train_idx], y_permuted[train_idx])

        X_test, y_test = X.iloc[test_idx], y_permuted[test_idx]
        hard_pred = model.predict(X_test)
        pos_proba = model.predict_proba(X_test)[:, 1]

        per_fold.append(
            (accuracy_score(y_test, hard_pred), roc_auc_score(y_test, pos_proba))
        )

    mean_acc = float(np.mean([acc for acc, _ in per_fold]))
    mean_roc = float(np.mean([roc for _, roc in per_fold]))
    print(f"\n[A1 {dataset_name} – SHUFFLED LABELS] mean accuracy: {mean_acc:.4f}, mean ROC-AUC: {mean_roc:.4f}")
    return mean_acc, mean_roc


In [45]:
# RUN A1: Adult + Child
# Pipeline for each dataset: load ARFF -> build AQ-only X / binary y -> cross-validate.

# Raw frames exactly as read from disk (text columns decoded to str).
adult_raw = load_arff_to_df(ADULT_ARFF_PATH)
child_raw = load_arff_to_df(CHILD_ARFF_PATH)

# X*: AQ item columns; y*: 0/1 labels; aq_*: AQ column names; ycol_*: label column used.
Xa, ya, aq_adult, ycol_adult = prepare_df_for_A1(adult_raw, "Adult")
Xc, yc, aq_child, ycol_child = prepare_df_for_A1(child_raw, "Child")

# *_folds_A1: one row per CV fold; *_summary_A1: single row of fold means.
adult_folds_A1, adult_summary_A1 = run_A1_experiment(Xa, ya, "Adult")
child_folds_A1, child_summary_A1 = run_A1_experiment(Xc, yc, "Child")




Loaded C:\Users\14ush\Desktop\asc-screening-xai-dissertation\data\Autism-Adult-Data.arff with shape (704, 21)
Columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'result', 'age_desc', 'relation', 'Class/ASD']

Loaded C:\Users\14ush\Desktop\asc-screening-xai-dissertation\data\Autism-Child-Data.arff with shape (292, 21)
Columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'result', 'age_desc', 'relation', 'Class/ASD']

[Adult] Detected AQ columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score']
[Adult] AQ dtypes after cleaning:
A1_Score     int32
A2_Score     int32
A3_Score     int32
A4_Sc

In [47]:
# SAVE RESULTS
# Persist the per-fold and mean-metric tables for both datasets as CSV files
# under RESULTS_DIR (no index column is written).

adult_folds_A1.to_csv(RESULTS_DIR / "A1_adult_folds.csv", index=False)
adult_summary_A1.to_csv(RESULTS_DIR / "A1_adult_summary.csv", index=False)

child_folds_A1.to_csv(RESULTS_DIR / "A1_child_folds.csv", index=False)
child_summary_A1.to_csv(RESULTS_DIR / "A1_child_summary.csv", index=False)

# Report the location relative to the project root so the saved notebook
# output does not contain the absolute local path.
print("\nSaved A1 results to:", RESULTS_DIR.relative_to(PROJECT_ROOT))



Saved A1 results to: C:\Users\14ush\Desktop\asc-screening-xai-dissertation\results\A1


In [49]:
#  SHUFFLED-LABEL check
# Re-run the same model on permuted labels and show the result next to the
# real-label means computed in the "RUN A1" cell.

adult_shuf_acc, adult_shuf_roc = shuffled_label_check(Xa, ya, "Adult")
child_shuf_acc, child_shuf_roc = shuffled_label_check(Xc, yc, "Child")

print("\n==== SUMMARY ====")
print("A1 Adult mean metrics:")
# NOTE(review): display() is not imported in this notebook; presumably it is
# the rich-display helper supplied by the IPython kernel — confirm if this
# code is ever run outside Jupyter.
display(adult_summary_A1)

print("\nA1 Child mean metrics:")
display(child_summary_A1)

# Shuffled-label scores, printed as plain text for side-by-side comparison.
print(f"\nShuffled-label (Adult): acc={adult_shuf_acc:.4f}, roc_auc={adult_shuf_roc:.4f}")
print(f"Shuffled-label (Child): acc={child_shuf_acc:.4f}, roc_auc={child_shuf_roc:.4f}")


[A1 Adult – SHUFFLED LABELS] mean accuracy: 0.7315, mean ROC-AUC: 0.5171

[A1 Child – SHUFFLED LABELS] mean accuracy: 0.5204, mean ROC-AUC: 0.4997

==== SUMMARY ====
A1 Adult mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,1.0,1.0,1.0,1.0,1.0,1.0



A1 Child mean metrics:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc,pr_auc
0,1.0,1.0,1.0,1.0,1.0,1.0



Shuffled-label (Adult): acc=0.7315, roc_auc=0.5171
Shuffled-label (Child): acc=0.5204, roc_auc=0.4997
