# Age 2–3.5 (Questionnaire) — Full Model Training + Testing (Google Colab)

This notebook trains the **Age 2–3.5** autism screening model using:

- **Training**: external questionnaire datasets (prepared CSV)
- **Testing**: your hospital-collected questionnaire data (prepared CSV)

It produces professional outputs:
- ROC curves (CV folds + test)
- Precision–Recall curve
- Calibration curve
- Confusion matrix
- Feature importance (Logistic Regression coefficients)
- Saved model artifacts (`.pkl` + `.json`) ready for your ML engine

## What you must upload to Colab
Upload one of the following:
- `Autism_Screening_Tool_25-26J-273.zip` — the full project ZIP (recommended), **OR**
- `ML_TRAINING.zip` + `SAMPLE_DATASETS.zip` (must include `SAMPLE_DATASETS/prepared/*.csv`)

This notebook expects these prepared files:
- `SAMPLE_DATASETS/prepared/train_age_2_3_5_external.csv`
- `SAMPLE_DATASETS/prepared/test_age_2_3_5_hospital.csv`

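If either file is missing, the notebook runs `ML_TRAINING/prepare_train_test_datasets.py` to generate them. That script reads from `Online Datasets/` and the hospital CSVs, so make sure those folders are included in your upload.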


In [None]:
# Step 1: Install dependencies
# `!pip ...` is notebook shell-escape syntax, so this cell must be run inside a
# notebook kernel (Colab/Jupyter), not as a plain Python script.
# `-q` keeps pip's output short.
!pip install -q pandas numpy scikit-learn matplotlib seaborn scipy joblib

# NOTE(review): this message is printed unconditionally after the pip line;
# it does not itself verify that every package installed successfully.
print('[OK] Packages installed')


In [None]:
# Step 2: Upload and extract your project ZIP
from google.colab import files
import zipfile
from pathlib import Path
import os

print('Upload ONE zip (recommended): your full project zip')
print('OR upload multiple zips: ML_TRAINING.zip + SAMPLE_DATASETS.zip + Online_Datasets.zip')
uploaded = files.upload()

# Unpack every uploaded archive into /content; anything that is not a .zip
# (case-insensitive) is reported and left alone.
for name in uploaded:
    if name.lower().endswith('.zip'):
        with zipfile.ZipFile(name, 'r') as archive:
            archive.extractall('/content')
        print('[OK] Extracted:', name)
    else:
        print('[WARN] Skipping non-zip:', name)

print('[OK] Extraction done')

# Show what landed directly under /content so a wrong zip layout is obvious.
top_level_dirs = [entry.name for entry in Path('/content').iterdir() if entry.is_dir()]
print('Top-level /content folders:', top_level_dirs)


In [None]:
# Step 3: Auto-locate the project root and set working directory
from pathlib import Path
import os

content_root = Path('/content')

# Find the folder that contains ML_TRAINING and SAMPLE_DATASETS.
# rglob() matches any path entry with that name, so keep directories only.
ml_dirs = [p for p in content_root.rglob('ML_TRAINING') if p.is_dir()]
if not ml_dirs:
    raise FileNotFoundError('ML_TRAINING folder not found under /content. Check your zip structure.')


def _root_rank(candidate):
    """Sort key: candidates with a sibling SAMPLE_DATASETS first, then shortest path."""
    has_samples = (candidate.parent / 'SAMPLE_DATASETS').is_dir()
    return (not has_samples, len(str(candidate)))


# When several ML_TRAINING folders exist (e.g. nested copies), prefer the one
# whose parent also holds SAMPLE_DATASETS; fall back to the shortest path.
ml_dir = min(ml_dirs, key=_root_rank)
project_root = ml_dir.parent

print('[OK] ML_TRAINING found at:', ml_dir)
print('[OK] Project root assumed as:', project_root)

# Check expected folders
print('Has SAMPLE_DATASETS:', (project_root / 'SAMPLE_DATASETS').exists())
print('Has Online Datasets:', (project_root / 'Online Datasets').exists())

# Later cells use paths relative to the project root.
os.chdir(project_root)
print('[OK] CWD:', os.getcwd())


In [None]:
# Step 4: Load prepared Train/Test CSVs (or generate them if missing)
import pandas as pd
from pathlib import Path
import subprocess
import sys

train_path = Path('SAMPLE_DATASETS/prepared/train_age_2_3_5_external.csv')
test_path = Path('SAMPLE_DATASETS/prepared/test_age_2_3_5_hospital.csv')

if not (train_path.exists() and test_path.exists()):
    print('[WARN] Prepared CSVs not found. Generating them now...')
    # This script reads from Online Datasets/ and hospital CSVs
    # Make sure you uploaded those folders in your ZIP.
    # sys.executable runs the script with the same interpreter (and installed
    # packages) as this kernel instead of whatever `python` is on PATH.
    subprocess.run([sys.executable, 'ML_TRAINING/prepare_train_test_datasets.py'], check=True)

# Explicit raise instead of `assert`: assertions are skipped under `python -O`.
for required_csv in (train_path, test_path):
    if not required_csv.exists():
        raise FileNotFoundError(f'Missing: {required_csv}')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print('[OK] Loaded training set:', train_df.shape)
print('[OK] Loaded test set:', test_df.shape)

# dropna=False so rows with a missing label show up in the counts.
print('\nTraining class counts (group):')
print(train_df['group'].value_counts(dropna=False))

print('\nTest class counts (group):')
print(test_df['group'].value_counts(dropna=False))

display(train_df.head(3))


In [None]:
# Step 5: Quick data QA + exploratory plots
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 4)

# Model input columns. The order matters: later cells pair this list
# positionally with the fitted coefficients.
FEATURES = [
    'age_months',
    'critical_items_failed',
    'completion_time_sec',
    'social_responsiveness_zscore',
    'joint_attention_zscore',
    'total_score_zscore',
    'low_attention_flag',
    'high_critical_items_flag',
    'low_social_flag'
]
# Label column.
TARGET = 'group'

# Missing-value count per modelled column, for each split.
qa_columns = FEATURES + [TARGET]
print('[QA] Missing values (train):')
print(train_df[qa_columns].isna().sum())

print('\n[QA] Missing values (test):')
print(test_df[qa_columns].isna().sum())

# Class balance: training split on the left, hospital test split on the right.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
balance_panels = (
    (train_df, 'Training class balance'),
    (test_df, 'Test (hospital) class balance'),
)
for panel, (frame, panel_title) in zip(ax, balance_panels):
    sns.countplot(x=TARGET, data=frame, ax=panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

# Per-class density of a few key training features (each class normalised
# separately because common_norm=False).
key_feats = ['critical_items_failed', 'social_responsiveness_zscore', 'total_score_zscore']
fig, axes = plt.subplots(1, len(key_feats), figsize=(14, 3))
for kde_axis, col in zip(axes, key_feats):
    sns.kdeplot(data=train_df, x=col, hue=TARGET, fill=True, common_norm=False, ax=kde_axis)
    kde_axis.set_title(f'Train KDE: {col}')
plt.tight_layout()
plt.show()


In [None]:
# Step 6: Outlier handling (Winsorization on continuous features)
# Clinical note: we cap extreme values instead of deleting children.

import numpy as np

# Columns that get winsorized in this step. These are the first six entries
# of FEATURES; the three `*_flag` columns are intentionally not listed.
CONTINUOUS = [
    'age_months',
    'critical_items_failed',
    'completion_time_sec',
    'social_responsiveness_zscore',
    'joint_attention_zscore',
    'total_score_zscore'
]

def winsorize_df(df, cols, lower_q=0.01, upper_q=0.99):
    """Cap each listed column at its own lower/upper quantile.

    Values below the ``lower_q`` quantile of a column are raised to that
    quantile and values above the ``upper_q`` quantile are lowered to it.
    No rows are removed and the input frame is left untouched.

    Args:
        df: DataFrame holding the columns to cap.
        cols: Iterable of column names to winsorize.
        lower_q: Lower quantile, within [0, 1].
        upper_q: Upper quantile, within [0, 1] and not below ``lower_q``.

    Returns:
        A tuple ``(capped_copy, caps)`` where ``caps`` maps each column name
        to its ``(low_cap, high_cap)`` pair as plain floats.

    Raises:
        ValueError: If the quantiles are outside [0, 1] or ``lower_q`` is
            greater than ``upper_q``.
    """
    # Inverted or out-of-range bounds would otherwise clip silently wrong.
    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError(
            f'Quantiles must satisfy 0 <= lower_q <= upper_q <= 1 '
            f'(got lower_q={lower_q}, upper_q={upper_q})'
        )

    out = df.copy()
    caps = {}
    for c in cols:
        lo = float(out[c].quantile(lower_q))
        hi = float(out[c].quantile(upper_q))
        out[c] = out[c].clip(lo, hi)
        caps[c] = (lo, hi)
    return out, caps

def _boxplot_row(frame, title_prefix):
    """Draw one row of per-class boxplots for the three inspected columns."""
    inspected = ['completion_time_sec','social_responsiveness_zscore','total_score_zscore']
    fig, axes = plt.subplots(1, 3, figsize=(14, 3))
    for panel, column in zip(axes, inspected):
        sns.boxplot(data=frame, x=TARGET, y=column, ax=panel)
        panel.set_title(title_prefix + column)
    plt.tight_layout()
    plt.show()


# Raw training data, before any capping.
_boxplot_row(train_df, 'Before: ')

# Cap the continuous training columns at their 1st / 99th percentiles and
# report the bounds that were applied.
train_w, caps = winsorize_df(train_df, CONTINUOUS, 0.01, 0.99)
print('[OK] Winsorization caps (1% / 99%):')
for column_name, bounds in caps.items():
    print(' -', column_name, bounds)

# Same view on the capped data.
_boxplot_row(train_w, 'After: ')


In [None]:
# Step 7: Bootstrap augmentation to reduce class imbalance (training only)
from sklearn.utils import resample

X_train = train_w[FEATURES].copy()
y_train = train_w[TARGET].astype(int).copy()

counts = y_train.value_counts().to_dict()
print('[INFO] Before augmentation counts:', counts)

if len(counts) < 2:
    raise ValueError(f'Training data must contain at least two classes, found: {counts}')

# Target: balance to the majority count
maj = max(counts, key=counts.get)
n_target = counts[maj]

df_train = X_train.copy()
df_train[TARGET] = y_train

# Walk the classes largest-first. A class already at the target size is kept
# unchanged; a smaller class is replaced by a bootstrap sample of n_target rows.
# Picking "majority" and "minority" with max()/min() breaks when the counts are
# equal: both return the same label and the other class is silently dropped.
parts = []
for label in sorted(counts, key=counts.get, reverse=True):
    df_cls = df_train[df_train[TARGET] == label]
    if counts[label] < n_target:
        df_cls = resample(df_cls, replace=True, n_samples=n_target, random_state=42)
    parts.append(df_cls)

# Shuffle so the classes are interleaved.
train_aug = pd.concat(parts, ignore_index=True).sample(frac=1, random_state=42)

X_aug = train_aug[FEATURES]
y_aug = train_aug[TARGET].astype(int)

print('[OK] After augmentation counts:', y_aug.value_counts().to_dict())

# Plot
plt.figure(figsize=(4,3))
sns.countplot(x=y_aug)
plt.title('Augmented training class balance')
plt.tight_layout(); plt.show()


In [None]:
# Step 8: Cross-validated ROC curves (like your example figure)
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.utils import resample


def _upsample_to_majority(X, y, seed=42):
    """Bootstrap every smaller class of (X, y) up to the largest class size."""
    labels, sizes = np.unique(y, return_counts=True)
    n_target = int(sizes.max())
    picked = []
    for label, size in zip(labels, sizes):
        rows = np.flatnonzero(y == label)
        if size < n_target:
            rows = resample(rows, replace=True, n_samples=n_target, random_state=seed)
        picked.append(rows)
    picked = np.concatenate(picked)
    return X[picked], y[picked]


# Cross-validate on the winsorized but NOT yet augmented rows. Splitting the
# bootstrapped set would place copies of the same row in both the training and
# the validation part of a fold, which leaks information and inflates AUC.
# Balancing is instead applied inside each fold, to the training part only.
X_cv = X_train.values
y_cv = y_train.values

# Stratified K-fold needs at least n_splits rows in every class.
smallest_class = int(np.unique(y_cv, return_counts=True)[1].min())
n_splits = min(5, smallest_class)
if n_splits < 2:
    raise ValueError('Need at least 2 training rows per class for cross-validation.')

cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

plt.figure(figsize=(6,5))
aucs = []
for fold, (tr, va) in enumerate(cv.split(X_cv, y_cv)):
    X_fit, y_fit = _upsample_to_majority(X_cv[tr], y_cv[tr], seed=42)
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear'))
    ])
    pipe.fit(X_fit, y_fit)
    # Validation rows are untouched originals, never resampled copies.
    proba = pipe.predict_proba(X_cv[va])[:, 1]

    fpr, tpr, _ = roc_curve(y_cv[va], proba)
    fold_auc = auc(fpr, tpr)
    aucs.append(fold_auc)
    plt.plot(fpr, tpr, label=f'ROC fold {fold} (AUC = {fold_auc:.2f})')

plt.plot([0, 1], [0, 1], '--', color='brown')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC curves ({n_splits}-fold CV) — Age 2–3.5')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

print('[OK] Mean CV AUC:', float(np.mean(aucs)))


In [None]:
# Step 9: Train final model on augmented training set + evaluate on hospital test set
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_curve, average_precision_score, roc_curve
)
from sklearn.calibration import calibration_curve

# Final pipeline
final_model = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear'))
])

final_model.fit(X_aug, y_aug)

# Test evaluation
# NOTE(review): test features are used as loaded; the winsorization caps
# computed on the training set are not applied here. Confirm this matches how
# the deployed engine preprocesses inputs.
X_test = test_df[FEATURES].copy()
y_test = test_df[TARGET].astype(int).copy()

y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

# Ranking metrics and curves are undefined when the test set has one class.
both_classes = len(set(y_test)) > 1

metrics = {
    'accuracy': float(accuracy_score(y_test, y_pred)),
    'precision': float(precision_score(y_test, y_pred, zero_division=0)),
    'recall': float(recall_score(y_test, y_pred, zero_division=0)),
    'f1': float(f1_score(y_test, y_pred, zero_division=0)),
    'roc_auc': float(roc_auc_score(y_test, y_proba)) if both_classes else None,
    'pr_auc': float(average_precision_score(y_test, y_proba)) if both_classes else None,
}

print('[OK] Test metrics (hospital test set):')
for k,v in metrics.items():
    print(f' - {k}: {v}')

print('\nClassification report:')
print(classification_report(y_test, y_pred, digits=3, zero_division=0))

# Confusion matrix
# labels=[0, 1] forces a 2x2 matrix so it always lines up with the
# Control/ASD tick labels, even if one class is absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Control','ASD'], yticklabels=['Control','ASD'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix (hospital test)')
plt.tight_layout(); plt.show()

if both_classes:
    # ROC curve (test)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label=f"Test ROC AUC = {metrics['roc_auc']:.2f}")
    plt.plot([0,1],[0,1],'--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC (hospital test)')
    plt.legend(loc='lower right')
    plt.tight_layout(); plt.show()

    # Precision-Recall curve (test)
    prec, rec, _ = precision_recall_curve(y_test, y_proba)
    plt.figure(figsize=(5,4))
    plt.plot(rec, prec)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f"Precision-Recall (AP = {metrics['pr_auc']:.2f})")
    plt.tight_layout(); plt.show()

    # Calibration curve (test)
    frac_pos, mean_pred = calibration_curve(y_test, y_proba, n_bins=8)
    plt.figure(figsize=(5,4))
    plt.plot(mean_pred, frac_pos, marker='o')
    plt.plot([0,1],[0,1],'--', color='gray')
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.title('Calibration curve (hospital test)')
    plt.tight_layout(); plt.show()
else:
    # The AUC/AP values are None in this case; formatting them with ':.2f'
    # would raise, so the curve plots are skipped entirely.
    print('[WARN] Hospital test set contains a single class; skipping ROC / PR / calibration plots.')


In [None]:
# Step 10: Feature importance (Logistic Regression coefficients)
import pandas as pd

# The classifier sits behind a StandardScaler in the pipeline, so these
# coefficients are expressed per standardized feature.
lr = final_model.named_steps['lr']
coef = lr.coef_.ravel()

# One row per feature, ordered by coefficient magnitude (largest first).
feat_imp = (
    pd.DataFrame({'feature': FEATURES, 'coef': coef})
    .assign(abs_coef=lambda frame: frame['coef'].abs())
    .sort_values('abs_coef', ascending=False)
)

# Signed bars around a zero reference line: the sign shows the direction.
plt.figure(figsize=(7,4))
sns.barplot(data=feat_imp, y='feature', x='coef')
plt.axvline(0, color='black', linewidth=1)
plt.title('Logistic Regression coefficients (direction of risk)')
plt.tight_layout()
plt.show()

display(feat_imp[['feature','coef']])


In [None]:
# Step 11: Save model artifacts (pkl + json) and download as ZIP
from pathlib import Path
from google.colab import files
import json
import joblib
from datetime import datetime, timezone
import zipfile

out_dir = Path('ML_TRAINING/models')
out_dir.mkdir(parents=True, exist_ok=True)

model_name = 'model_age_2_3_5_questionnaire_colab'

# Fitted pipeline (scaler + classifier) as a single joblib pickle.
model_path = out_dir / f'{model_name}.pkl'
joblib.dump(final_model, model_path)

# Feature list, in the column order the model was trained with.
features_path = out_dir / f'features_{model_name}.json'
features_path.write_text(json.dumps(FEATURES, indent=2), encoding='utf-8')

metadata = {
    'model_name': model_name,
    'age_group': '2-3.5',
    'train_file': str(train_path),
    'test_file': str(test_path),
    'features': FEATURES,
    'algorithm': 'LogisticRegression (StandardScaler + LR pipeline)',
    # datetime.utcnow() is deprecated; use an aware UTC timestamp and keep the
    # same trailing-'Z' text format as before.
    'trained_at': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
    'test_metrics': metrics,
}
metadata_path = out_dir / f'model_metadata_{model_name}.json'
# default=str is a fallback for any value json cannot serialize natively.
metadata_path.write_text(json.dumps(metadata, indent=2, default=str), encoding='utf-8')

print('[OK] Saved:')
print(' -', model_path)
print(' -', features_path)
print(' -', metadata_path)

# Bundle the three artifacts (stored under their relative paths) for download.
zip_name = 'age_2_3_5_trained_model_and_reports.zip'
with zipfile.ZipFile(zip_name, 'w') as z:
    z.write(model_path)
    z.write(features_path)
    z.write(metadata_path)

print('[OK] Created zip:', zip_name)
files.download(zip_name)


In [None]:
# Step 12: Predict for a NEW random child (example like your snippet)
# IMPORTANT: the new child must provide the same FEATURES used by the model.

# Example input: one value per model feature, keyed by feature name.
new_child = dict(
    age_months=30,
    critical_items_failed=5,
    completion_time_sec=360,
    social_responsiveness_zscore=0.2,
    joint_attention_zscore=-0.4,
    total_score_zscore=0.6,
    low_attention_flag=1,
    high_critical_items_flag=1,
    low_social_flag=1,
)

# Single-row frame so the pipeline sees named columns.
new_child_df = pd.DataFrame([new_child])
proba = final_model.predict_proba(new_child_df)[0]
pred = int(final_model.predict(new_child_df)[0])

if pred == 1:
    verdict = 'ASD RISK'
else:
    verdict = 'No ASD concern'

print('PREDICTION RESULT')
print('=' * 40)
# proba[0] is the Control probability and proba[1] the ASD probability;
# "Confidence" is whichever of the two is larger.
report_rows = [
    ('Diagnosis:', verdict),
    ('Confidence:', float(max(proba))),
    ('ASD Probability:', float(proba[1])),
    ('Control Probability:', float(proba[0])),
]
for caption, value in report_rows:
    print(caption, value)


In [None]:
# (Optional) Dataset flow diagram (simple, paper-ready)
import matplotlib.pyplot as plt

# Split sizes. The ASD count is the sum of the `group` column and the
# Control count is the remainder.
n_train, n_test = len(train_df), len(test_df)
train_asd = int(train_df['group'].sum())
test_asd = int(test_df['group'].sum())
train_ctrl = int(n_train - train_asd)
test_ctrl = int(n_test - test_asd)

fig, ax = plt.subplots(figsize=(10, 3))
ax.axis('off')

# (x, y, label) for each box; x/y is the lower-left corner in axes units.
boxes = [
    (0.05, 0.55, f"External training set\ntrain_age_2_3_5_external.csv\nN={n_train} (ASD={train_asd}, Control={train_ctrl})"),
    (0.40, 0.55, "Train LR model\n(StandardScaler + LogisticRegression)\n+ bootstrap balancing"),
    (0.75, 0.55, f"Hospital test set\ntest_age_2_3_5_hospital.csv\nN={n_test} (ASD={test_asd}, Control={test_ctrl})"),
]

for left, bottom, label in boxes:
    outline = plt.Rectangle((left, bottom), 0.22, 0.30, fill=False, linewidth=1.5)
    ax.add_patch(outline)
    # Label centred inside the 0.22 x 0.30 box.
    ax.text(left + 0.11, bottom + 0.15, label, ha='center', va='center', fontsize=9)

# arrows: right edge of one box to the left edge of the next, at mid-height
for tail_x, head_x in ((0.27, 0.40), (0.62, 0.75)):
    ax.annotate('', xy=(head_x, 0.70), xytext=(tail_x, 0.70), arrowprops=dict(arrowstyle='->', lw=2))

ax.text(0.515, 0.42, "Evaluate: ROC / PR / Calibration / Confusion Matrix", ha='center', fontsize=10)

plt.tight_layout()
plt.show()
