# 🚨 IMPORTANT: Execution Order

**⚠️ CRITICAL: Run cells in sequential order!**

This notebook has dependencies between cells. If you get `NameError`, it means you skipped a prerequisite cell.

## Quick Fix for Errors:

If you see: `NameError: name 'X_scaled' is not defined`

**Solution:** Run these cells FIRST (in order):
1. **Cell 13**: Step 5 - Prepare Features for Training
2. **Cell 15**: Step 6 - Train Logistic Regression  
3. **Cell 16**: Step 6 - Train Linear SVM
4. Then run visualization cells

See `NOTEBOOK_EXECUTION_ORDER.md` for complete guide.


In [None]:
# Prerequisite checker
# Re-run this cell at any time to see which required variables exist so far.

def check_prerequisites():
    """Report which variables the later notebook cells rely on are defined.

    Each required name is looked up in the global namespace and printed with
    a found/missing marker next to the cell that is expected to create it.
    When at least one name is missing, a recommended run order is printed.

    Returns:
        bool: True when every required variable exists, otherwise False.
    """
    produced_by = (
        ('df', 'Cell 5 (Load Data)'),
        ('X_scaled', 'Cell 13 (Prepare Features)'),
        ('y', 'Cell 13 (Prepare Features)'),
        ('groups', 'Cell 13 (Prepare Features)'),
        ('scaler', 'Cell 13 (Prepare Features)'),
        ('selected_features', 'Cell 13 (Prepare Features)'),
        ('gkf', 'Cell 15 (Train Models)'),
        ('lr', 'Cell 15 (Train Models)'),
        ('svm', 'Cell 16 (Train Models)'),
        ('lr_scores', 'Cell 15 (Train Models)'),
        ('svm_scores', 'Cell 16 (Train Models)'),
    )
    rule = "=" * 70
    namespace = globals()

    print(rule)
    print("PREREQUISITE CHECK")
    print(rule)

    # Walk the checklist in order, remembering every name that is absent.
    absent = []
    for name, origin in produced_by:
        if name in namespace:
            marker = "‚úÖ"
        else:
            marker = "‚ùå"
            absent.append(name)
        print(f"{marker} {name:20s} - {origin}")

    print(rule)
    if absent:
        print("‚ùå Missing prerequisites. Please run the cells marked with ‚ùå first.")
        print("\nRecommended order:")
        print("  1. Cell 13: Prepare Features (creates X_scaled, y, groups)")
        print("  2. Cell 15: Train Logistic Regression (creates lr, gkf)")
        print("  3. Cell 16: Train Linear SVM (creates svm)")
    else:
        print("‚úÖ All prerequisites met! You can proceed with any cell.")
    print(rule)

    return not absent

# Run the check
check_prerequisites()


# 🧠 ASD Screening ML Model Training
## Optimized for Small Datasets (53-58 Children)

**Dataset Size:** 20-25 ASD + 33 Control = 53-58 total children

**Recommended Models:**
- ✅ **Logistic Regression** (Primary - best for small datasets)
- ✅ **Linear SVM** (Secondary comparison)
- ✅ **Restricted Random Forest** (After expansion)

**Key Features:**
- Child-level cross-validation (prevents data leakage)
- Age normalization
- Trial-level bootstrapping (optional dataset expansion)
- Sensitivity-focused evaluation (screening priority)
- Probability calibration


## Step 1: Setup and Install Libraries


In [None]:
# Install required packages (Google Colab)
# The leading "!" is notebook (IPython/Colab) syntax that runs the line as a
# shell command; "-q" asks pip for quiet output.
!pip install pandas numpy scikit-learn matplotlib seaborn scipy joblib -q

# Printed unconditionally: this line does not verify that pip succeeded.
print("‚úÖ All packages installed!")


In [None]:
# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
import joblib
import warnings
# NOTE(review): with no category given, this ignores every warning for the
# rest of the session. That presumably also hides scikit-learn's convergence
# and undefined-metric warnings raised by later cells - confirm this is intended.
warnings.filterwarnings('ignore')

# Set style
# Seaborn "whitegrid" theme, plus a 12x6 default size for figures that do not
# pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ All libraries imported successfully!")


## Step 2: Load Data


In [None]:
# Upload a CSV through the Google Colab file picker and load it into `df`.
from google.colab import files
uploaded = files.upload()

# `uploaded` is used as a mapping keyed by file name. If nothing was selected
# it is empty, so fail with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select the dataset CSV."
    )

# Load the first uploaded file (adjust if several files were selected).
# The file is read back by name, i.e. this relies on the upload having been
# saved to the current working directory.
csv_filename = next(iter(uploaded))
df = pd.read_csv(csv_filename)

print(f"‚úÖ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"\nFirst few rows:")
df.head()


In [None]:
# Data overview: table size, class balance, age summary and missing values.
_rule = "=" * 60

print(_rule)
print("DATA OVERVIEW")
print(_rule)
print(f"\nTotal rows: {df.shape[0]}")
print(f"Total columns: {df.shape[1]}")

# Class balance (row counts per label; skipped when there is no 'group' column)
print(f"\n{_rule}")
print("TARGET DISTRIBUTION (Group)")
print(_rule)
if 'group' in df:
    print(df['group'].value_counts())
    print(f"\nASD: {(df['group'] == 'asd').sum()} children")
    print(f"Control: {(df['group'] == 'typically_developing').sum()} children")

# Age summary statistics (skipped when there is no 'age_months' column)
print(f"\n{_rule}")
print("AGE DISTRIBUTION")
print(_rule)
if 'age_months' in df:
    print(df['age_months'].describe())

# Columns with at least one missing value, most affected first
print(f"\n{_rule}")
print("MISSING VALUES")
print(_rule)
missing = df.isna().sum()
missing = missing[missing > 0].sort_values(ascending=False)
if missing.empty:
    print("No missing values!")
else:
    print(missing)


## Step 3: Data Preprocessing


In [None]:
# Encode target variable
# Only the exact label 'asd' maps to 1; every other value of `group`
# (including a missing value) is encoded as 0.
df['target'] = (df['group'] == 'asd').astype(int)  # ASD = 1, Control = 0

print("Target encoding:")
print(f"ASD = 1")
print(f"Control = 0")
print(f"\nDistribution: {df['target'].value_counts().to_dict()}")

# Handle missing values (initial pass - more detailed handling in Step 5)
print("\nHandling missing values (initial pass)...")
print("‚ö†Ô∏è  Note: Features with >50% missing will be excluded in Step 5")

# NOTE(review): the medians below are computed over all rows, i.e. before any
# train/test split, so rows that are later held out in cross-validation
# contribute to the fill values.
numeric_cols = df.select_dtypes(include=[np.number]).columns
filled_count = 0
for col in numeric_cols:
    n_missing = df[col].isnull().sum()
    if n_missing == 0:
        continue
    missing_pct = (n_missing / len(df)) * 100
    if missing_pct >= 50:
        continue  # Only fill if <50% missing (others will be excluded)
    median_val = df[col].median()
    if pd.isna(median_val):
        median_val = 0  # Fallback
    # Assign the filled column back. `df[col].fillna(..., inplace=True)`
    # operates on an intermediate object: pandas 2.x warns about it and with
    # Copy-on-Write enabled it leaves `df` unchanged.
    df[col] = df[col].fillna(median_val)
    filled_count += 1
    if filled_count <= 10:  # Show first 10
        print(f"  ‚úÖ Filled {col} ({missing_pct:.1f}% missing) with median: {median_val:.2f}")

if filled_count > 10:
    print(f"  ... and {filled_count - 10} more columns filled")

print(f"\n‚úÖ Initial missing value handling completed ({filled_count} columns filled)")
print(f"   Features with >50% missing will be excluded during feature selection")


## Step 4: Feature Engineering & Age Normalization


In [None]:
# Derived features: each one is added only when all of its input columns exist.
print("üîß Calculating derived features...")

_present = set(df.columns)

# 1. Switch cost = post-switch RT minus pre-switch RT (missing result -> 0)
if {'avg_rt_pre_switch_ms', 'avg_rt_post_switch_correct_ms'} <= _present:
    df['switch_cost_ms'] = (
        df['avg_rt_post_switch_correct_ms'].sub(df['avg_rt_pre_switch_ms']).fillna(0)
    )
    print("   ‚úÖ Added: switch_cost_ms")

# 2. Accuracy drop, as a percentage of pre-switch accuracy (missing result -> 0)
if {'pre_switch_accuracy', 'post_switch_accuracy'} <= _present:
    # A pre-switch accuracy of 0 is swapped for 1 in the denominator only,
    # which avoids dividing by zero.
    _denominator = df['pre_switch_accuracy'].replace(0, 1)
    _relative_drop = (df['pre_switch_accuracy'] - df['post_switch_accuracy']) / _denominator
    df['accuracy_drop_percent'] = (_relative_drop * 100).fillna(0)
    print("   ‚úÖ Added: accuracy_drop_percent")

# 3. Commission error rate = 100 - no-go accuracy (missing result -> 0).
#    'commission_errors' must be present for this step to run, although the
#    formula itself only reads 'nogo_accuracy'.
if {'commission_errors', 'nogo_accuracy'} <= _present:
    df['commission_error_rate_calc'] = (100 - df['nogo_accuracy']).fillna(0)
    print("   ‚úÖ Added: commission_error_rate_calc")

print("\n‚úÖ Derived features calculated!")


In [None]:
# Age normalization using control group norms
#
# For each listed feature that exists in `df`, add a "<feature>_zscore" column:
# the value minus the control-group mean, divided by the control-group std.
# Norms come from controls within +/- 6 months of the row's age when that band
# holds more than one control row, otherwise from all controls. Rows with a
# missing value or age, and norms whose std is zero or NaN, give a z-score of 0.
#
# NOTE(review): the norms use every control row, so a control child's own value
# is part of the norm it is scored against, and rows later held out in
# cross-validation also contribute. Confirm this is acceptable for the study.
print("üîß Performing age normalization...")

control_df = df[df['target'] == 0].copy()
features_to_normalize = [
    'switch_cost_ms', 'perseverative_error_rate_post_switch',
    'commission_error_rate', 'rt_variability',
    'post_switch_accuracy', 'nogo_accuracy',
    'avg_rt_pre_switch_ms', 'avg_rt_post_switch_correct_ms',
    'avg_rt_go_ms', 'accuracy_drop_percent'
]

features_to_normalize = [f for f in features_to_normalize if f in df.columns]

if len(features_to_normalize) > 0 and len(control_df) > 0:
    # Without an age column there is nothing to band on. Previously the age
    # lookup defaulted to 36 but the band filter then raised KeyError; fall
    # back to the overall control norms for every row instead.
    has_age = 'age_months' in df.columns
    if has_age:
        ages = df['age_months'].tolist()
    else:
        ages = [36] * len(df)
        print("   NOTE: 'age_months' column not found - using overall control norms for every row")

    for feature in features_to_normalize:
        control_values = control_df[feature]
        overall_norms = (control_values.mean(), control_values.std())
        # The band depends only on the age, so compute its norms once per
        # distinct age instead of once per row.
        band_norms = {}
        z_scores = []
        for age, value in zip(ages, df[feature].tolist()):
            if pd.isna(value) or pd.isna(age):
                z_scores.append(0)
                continue
            if has_age:
                if age not in band_norms:
                    in_band = control_values[
                        (control_df['age_months'] >= age - 6) &
                        (control_df['age_months'] <= age + 6)
                    ]
                    if len(in_band) > 1:
                        band_norms[age] = (in_band.mean(), in_band.std())
                    else:
                        band_norms[age] = overall_norms
                mean_val, std_val = band_norms[age]
            else:
                mean_val, std_val = overall_norms
            # `std_val > 0` is False for NaN as well as for zero.
            z_scores.append((value - mean_val) / std_val if std_val > 0 else 0)
        df[f'{feature}_zscore'] = z_scores
        print(f"   ‚úÖ Normalized: {feature} ‚Üí {feature}_zscore")
    print(f"\n‚úÖ Age normalization completed!")
else:
    print("‚ö†Ô∏è  Skipping age normalization")


## Step 5: Prepare Features for Training


In [None]:
# Select features for training
#
# Builds the modelling inputs used by every later cell:
#   selected_features - candidate columns present in `df` and >50% complete
#   X / X_scaled      - feature table before / after standardization
#   y                 - the 0/1 `target` column
#   groups            - `child_id` per row (row index when that column is absent)
#   scaler            - the StandardScaler fitted on X
feature_candidates = [
    'age_months',
    'post_switch_accuracy', 'post_switch_accuracy_zscore',
    'total_perseverative_errors', 'perseverative_error_rate_post_switch', 'perseverative_error_rate_post_switch_zscore',
    'switch_cost_ms', 'switch_cost_ms_zscore',
    'avg_rt_pre_switch_ms', 'avg_rt_pre_switch_ms_zscore',
    'avg_rt_post_switch_correct_ms', 'avg_rt_post_switch_correct_ms_zscore',
    'accuracy_drop_percent', 'accuracy_drop_percent_zscore',
    'nogo_accuracy', 'nogo_accuracy_zscore',
    'commission_error_rate', 'commission_error_rate_zscore',
    'rt_variability', 'rt_variability_zscore',
    'go_accuracy', 'avg_rt_go_ms', 'avg_rt_go_ms_zscore',
    'critical_items_failed', 'critical_items_fail_rate',
    'social_responsiveness_score', 'joint_attention_score',
    'attention_level', 'engagement_level', 'frustration_tolerance',
    'accuracy_overall', 'completion_time_sec',
]

# Filter features that exist in dataset
selected_features = [f for f in feature_candidates if f in df.columns]

# Measure how complete each remaining feature is
print(f"\nüîç Analyzing feature completeness...")
feature_completeness = {}
for feat in selected_features:
    complete = df[feat].notna().sum()
    total = len(df)
    pct = (complete / total) * 100
    feature_completeness[feat] = {'complete': complete, 'total': total, 'pct': pct}

# Only keep features with >50% completeness
selected_features = [f for f in selected_features
                     if feature_completeness[f]['pct'] > 50]

print(f"‚úÖ Selected {len(selected_features)} features for training (after filtering by completeness)")
print(f"\nüìä Feature Completeness Summary:")
for feat in selected_features[:10]:  # Show the first 10
    info = feature_completeness[feat]
    print(f"   {feat}: {info['complete']}/{info['total']} ({info['pct']:.1f}%)")
if len(selected_features) > 10:
    print(f"   ... and {len(selected_features) - 10} more features")

# Prepare X, y, and groups
X = df[selected_features].copy()
y = df['target'].copy()
groups = df['child_id'].values if 'child_id' in df.columns else df.index.values

# Fill whatever is still missing in the selected features
print(f"\nüîß Handling remaining missing values...")
for col in X.columns:
    missing_count = X[col].isna().sum()
    if missing_count == 0:
        continue
    # is_numeric_dtype also recognises int32/float32 and nullable numeric
    # columns, which an exact int64/float64 comparison would treat as
    # non-numeric.
    if pd.api.types.is_numeric_dtype(X[col]):
        # Median is more robust than the mean for small samples
        fill_value = X[col].median()
        if pd.isna(fill_value):
            fill_value = 0  # Fallback if all values are NaN
        # Assign back rather than `X[col].fillna(..., inplace=True)`: the
        # in-place form acts on an intermediate object, warns in pandas 2.x
        # and leaves X unchanged when Copy-on-Write is enabled.
        X[col] = X[col].fillna(fill_value)
        print(f"   ‚úÖ {col}: Filled {missing_count} missing with median ({fill_value:.2f})")
    else:
        # Non-numeric column: fill with 0
        X[col] = X[col].fillna(0)
        print(f"   ‚úÖ {col}: Filled {missing_count} missing with 0")

# Standardize features
# NOTE(review): the medians above and this scaler are fitted on all rows, so
# the cross-validation in later cells evaluates on rows that already
# influenced the preprocessing. Fitting both inside each fold (for example
# with a scikit-learn Pipeline) would avoid that - confirm before reporting
# the CV scores.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)

print(f"\n‚úÖ Data prepared:")
print(f"   Features (X): {X_scaled.shape}")
print(f"   Target (y): {y.shape}")
print(f"   Groups: {len(np.unique(groups))} unique children")
print(f"   ASD samples: {y.sum()}")
print(f"   Control samples: {len(y) - y.sum()}")

# Store for later use
print(f"\nüíæ Variables created: X_scaled, y, groups, scaler")
print(f"   These will be used in subsequent cells.")


In [None]:
# Guard: this cell needs X_scaled and y from the feature-preparation cell.
if not all(_name in globals() for _name in ('X_scaled', 'y')):
    raise NameError(
        "‚ùå ERROR: X_scaled and y are not defined!\n"
        "   Please run the previous cells (Steps 1-5) first.\n"
        "   Specifically, run Cell 13 (Step 5: Prepare Features for Training)"
    )

# Child-level cross-validation: at most 5 folds, never more folds than children.
n_splits = min(5, np.unique(groups).size)
gkf = GroupKFold(n_splits=n_splits)

print(f"üìä Using {n_splits}-fold GroupKFold cross-validation")
print("   (Ensures same child never appears in both train and test)\n")

# Metrics collected for every model; recall (sensitivity) matters most here.
scoring = {
    _metric_name: _metric_name
    for _metric_name in ('accuracy', 'recall', 'precision', 'f1', 'roc_auc')
}

# Model 1: Logistic Regression (PRIMARY)
_banner = "=" * 60
print(_banner)
print("MODEL 1: LOGISTIC REGRESSION (PRIMARY)")
print(_banner)

lr = LogisticRegression(
    penalty='l2',
    C=0.5,
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)

lr_scores = cross_validate(
    lr,
    X_scaled,
    y,
    groups=groups,
    cv=gkf,
    scoring=scoring,
    return_train_score=True,
)

# Mean and spread of each held-out metric across the folds
print("\nüìä Cross-Validation Results:")
for metric in ('test_accuracy', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'):
    scores = lr_scores[metric]
    _label = metric[len('test_'):].upper()
    print(f"   {_label}: {scores.mean():.3f} ¬± {scores.std():.3f}")

# Final fit on every row; later cells read this fitted model.
lr.fit(X_scaled, y)
print("\n‚úÖ Logistic Regression trained on full dataset")


In [None]:
# Model 2: Linear SVM (Secondary) - same folds and metrics as the first model
_banner = "=" * 60
print(_banner)
print("MODEL 2: LINEAR SVM (SECONDARY)")
print(_banner)

# probability=True makes predict_proba available on the fitted model
svm = SVC(
    kernel='linear',
    probability=True,
    class_weight='balanced',
    C=0.5,
    random_state=42,
)

svm_scores = cross_validate(
    svm,
    X_scaled,
    y,
    groups=groups,
    cv=gkf,
    scoring=scoring,
    return_train_score=True,
)

# Mean and spread of each held-out metric across the folds
print("\nüìä Cross-Validation Results:")
for metric in ('test_accuracy', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'):
    scores = svm_scores[metric]
    _label = metric[len('test_'):].upper()
    print(f"   {_label}: {scores.mean():.3f} ¬± {scores.std():.3f}")

# Final fit on every row
svm.fit(X_scaled, y)
print("\n‚úÖ Linear SVM trained on full dataset")


## Step 7: Model Comparison & Selection


In [None]:
# Side-by-side table of the mean cross-validated test metrics of both models
_banner = "=" * 60
print(_banner)
print("MODEL COMPARISON")
print(_banner)

# Table column -> key in the cross_validate() result
_metric_columns = {
    'Accuracy': 'test_accuracy',
    'Recall (Sensitivity)': 'test_recall',
    'Precision': 'test_precision',
    'F1-Score': 'test_f1',
    'ROC-AUC': 'test_roc_auc',
}

_summary_rows = []
for _model_label, _cv_result in (('Logistic Regression', lr_scores), ('Linear SVM', svm_scores)):
    _row = {'Model': _model_label}
    for _column, _key in _metric_columns.items():
        _row[_column] = _cv_result[_key].mean()
    _summary_rows.append(_row)

comparison = pd.DataFrame(_summary_rows, columns=['Model', *_metric_columns])

print(f"\n{comparison.to_string(index=False)}")

# The model with the highest mean recall is reported as best
_best_row = comparison['Recall (Sensitivity)'].idxmax()
best_model_name = comparison.at[_best_row, 'Model']
print(f"\n‚≠ê BEST MODEL (by Recall): {best_model_name}")
print("   (Recall/Sensitivity is most important for screening)")


## Step 8: Probability Calibration & Save Model


In [None]:
# Calibrate probabilities (makes risk scores trustworthy), then save and
# download the calibrated model together with the fitted scaler.
print("üîß Calibrating probabilities...")
best_model = lr  # Use LR (usually best for small datasets)

# GroupKFold.split() requires `groups`, but the fit(X_scaled, y) call below
# passes none, so handing `gkf` itself to CalibratedClassifierCV fails with
# "The 'groups' parameter should not be None". Materialise the child-level
# splits here and pass them as an explicit list of (train, test) index pairs.
calibration_splits = list(gkf.split(X_scaled, y, groups))

calibrated_model = CalibratedClassifierCV(
    best_model, method='sigmoid', cv=calibration_splits
)
calibrated_model.fit(X_scaled, y)

print("‚úÖ Probabilities calibrated!\n")

# Save model and scaler
# NOTE(review): only the model and the scaler are saved. The feature list and
# its order (`selected_features`), the imputation medians and the control
# norms behind the *_zscore columns are not, and inference presumably needs
# them as well - confirm.
model_filename = 'asd_screening_model_calibrated.pkl'
scaler_filename = 'feature_scaler.pkl'

joblib.dump(calibrated_model, model_filename)
joblib.dump(scaler, scaler_filename)

print(f"‚úÖ Model saved: {model_filename}")
print(f"‚úÖ Scaler saved: {scaler_filename}\n")

# Download files (`files` is the google.colab module imported when loading data)
files.download(model_filename)
files.download(scaler_filename)
print("‚úÖ Files downloaded!")


## Step 9: Feature Importance Analysis


In [None]:
# Rank features by the absolute size of their logistic-regression coefficient
_coefficients = lr.coef_[0]
feature_importance = (
    pd.DataFrame({'Feature': selected_features, 'Coefficient': _coefficients})
    .assign(Abs_Coefficient=lambda table: table['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

_banner = "=" * 60
print(_banner)
print("TOP 15 MOST IMPORTANT FEATURES")
print(_banner)
print(feature_importance.head(15).to_string(index=False))

# Horizontal bar chart of the same top 15, largest at the top
top_features = feature_importance.head(15)
_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.barh(_positions, top_features['Abs_Coefficient'])
plt.yticks(_positions, top_features['Feature'])
plt.xlabel('Absolute Coefficient (Importance)')
plt.title('Top 15 Most Important Features (Logistic Regression)')
plt.gca().invert_yaxis()  # first row of the table ends up at the top
plt.tight_layout()
plt.show()


## Step 10: Advanced Visualizations & Feature Engineering

This section includes:
- 📊 Model performance visualizations (ROC curves, confusion matrices)
- 🔍 Feature correlation analysis
- 🎯 Advanced feature engineering techniques (used cautiously for small datasets)
- 📈 Data distribution analysis
- 📉 Learning curves and cross-validation analysis
- 🎨 Age-stratified performance analysis

**⚠️ Scientific Framing:**
- All analyses are designed for a **pilot screening system** with limited data (53-58 children)
- Feature engineering is **conservative** to avoid overfitting
- Results emphasize **screening reliability** and **interpretability**, not diagnostic certainty
- This approach is appropriate for **undergraduate/early postgraduate research** level


### 10.1: ROC Curves & Model Performance Comparison


In [None]:
# ============================================================================
# PREREQUISITE CHECK: stop early when a variable from an earlier cell is missing
# ============================================================================
print("üîç Checking prerequisites...")

# Required variable -> cell that creates it
_features_cell = 'Cell 13 (Step 5: Prepare Features for Training)'
_lr_cell = 'Cell 15 (Step 6: Train Models)'
_svm_cell = 'Cell 16 (Step 6: Train Models)'
required_vars = {
    'X_scaled': _features_cell,
    'y': _features_cell,
    'groups': _features_cell,
    'scaler': _features_cell,
    'gkf': _lr_cell,
    'lr': _lr_cell,
    'svm': _svm_cell,
    'lr_scores': _lr_cell,
    'svm_scores': _svm_cell,
}

# Report every name, collecting the ones that are not defined yet
missing_vars = []
for var_name, cell_info in required_vars.items():
    if var_name in globals():
        print(f"   ‚úÖ {var_name} - Found")
        continue
    missing_vars.append((var_name, cell_info))
    print(f"   ‚ùå {var_name} - Missing (create in {cell_info})")

if missing_vars:
    _rule = "=" * 70
    _guidance = (
        "\n" + _rule,
        "‚ùå ERROR: Missing required variables!",
        _rule,
        "\nPlease run these cells FIRST (in order):",
        "\n1. Cell 13: Step 5 - Prepare Features for Training",
        "   ‚Üí Creates: X_scaled, y, groups, scaler",
        "\n2. Cell 15: Step 6 - Train Logistic Regression",
        "   ‚Üí Creates: lr, lr_scores, gkf",
        "\n3. Cell 16: Step 6 - Train Linear SVM",
        "   ‚Üí Creates: svm, svm_scores",
        "\n4. Then come back and run this cell (ROC Curves)",
        _rule,
    )
    for _line in _guidance:
        print(_line)
    _missing_names = [_entry[0] for _entry in missing_vars]
    raise NameError(
        f"Missing variables: {_missing_names}. Please run the prerequisite cells first."
    )

print("\n‚úÖ All prerequisites met! Proceeding with ROC curve generation...\n")

# ============================================================================
# Generate ROC curves for both models
# ============================================================================
from sklearn.metrics import roc_curve, auc

# Get predictions from cross-validation
def get_cv_predictions(model, X, y, groups, cv):
    """Collect out-of-fold predictions for every row.

    For each (train, test) split produced by ``cv.split(X, y, groups)``, a
    copy of ``model`` is fitted on the training rows and used to predict the
    test rows. The passed-in ``model`` is never refitted, so a model already
    trained on the full dataset keeps that fit.

    Args:
        model: Estimator offering ``fit``, ``predict`` and ``predict_proba``.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values; selected positionally with ``.iloc``.
        groups: Group label per row, forwarded to ``cv.split``.
        cv: Splitter object with a ``split(X, y, groups)`` method.

    Returns:
        tuple: ``(y_pred, y_pred_proba)`` - two float arrays of length
        ``len(y)`` holding the predicted label and the second column of
        ``predict_proba`` for each row. Rows that never appear in a test
        split keep the initial value 0.
    """
    from copy import deepcopy

    y_pred_proba = np.zeros(len(y))
    y_pred = np.zeros(len(y))

    for train_idx, test_idx in cv.split(X, y, groups):
        # Fit a per-fold copy: fitting `model` itself would leave the caller's
        # estimator trained on the last fold's training rows only.
        fold_model = deepcopy(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])

        X_test = X.iloc[test_idx]
        y_pred_proba[test_idx] = fold_model.predict_proba(X_test)[:, 1]
        y_pred[test_idx] = fold_model.predict(X_test)

    return y_pred, y_pred_proba

# Out-of-fold predictions for both models, then ROC curves built from them
print("üìä Generating cross-validation predictions...")
print("   This ensures predictions are from held-out test sets (no data leakage)")

# Any failure is reported with the model name and then re-raised unchanged
try:
    lr_pred, lr_pred_proba = get_cv_predictions(lr, X_scaled, y, groups, gkf)
except Exception as e:
    print(f"   ‚ùå Error generating LR predictions: {e}")
    raise
else:
    print("   ‚úÖ Logistic Regression predictions generated")

try:
    svm_pred, svm_pred_proba = get_cv_predictions(svm, X_scaled, y, groups, gkf)
except Exception as e:
    print(f"   ‚ùå Error generating SVM predictions: {e}")
    raise
else:
    print("   ‚úÖ Linear SVM predictions generated")

# ROC points and area under the curve, per model
lr_fpr, lr_tpr, _ = roc_curve(y, lr_pred_proba)
lr_auc = auc(lr_fpr, lr_tpr)

svm_fpr, svm_tpr, _ = roc_curve(y, svm_pred_proba)
svm_auc = auc(svm_fpr, svm_tpr)

# Both curves on one figure, with the chance diagonal for reference
_curves = (
    (lr_fpr, lr_tpr, f'Logistic Regression (AUC = {lr_auc:.3f})'),
    (svm_fpr, svm_tpr, f'Linear SVM (AUC = {svm_auc:.3f})'),
)
plt.figure(figsize=(10, 8))
for _fpr, _tpr, _legend in _curves:
    plt.plot(_fpr, _tpr, label=_legend, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.500)', linewidth=1)
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12)
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12)
plt.title('ROC Curves: Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("‚úÖ ROC Curves generated!")
print(f"   Logistic Regression AUC: {lr_auc:.3f}")
print(f"   Linear SVM AUC: {svm_auc:.3f}")
print("\nüí° Note: Precision-Recall curves (below) are more informative")
print("   for imbalanced datasets like this one (ASD vs Control).")


In [None]:
# Confusion matrices for both models, from the out-of-fold predictions
# (`lr_pred` / `svm_pred`) produced by the ROC cell.
from sklearn.metrics import confusion_matrix

# labels=[0, 1] pins the matrix to 2x2 in Control/ASD order. Without it, a
# class missing from both y and the predictions shrinks the matrix, which
# breaks the 4-way unpacking below and misaligns the tick labels.
lr_cm = confusion_matrix(y, lr_pred, labels=[0, 1])
svm_cm = confusion_matrix(y, svm_pred, labels=[0, 1])

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP); 0 when undefined
lr_tn, lr_fp, lr_fn, lr_tp = lr_cm.ravel()
lr_sensitivity = lr_tp / (lr_tp + lr_fn) if (lr_tp + lr_fn) > 0 else 0
lr_specificity = lr_tn / (lr_tn + lr_fp) if (lr_tn + lr_fp) > 0 else 0

svm_tn, svm_fp, svm_fn, svm_tp = svm_cm.ravel()
svm_sensitivity = svm_tp / (svm_tp + svm_fn) if (svm_tp + svm_fn) > 0 else 0
svm_specificity = svm_tn / (svm_tn + svm_fp) if (svm_tn + svm_fp) > 0 else 0

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One heatmap per model; only the matrix, colour map, title and caption differ
_panels = (
    (axes[0], lr_cm, 'Blues', 'Logistic Regression\nConfusion Matrix',
     lr_sensitivity, lr_specificity),
    (axes[1], svm_cm, 'Greens', 'Linear SVM\nConfusion Matrix',
     svm_sensitivity, svm_specificity),
)
for _ax, _cm, _cmap, _title, _sensitivity, _specificity in _panels:
    sns.heatmap(_cm, annot=True, fmt='d', cmap=_cmap, ax=_ax,
                xticklabels=['Control', 'ASD'], yticklabels=['Control', 'ASD'])
    _ax.set_title(_title, fontsize=12, fontweight='bold')
    _ax.set_ylabel('True Label', fontsize=11)
    _ax.set_xlabel('Predicted Label', fontsize=11)
    # Caption placed below the axes, in axes-relative coordinates
    _ax.text(0.5, -0.15, f'Sensitivity: {_sensitivity:.3f} | Specificity: {_specificity:.3f}',
             transform=_ax.transAxes, ha='center', fontsize=10)

plt.tight_layout()
plt.show()

print("‚úÖ Confusion matrices generated!")

print("‚úÖ Confusion matrices generated!")


In [None]:
# Precision-recall curves from the out-of-fold probabilities of both models
from sklearn.metrics import precision_recall_curve, average_precision_score

# Curve points and average precision, per model
lr_precision, lr_recall, _ = precision_recall_curve(y, lr_pred_proba)
lr_ap = average_precision_score(y, lr_pred_proba)

svm_precision, svm_recall, _ = precision_recall_curve(y, svm_pred_proba)
svm_ap = average_precision_score(y, svm_pred_proba)

# Recall on the x-axis, precision on the y-axis
_pr_curves = (
    (lr_recall, lr_precision, f'Logistic Regression (AP = {lr_ap:.3f})'),
    (svm_recall, svm_precision, f'Linear SVM (AP = {svm_ap:.3f})'),
)
plt.figure(figsize=(10, 8))
for _recall, _precision, _legend in _pr_curves:
    plt.plot(_recall, _precision, label=_legend, linewidth=2)
plt.xlabel('Recall (Sensitivity)', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curves: Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower left', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("‚úÖ Precision-Recall curves generated!")
print(f"   Logistic Regression Average Precision: {lr_ap:.3f}")
print(f"   Linear SVM Average Precision: {svm_ap:.3f}")


### 10.2: Feature Correlation Analysis


In [None]:
# Correlation heatmap restricted to the 15 highest-ranked features
correlation_matrix = X_scaled.corr()

# Keep the importance ranking order, dropping names absent from the matrix
top_features_for_corr = [
    _feature_name
    for _feature_name in feature_importance['Feature'].head(15)
    if _feature_name in correlation_matrix.columns
]

corr_subset = correlation_matrix.loc[top_features_for_corr, top_features_for_corr]

plt.figure(figsize=(14, 12))
sns.heatmap(
    corr_subset,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Matrix (Top 15 Features)', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Reading guide printed under the figure
for _line in (
    "‚úÖ Feature correlation matrix generated!",
    "\nüí° Interpretation:",
    "   - Values close to +1: Strong positive correlation",
    "   - Values close to -1: Strong negative correlation",
    "   - Values close to 0: No correlation",
    "\n‚ö†Ô∏è  Action: If |correlation| > 0.85, consider:",
    "   - Dropping one feature (redundancy)",
    "   - Keeping the more interpretable feature",
    "   - This prevents multicollinearity issues",
):
    print(_line)


### 10.3: Feature Distribution Analysis


In [None]:
# Compare feature distributions between ASD and Control groups for the five
# highest-ranked features (overlaid density histograms of the scaled values).
top_5_features = feature_importance.head(5)['Feature'].tolist()

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

plotted_panels = set()
for idx, feature in enumerate(top_5_features):
    if feature in X_scaled.columns:
        asd_values = X_scaled[y == 1][feature]
        control_values = X_scaled[y == 0][feature]

        axes[idx].hist(control_values, bins=20, alpha=0.6, label='Control', color='blue', density=True)
        axes[idx].hist(asd_values, bins=20, alpha=0.6, label='ASD', color='red', density=True)
        axes[idx].set_title(f'{feature}', fontsize=11, fontweight='bold')
        axes[idx].set_xlabel('Feature Value (Normalized)', fontsize=10)
        axes[idx].set_ylabel('Density', fontsize=10)
        axes[idx].legend(fontsize=9)
        axes[idx].grid(True, alpha=0.3)
        plotted_panels.add(idx)

# Hide every panel that received no histogram. With five features this is
# just the sixth panel; with fewer features it also removes the empty frames
# that would otherwise be drawn.
for idx, ax in enumerate(axes):
    if idx not in plotted_panels:
        ax.axis('off')

plt.suptitle('Feature Distributions: ASD vs Control (Top 5 Features)',
             fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("‚úÖ Feature distribution analysis completed!")


### 10.4: Cross-Validation Fold Analysis


In [None]:
# Visualize cross-validation performance across folds.
# A fresh logistic regression (same hyperparameters as `lr`) is trained on
# each GroupKFold training split and scored on the matching test split.
fold_metrics = {
    'Fold': [],
    'Accuracy': [],
    'Recall': [],
    'Precision': [],
    'F1-Score': [],
    'ROC-AUC': []
}

for fold_num, (train_idx, test_idx) in enumerate(gkf.split(X_scaled, y, groups), start=1):
    X_train_fold, X_test_fold = X_scaled.iloc[train_idx], X_scaled.iloc[test_idx]
    y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

    # Train on fold
    lr_fold = LogisticRegression(penalty='l2', C=0.5, class_weight='balanced', max_iter=2000, random_state=42)
    lr_fold.fit(X_train_fold, y_train_fold)

    # Predict
    y_pred_fold = lr_fold.predict(X_test_fold)
    y_pred_proba_fold = lr_fold.predict_proba(X_test_fold)[:, 1]

    # GroupKFold does not stratify, so a test fold can contain a single class.
    # roc_auc_score raises in that case; record NaN so the remaining folds are
    # still reported. This matches the single-class guard used in the
    # age-stratified analysis.
    if y_test_fold.nunique() > 1:
        fold_auc = roc_auc_score(y_test_fold, y_pred_proba_fold)
    else:
        fold_auc = np.nan

    # Calculate metrics (zero_division=0 keeps the 0 result for folds without
    # positives, stated explicitly instead of relying on a warning)
    fold_metrics['Fold'].append(fold_num)
    fold_metrics['Accuracy'].append(accuracy_score(y_test_fold, y_pred_fold))
    fold_metrics['Recall'].append(recall_score(y_test_fold, y_pred_fold, zero_division=0))
    fold_metrics['Precision'].append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
    fold_metrics['F1-Score'].append(f1_score(y_test_fold, y_pred_fold, zero_division=0))
    fold_metrics['ROC-AUC'].append(fold_auc)

fold_df = pd.DataFrame(fold_metrics)

# Plot fold performance: one bar per fold, dashed line at the mean
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

metrics_to_plot = ['Accuracy', 'Recall', 'Precision', 'F1-Score']
for idx, metric in enumerate(metrics_to_plot):
    axes[idx].bar(fold_df['Fold'], fold_df[metric], color='steelblue', alpha=0.7)
    axes[idx].axhline(y=fold_df[metric].mean(), color='red', linestyle='--',
                      label=f'Mean: {fold_df[metric].mean():.3f}')
    axes[idx].set_title(f'{metric} Across CV Folds', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('Fold Number', fontsize=11)
    axes[idx].set_ylabel(metric, fontsize=11)
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)
    axes[idx].set_ylim([0, 1])

plt.suptitle('Cross-Validation Performance Across Folds', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("‚úÖ Cross-validation fold analysis completed!")
print("\nFold Performance Summary:")
print(fold_df.to_string(index=False))
print(f"\nüí° Interpretation:")
print(f"   - Mean ¬± SD shows model stability across folds")
print(f"   - High variability (large SD) indicates model instability")
print(f"   - Low variability (small SD) indicates robust performance")
print(f"   - This honesty about variability strengthens research credibility")


### 10.5: Age-Stratified Performance Analysis


In [None]:
# Analyze model performance by age group, using the out-of-fold logistic
# regression predictions (`lr_pred`, `lr_pred_proba`) from the ROC cell.
if 'age_months' in df.columns:
    # Predictions are attached by position, i.e. this relies on `df` still
    # having the same rows, in the same order, as when X_scaled was built.
    df_with_pred = df.copy()
    df_with_pred['predicted'] = lr_pred
    df_with_pred['predicted_proba'] = lr_pred_proba

    # Create age groups (bin edges in months; pd.cut intervals are right-closed).
    # Ages outside (0, 100] months fall into no bin and are left out of the
    # per-group metrics below.
    df_with_pred['age_group'] = pd.cut(df_with_pred['age_months'],
                                       bins=[0, 36, 48, 60, 72, 100],
                                       labels=['2-3 years', '3-4 years', '4-5 years', '5-6 years', '6+ years'])

    # Calculate metrics by age group
    age_performance = []
    for age_group in df_with_pred['age_group'].cat.categories:
        age_data = df_with_pred[df_with_pred['age_group'] == age_group]
        if len(age_data) == 0:
            continue

        age_y_true = age_data['target']
        age_y_pred = age_data['predicted']
        age_y_proba = age_data['predicted_proba']

        has_asd = age_y_true.sum() > 0
        has_predicted_asd = age_y_pred.sum() > 0
        has_both_classes = len(np.unique(age_y_true)) > 1

        # A metric that cannot be computed for a group is reported as NaN, not
        # 0: a 0 would read as "worst possible" (for ROC-AUC, a perfectly
        # inverted ranking) when the group simply lacks the cases needed.
        age_performance.append({
            'Age Group': age_group,
            'N': len(age_data),
            'ASD Count': age_y_true.sum(),
            'Control Count': (age_y_true == 0).sum(),
            'Accuracy': accuracy_score(age_y_true, age_y_pred),
            'Recall': recall_score(age_y_true, age_y_pred) if has_asd else np.nan,
            'Precision': precision_score(age_y_true, age_y_pred) if has_predicted_asd else np.nan,
            'F1-Score': f1_score(age_y_true, age_y_pred, zero_division=0) if has_asd else np.nan,
            'ROC-AUC': roc_auc_score(age_y_true, age_y_proba) if has_both_classes else np.nan
        })

    age_perf_df = pd.DataFrame(age_performance)

    # Visualize: one panel per metric, one bar per age group
    # (groups whose metric is NaN show no bar)
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()

    metrics = ['Accuracy', 'Recall', 'Precision', 'F1-Score']
    for idx, metric in enumerate(metrics):
        axes[idx].bar(age_perf_df['Age Group'], age_perf_df[metric], color='coral', alpha=0.7)
        axes[idx].set_title(f'{metric} by Age Group', fontsize=12, fontweight='bold')
        axes[idx].set_xlabel('Age Group', fontsize=11)
        axes[idx].set_ylabel(metric, fontsize=11)
        axes[idx].set_ylim([0, 1])
        axes[idx].grid(True, alpha=0.3)
        axes[idx].tick_params(axis='x', rotation=45)

    plt.suptitle('Model Performance by Age Group', fontsize=14, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()

    print("‚úÖ Age-stratified performance analysis completed!")
    print("\nPerformance by Age Group:")
    print(age_perf_df.to_string(index=False))
    print(f"\nüí° Correct Interpretation:")
    print(f"   - Performance variation by age is EXPECTED and NORMAL")
    print(f"   - Autism is developmental - patterns change with age")
    print(f"   - Don't say 'model fails at age X'")
    print(f"   - Say: 'Performance varies by developmental stage, consistent with ASD literature'")
else:
    print("‚ö†Ô∏è  Age information not available for age-stratified analysis")


### 10.6: Advanced Feature Engineering

**⚠️ Important Note for Small Datasets:**
- Feature engineering is used **cautiously** to avoid overfitting
- Only **domain-driven** interactions are created (not brute-force)
- All enhanced features are **validated** via cross-validation
- If enhanced features don't improve performance, they are **not used**


In [None]:
# Create a small, fixed set of interaction/polynomial features and check via
# cross-validation whether they help.
# Reads from earlier cells: feature_importance, X_scaled, y, groups, gkf, scoring,
# lr_scores, LogisticRegression, cross_validate.
# Leaves behind (read by the summary cell): interaction_count, poly_count.
print("üîß Creating interaction features (CAUTIOUSLY for small dataset)...")
print("   ‚ö†Ô∏è  Only creating 2-3 meaningful interactions to avoid overfitting\n")

# Take the first five rows of feature_importance, keeping only columns present in X_scaled
# NOTE(review): if feature_importance was fitted on the full dataset, choosing
# features from it before cross-validating leaks label information - confirm.
top_features = feature_importance.head(5)['Feature'].tolist()
top_features = [f for f in top_features if f in X_scaled.columns]

X_enhanced = X_scaled.copy()
interactions_created = []
interaction_count = 0

# At most two pairwise products are created (not all pairwise combinations),
# which keeps the feature count small for a small dataset.

# Interaction 1: product of the top 2 features
if len(top_features) >= 2:
    feat1, feat2 = top_features[0], top_features[1]
    # NOTE: no domain/relatedness check is performed; the pair is always created
    interaction_name = f'{feat1}_x_{feat2}'
    X_enhanced[interaction_name] = X_scaled[feat1] * X_scaled[feat2]
    interactions_created.append(interaction_name)
    interaction_count += 1
    print(f"   ‚úÖ Created: {interaction_name}")

# Interaction 2: product of the 1st and 3rd features.
# interaction_count is at most 1 here, so only the length test can fail.
if len(top_features) >= 3 and interaction_count < 2:
    # NOTE: the features' domains are not inspected; the pair is always created
    feat1, feat3 = top_features[0], top_features[2]
    interaction_name = f'{feat1}_x_{feat3}'
    X_enhanced[interaction_name] = X_scaled[feat1] * X_scaled[feat3]
    interactions_created.append(interaction_name)
    interaction_count += 1
    print(f"   ‚úÖ Created: {interaction_name}")

print(f"\n   Total interactions: {interaction_count} (limited to prevent overfitting)")

# Polynomial features: a single squared term, for the top feature only
poly_count = 0
poly_features_created = []

# Only square the top feature (first entry of top_features)
if len(top_features) >= 1:
    top_feat = top_features[0]
    # Always true here: top_features was already filtered to X_scaled columns
    if top_feat in X_scaled.columns:
        poly_name = f'{top_feat}_squared'
        X_enhanced[poly_name] = X_scaled[top_feat] ** 2
        poly_features_created.append(poly_name)
        poly_count += 1
        print(f"   ‚úÖ Created polynomial: {poly_name}")

print(f"\n‚úÖ Enhanced feature set: {X_scaled.shape[1]} ‚Üí {X_enhanced.shape[1]} features")
print(f"   (Added {interaction_count} interactions + {poly_count} polynomial features)")
print(f"   ‚ö†Ô∏è  Conservative approach for small dataset (53-58 children)")

# Cross-validate a fresh logistic regression on the enhanced matrix, using the
# same splitter (gkf), groups and scoring as the baseline scores in lr_scores
print("\nüìä Testing enhanced features...")
lr_enhanced = LogisticRegression(penalty='l2', C=0.5, class_weight='balanced', max_iter=2000, random_state=42)

enhanced_scores = cross_validate(
    lr_enhanced, X_enhanced, y, groups=groups,
    cv=gkf, scoring=scoring, return_train_score=True
)

print(f"\nOriginal Features Performance:")
print(f"   Accuracy: {lr_scores['test_accuracy'].mean():.3f} ¬± {lr_scores['test_accuracy'].std():.3f}")
print(f"   Recall: {lr_scores['test_recall'].mean():.3f} ¬± {lr_scores['test_recall'].std():.3f}")
print(f"   ROC-AUC: {lr_scores['test_roc_auc'].mean():.3f} ¬± {lr_scores['test_roc_auc'].std():.3f}")

print(f"\nEnhanced Features Performance:")
print(f"   Accuracy: {enhanced_scores['test_accuracy'].mean():.3f} ¬± {enhanced_scores['test_accuracy'].std():.3f}")
print(f"   Recall: {enhanced_scores['test_recall'].mean():.3f} ¬± {enhanced_scores['test_recall'].std():.3f}")
print(f"   ROC-AUC: {enhanced_scores['test_roc_auc'].mean():.3f} ¬± {enhanced_scores['test_roc_auc'].std():.3f}")

# Differences of mean test scores: enhanced minus original
improvement = enhanced_scores['test_recall'].mean() - lr_scores['test_recall'].mean()
improvement_auc = enhanced_scores['test_roc_auc'].mean() - lr_scores['test_roc_auc'].mean()

print(f"\nüìä Performance Comparison:")
print(f"   Recall improvement: {improvement:+.3f}")
print(f"   ROC-AUC improvement: {improvement_auc:+.3f}")

# Decision rule (based on recall only; improvement_auc is reported but not used):
# recommend the enhanced set when mean test recall rises by more than 0.02 AND
# the mean train-test recall gap stays below 0.15.
train_test_gap = enhanced_scores['train_recall'].mean() - enhanced_scores['test_recall'].mean()

if improvement > 0.02 and train_test_gap < 0.15:  # Meaningful improvement + small train/test gap
    print(f"\n‚úÖ Enhanced features show meaningful improvement and are stable")
    print(f"   Recommendation: Use enhanced features")
elif improvement > 0:
    print(f"\n‚ö†Ô∏è  Enhanced features show slight improvement but may risk overfitting")
    print(f"   Recommendation: Use original features (more conservative)")
else:
    print(f"\n‚ö†Ô∏è  Enhanced features did not improve performance")
    print(f"   Recommendation: Use original features (avoid overfitting)")
    print(f"   This is EXPECTED and GOOD - shows model is not overfitting")


### 10.7: Model Calibration Analysis

**Purpose:** Calibration improves probability reliability for clinical interpretation.

**Note:** Calibration is applied to the **final selected model only** (Logistic Regression), not all models.


In [None]:
# Calibration plot (reliability diagram)
# Reads from other cells: calibrated_model, X_scaled, y, lr_pred_proba.
from sklearn.calibration import calibration_curve

# Get calibrated predictions
# NOTE(review): these are predictions on X_scaled itself. If calibrated_model was
# fitted on X_scaled they are in-sample, whereas lr_pred_proba below holds
# out-of-fold probabilities - the two curves would then not be directly
# comparable. Confirm how calibrated_model was trained.
calibrated_pred_proba = calibrated_model.predict_proba(X_scaled)[:, 1]

# Calibration curves: 10 equal-width probability bins for each set of probabilities
fraction_of_positives_uncal, mean_predicted_value_uncal = calibration_curve(
    y, lr_pred_proba, n_bins=10, strategy='uniform'
)
fraction_of_positives_cal, mean_predicted_value_cal = calibration_curve(
    y, calibrated_pred_proba, n_bins=10, strategy='uniform'
)

# Plot calibration curves
# NOTE(review): the legend says 'Platt Scaling'; presumably calibrated_model uses
# sigmoid calibration - verify against the cell that builds it.
plt.figure(figsize=(10, 8))
plt.plot(mean_predicted_value_uncal, fraction_of_positives_uncal, 
         's-', label='Uncalibrated (Logistic Regression)', linewidth=2, markersize=8)
plt.plot(mean_predicted_value_cal, fraction_of_positives_cal, 
         'o-', label='Calibrated (Platt Scaling)', linewidth=2, markersize=8)
# Diagonal reference line: predicted probability equals observed frequency
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly Calibrated', linewidth=1)
plt.xlabel('Mean Predicted Probability', fontsize=12)
plt.ylabel('Fraction of Positives', fontsize=12)
plt.title('Calibration Plot: Model Reliability', fontsize=14, fontweight='bold')
plt.legend(loc='upper left', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("‚úÖ Calibration analysis completed!")
print("\nüí° Interpretation:")
print("   - Points closer to diagonal = better calibrated")
print("   - Calibrated model provides more reliable probability estimates")
print("   - Calibration does NOT improve accuracy, only probability reliability")
print("\n‚ö†Ô∏è  Important: Calibrated probabilities are for screening risk assessment,")
print("   NOT for diagnostic certainty. This is a screening tool, not a diagnostic tool.")


### 10.8: Comprehensive Model Comparison Dashboard

**⚠️ Important Disclaimer:**
- All results shown are from **cross-validation** (internal validation)
- These results support **internal validity** but do not imply diagnostic certainty
- This is a **screening tool**, not a diagnostic tool
- Results should be validated on **independent clinical data** before deployment


In [None]:
# Comprehensive comparison dashboard: six panels on a 3x3 grid.
# Reads results produced by earlier cells (ROC/PR curves, CV scores,
# feature_importance, lr_cm, fold_df).
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Panel 1 - ROC curves (top row, first two columns)
ax1 = fig.add_subplot(gs[0, :2])
for _fpr, _tpr, _legend in (
    (lr_fpr, lr_tpr, f'Logistic Regression (AUC = {lr_auc:.3f})'),
    (svm_fpr, svm_tpr, f'Linear SVM (AUC = {svm_auc:.3f})'),
):
    ax1.plot(_fpr, _tpr, label=_legend, linewidth=2)
ax1.plot([0, 1], [0, 1], 'k--', linewidth=1)
ax1.set_xlabel('False Positive Rate', fontsize=11)
ax1.set_ylabel('True Positive Rate', fontsize=11)
ax1.set_title('ROC Curves', fontsize=12, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)

# Panel 2 - mean CV test metrics, side by side (top right)
ax2 = fig.add_subplot(gs[0, 2])
_cv_keys = ['test_accuracy', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc']
metrics_comparison = pd.DataFrame({
    'Metric': ['Accuracy', 'Recall', 'Precision', 'F1', 'ROC-AUC'],
    'LR': [lr_scores[_key].mean() for _key in _cv_keys],
    'SVM': [svm_scores[_key].mean() for _key in _cv_keys],
})
x = np.arange(len(metrics_comparison['Metric']))
width = 0.35
for _side, _column, _legend in (
    (-1, 'LR', 'Logistic Regression'),
    (1, 'SVM', 'Linear SVM'),
):
    ax2.bar(x + _side * (width/2), metrics_comparison[_column], width, label=_legend, alpha=0.7)
ax2.set_xlabel('Metric', fontsize=11)
ax2.set_ylabel('Score', fontsize=11)
ax2.set_title('Metrics Comparison', fontsize=12, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(metrics_comparison['Metric'], rotation=45, ha='right')
ax2.set_ylim([0, 1])
ax2.legend(fontsize=9)
ax2.grid(True, alpha=0.3, axis='y')

# Panel 3 - first ten rows of feature_importance as horizontal bars (middle left)
ax3 = fig.add_subplot(gs[1, 0])
top_10 = feature_importance.head(10)
_bar_rows = range(len(top_10))
ax3.barh(_bar_rows, top_10['Abs_Coefficient'], color='steelblue', alpha=0.7)
ax3.set_yticks(_bar_rows)
ax3.set_yticklabels(top_10['Feature'], fontsize=9)
ax3.set_xlabel('Importance', fontsize=10)
ax3.set_title('Top 10 Features', fontsize=11, fontweight='bold')
ax3.invert_yaxis()
ax3.grid(True, alpha=0.3, axis='x')

# Panel 4 - logistic regression confusion matrix (middle centre)
ax4 = fig.add_subplot(gs[1, 1])
_class_names = ['Control', 'ASD']
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', ax=ax4,
            xticklabels=_class_names, yticklabels=list(_class_names))
ax4.set_title('LR Confusion Matrix', fontsize=11, fontweight='bold')
ax4.set_ylabel('True', fontsize=10)
ax4.set_xlabel('Predicted', fontsize=10)

# Panel 5 - precision-recall curves (middle right)
ax5 = fig.add_subplot(gs[1, 2])
for _rec, _prec, _legend in (
    (lr_recall, lr_precision, f'LR (AP={lr_ap:.3f})'),
    (svm_recall, svm_precision, f'SVM (AP={svm_ap:.3f})'),
):
    ax5.plot(_rec, _prec, label=_legend, linewidth=2)
ax5.set_xlabel('Recall', fontsize=10)
ax5.set_ylabel('Precision', fontsize=10)
ax5.set_title('Precision-Recall', fontsize=11, fontweight='bold')
ax5.legend(fontsize=9)
ax5.grid(True, alpha=0.3)

# Panel 6 - per-fold scores as four grouped bars per fold (bottom row, full width)
ax6 = fig.add_subplot(gs[2, :])
x_fold = fold_df['Fold']
width_fold = 0.2
for _shift, _column in ((-1.5, 'Accuracy'), (-0.5, 'Recall'), (0.5, 'Precision'), (1.5, 'F1-Score')):
    ax6.bar(x_fold + width_fold*_shift, fold_df[_column], width_fold, label=_column, alpha=0.7)
ax6.set_xlabel('CV Fold', fontsize=11)
ax6.set_ylabel('Score', fontsize=11)
ax6.set_title('Cross-Validation Performance Across Folds', fontsize=12, fontweight='bold')
ax6.set_xticks(x_fold)
ax6.set_ylim([0, 1])
ax6.legend(fontsize=9, ncol=4)
ax6.grid(True, alpha=0.3, axis='y')

plt.suptitle('Comprehensive Model Analysis Dashboard', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

# Closing banner with the screening-tool disclaimer
_rule = "="*70
print("‚úÖ Comprehensive dashboard generated!")
print("\n" + _rule)
print("‚ö†Ô∏è  IMPORTANT DISCLAIMER")
print(_rule)
for _disclaimer_line in (
    "All results shown are from CROSS-VALIDATION (internal validation).",
    "These results support internal validity but do NOT imply diagnostic certainty.",
    "This is a SCREENING tool, not a diagnostic tool.",
    "Results should be validated on independent clinical data before deployment.",
):
    print(_disclaimer_line)
print(_rule)


## ✅ Training Complete - Summary

**🎉 Congratulations! Your ML model training is complete!**

### What You've Accomplished:

1. ✅ **Data Preparation**: Loaded, cleaned, and preprocessed a dataset of 53-58 children
2. ✅ **Feature Engineering**: Created derived features, age normalization, z-scores
3. ✅ **Model Training**: Trained Logistic Regression and Linear SVM with child-level CV
4. ✅ **Model Selection**: Compared models and selected the best one based on sensitivity
5. ✅ **Probability Calibration**: Calibrated the model for reliable risk scores
6. ✅ **Comprehensive Analysis**: Generated 10+ visualizations and analyses
7. ✅ **Model Saved**: Downloaded `asd_screening_model_calibrated.pkl` and `feature_scaler.pkl`

### Your Model Files:

- **Model**: `asd_screening_model_calibrated.pkl` (calibrated for reliable probabilities)
- **Scaler**: `feature_scaler.pkl` (for feature normalization)

### Next Steps:

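1. **Integrate into Backend**: Serve the saved model files to your Node.js backend through a Python inference service (the `.pkl` files are Python pickles and cannot be loaded directly by Node.js)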
2. **Test on New Data**: Validate model on new children (if available)
3. **Continue Data Collection**: More data = better model stability
4. **Monitor Performance**: Track model performance over time

### Important Reminders:

- ⚠️ **This is a SCREENING tool, not a diagnostic tool**
- ⚠️ **Results are from cross-validation (internal validation)**
- ⚠️ **Validate on independent clinical data before deployment**

---

**Your model is ready! 🚀**


## ✅ Training Complete!

**Congratulations!** You have successfully:

1. ✅ Loaded and preprocessed your dataset
2. ✅ Performed age normalization
3. ✅ Trained Logistic Regression and Linear SVM models
4. ✅ Compared model performance
5. ✅ Calibrated probabilities for reliable risk scores
6. ✅ Analyzed feature importance
7. ✅ Generated comprehensive visualizations
8. ✅ Saved the trained model and scaler

**Next Steps:**
- Download the model files (`asd_screening_model_calibrated.pkl` and `feature_scaler.pkl`)
- Integrate the model into your backend for real-time predictions
- Validate on new children (if available)
- Continue collecting data to improve model stability

**Remember:**
- This is a **screening tool**, not a diagnostic tool
- Results are from **cross-validation** (internal validation)
- Model should be validated on **independent clinical data** before deployment

---

**🎓 Your model is ready for use!**


### 10.9: Feature Engineering Summary & Recommendations


In [None]:
# Generate feature engineering summary report.
# Pure reporting cell: reads df, selected_features, y, feature_importance,
# best_model_name, comparison, interaction_count, poly_count and corr_subset
# from earlier cells and prints a text summary.
from itertools import combinations

print("=" * 70)
print("FEATURE ENGINEERING SUMMARY REPORT")
print("=" * 70)

print(f"\nüìä Dataset Overview:")
print(f"   Total samples: {len(df)}")
print(f"   Features used: {len(selected_features)}")
print(f"   ASD samples: {y.sum()}")
print(f"   Control samples: {(y == 0).sum()}")

print(f"\nüéØ Top 5 Most Important Features:")
# Number the rows by their position in the table. The DataFrame index label is
# not a rank: once feature_importance has been sorted it still carries each
# row's original position, which printed misleading ranks such as "13." or "4.".
for rank, (_, row) in enumerate(feature_importance.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['Feature']} (Coefficient: {row['Coefficient']:.4f})")

print(f"\nüìà Model Performance Summary:")
# Look the winning model's row up once instead of once per metric
best_row = comparison.loc[comparison['Model'] == best_model_name]
print(f"   Best Model: {best_model_name}")
print(f"   Accuracy: {best_row['Accuracy'].values[0]:.3f}")
print(f"   Recall (Sensitivity): {best_row['Recall (Sensitivity)'].values[0]:.3f}")
print(f"   ROC-AUC: {best_row['ROC-AUC'].values[0]:.3f}")

print(f"\nüí° Feature Engineering Recommendations:")
print(f"   1. ‚úÖ Age normalization applied: {len([f for f in selected_features if '_zscore' in f])} features")
print(f"   2. ‚úÖ Derived features created: switch_cost, accuracy_drop, commission_rate")
print(f"   3. {'‚úÖ' if interaction_count > 0 else '‚ö†Ô∏è '} Interaction features: {interaction_count} created")
print(f"   4. {'‚úÖ' if poly_count > 0 else '‚ö†Ô∏è '} Polynomial features: {poly_count} created")

print(f"\nüîç Feature Correlation Insights:")
# Every unordered column pair (upper triangle, diagonal excluded) whose absolute
# correlation exceeds 0.7, in the same row-major order as a nested i < j loop.
# NOTE(review): corr_subset must be defined by an earlier cell; the correlation
# cell in section 10.2 names its matrix corr_matrix - confirm which is intended.
corr_columns = list(corr_subset.columns)
high_corr_pairs = [
    (corr_columns[i], corr_columns[j], corr_subset.iloc[i, j])
    for i, j in combinations(range(len(corr_columns)), 2)
    if abs(corr_subset.iloc[i, j]) > 0.7
]

if high_corr_pairs:
    print(f"   Found {len(high_corr_pairs)} highly correlated feature pairs (>0.7):")
    # Only the first five pairs are listed to keep the report short
    for feat1, feat2, corr_val in high_corr_pairs[:5]:
        print(f"      - {feat1} ‚Üî {feat2}: {corr_val:.3f}")
else:
    print(f"   ‚úÖ No highly correlated features found (good for model stability)")

print(f"\nüìã Next Steps:")
print(f"   1. Review feature importance to identify key ASD markers")
print(f"   2. Consider collecting more data to improve model stability")
print(f"   3. Validate model on new children (if available)")
print(f"   4. Monitor model performance over time")

print(f"\n‚ö†Ô∏è  Scientific Limitations & Framing:")
print(f"   - Dataset size: 53-58 children (pilot study range)")
print(f"   - Results are from cross-validation, not independent clinical validation")
print(f"   - This is a SCREENING tool, not a diagnostic tool")
print(f"   - Performance may vary by age group and developmental stage")
print(f"   - Model should be validated on independent data before clinical use")

print(f"\n‚úÖ Strengths of This Approach:")
print(f"   - Extensive visualization demonstrates research maturity")
print(f"   - Child-level cross-validation prevents data leakage")
print(f"   - Age normalization accounts for developmental differences")
print(f"   - Conservative feature engineering avoids overfitting")
print(f"   - Focus on sensitivity (recall) is appropriate for screening")

print("\n" + "=" * 70)


## Step 10: Advanced Visualizations & Feature Engineering

This section includes:
- 📊 Model performance visualizations (ROC curves, confusion matrices)
- 🔍 Feature correlation analysis
- 🎯 Advanced feature engineering techniques
- 📈 Data distribution analysis


### 10.1: ROC Curves & Model Performance Comparison


In [None]:
# Generate ROC curves for both models
from sklearn.metrics import roc_curve, auc

# Get predictions from cross-validation
def get_cv_predictions(model, X, y, groups, cv):
    """Collect out-of-fold predictions for every sample.

    For each (train, test) split produced by ``cv.split(X, y, groups)`` a copy
    of ``model`` is fitted on the training rows and used to predict the test
    rows, so each prediction comes from a model that never saw that row.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            It is never refitted or otherwise modified by this function.
        X: Feature table supporting positional ``.iloc`` indexing.
        y: Target values supporting positional ``.iloc`` indexing.
        groups: Group labels forwarded unchanged to ``cv.split``.
        cv: Splitter exposing ``split(X, y, groups)`` that yields positional
            ``(train_idx, test_idx)`` pairs.

    Returns:
        Tuple ``(y_pred, y_pred_proba)`` of float arrays of length ``len(y)``:
        the predicted labels and the second column of ``predict_proba``.
        Positions that appear in no test fold keep their initial value 0.
    """
    import copy  # local import so the cell does not depend on another cell's imports

    y_pred_proba = np.zeros(len(y))
    y_pred = np.zeros(len(y))

    for train_idx, test_idx in cv.split(X, y, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y.iloc[train_idx]

        # Fit a per-fold copy. Fitting `model` itself would leave the caller's
        # estimator trained on the last fold's subset only, silently changing
        # any coefficients or predictions read from it afterwards.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X_train, y_train)
        y_pred_proba[test_idx] = fold_model.predict_proba(X_test)[:, 1]
        y_pred[test_idx] = fold_model.predict(X_test)

    return y_pred, y_pred_proba

# Out-of-fold predictions for both models (same features, groups and splitter)
lr_pred, lr_pred_proba = get_cv_predictions(lr, X_scaled, y, groups, gkf)
svm_pred, svm_pred_proba = get_cv_predictions(svm, X_scaled, y, groups, gkf)

# ROC points for each model (thresholds are discarded)
lr_fpr, lr_tpr, _ = roc_curve(y, lr_pred_proba)
svm_fpr, svm_tpr, _ = roc_curve(y, svm_pred_proba)

# Area under each curve
lr_auc, svm_auc = auc(lr_fpr, lr_tpr), auc(svm_fpr, svm_tpr)

# Draw both curves plus the chance diagonal on one figure
plt.figure(figsize=(10, 8))
for _fpr, _tpr, _legend in (
    (lr_fpr, lr_tpr, f'Logistic Regression (AUC = {lr_auc:.3f})'),
    (svm_fpr, svm_tpr, f'Linear SVM (AUC = {svm_auc:.3f})'),
):
    plt.plot(_fpr, _tpr, label=_legend, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.500)', linewidth=1)
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12)
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12)
plt.title('ROC Curves: Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Text summary of the two AUC values
for _summary_line in (
    f"‚úÖ ROC Curves generated!",
    f"   Logistic Regression AUC: {lr_auc:.3f}",
    f"   Linear SVM AUC: {svm_auc:.3f}",
):
    print(_summary_line)


In [None]:
# Confusion matrices for both models, built from the out-of-fold predictions
from sklearn.metrics import confusion_matrix

lr_cm = confusion_matrix(y, lr_pred)
svm_cm = confusion_matrix(y, svm_pred)

# Logistic regression: unpack the 2x2 matrix and derive the two rates
# (a rate is reported as 0 when its denominator is 0)
lr_tn, lr_fp, lr_fn, lr_tp = lr_cm.ravel()
lr_sensitivity = 0
if (lr_tp + lr_fn) > 0:
    lr_sensitivity = lr_tp / (lr_tp + lr_fn)
lr_specificity = 0
if (lr_tn + lr_fp) > 0:
    lr_specificity = lr_tn / (lr_tn + lr_fp)

# Linear SVM: same derivation
svm_tn, svm_fp, svm_fn, svm_tp = svm_cm.ravel()
svm_sensitivity = 0
if (svm_tp + svm_fn) > 0:
    svm_sensitivity = svm_tp / (svm_tp + svm_fn)
svm_specificity = 0
if (svm_tn + svm_fp) > 0:
    svm_specificity = svm_tn / (svm_tn + svm_fp)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One heatmap per model, each with its rates printed underneath
_panels = (
    (axes[0], lr_cm, 'Blues', 'Logistic Regression\nConfusion Matrix',
     f'Sensitivity: {lr_sensitivity:.3f} | Specificity: {lr_specificity:.3f}'),
    (axes[1], svm_cm, 'Greens', 'Linear SVM\nConfusion Matrix',
     f'Sensitivity: {svm_sensitivity:.3f} | Specificity: {svm_specificity:.3f}'),
)
for _ax, _matrix, _palette, _heading, _footer in _panels:
    sns.heatmap(_matrix, annot=True, fmt='d', cmap=_palette, ax=_ax,
                xticklabels=['Control', 'ASD'], yticklabels=['Control', 'ASD'])
    _ax.set_title(_heading, fontsize=12, fontweight='bold')
    _ax.set_ylabel('True Label', fontsize=11)
    _ax.set_xlabel('Predicted Label', fontsize=11)
    _ax.text(0.5, -0.15, _footer,
             transform=_ax.transAxes, ha='center', fontsize=10)

plt.tight_layout()
plt.show()

print("‚úÖ Confusion matrices generated!")


In [None]:
# Precision-recall curves for both models, from the out-of-fold probabilities
from sklearn.metrics import precision_recall_curve, average_precision_score

# Logistic regression: curve points (thresholds discarded) and average precision
lr_precision, lr_recall, _ = precision_recall_curve(y, lr_pred_proba)
lr_ap = average_precision_score(y, lr_pred_proba)

# Linear SVM: same quantities
svm_precision, svm_recall, _ = precision_recall_curve(y, svm_pred_proba)
svm_ap = average_precision_score(y, svm_pred_proba)

# Both curves on one figure, recall on the x-axis
plt.figure(figsize=(10, 8))
for _rec, _prec, _legend in (
    (lr_recall, lr_precision, f'Logistic Regression (AP = {lr_ap:.3f})'),
    (svm_recall, svm_precision, f'Linear SVM (AP = {svm_ap:.3f})'),
):
    plt.plot(_rec, _prec, label=_legend, linewidth=2)
plt.xlabel('Recall (Sensitivity)', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curves: Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower left', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Text summary of the two average-precision values
for _summary_line in (
    f"‚úÖ Precision-Recall curves generated!",
    f"   Logistic Regression Average Precision: {lr_ap:.3f}",
    f"   Linear SVM Average Precision: {svm_ap:.3f}",
):
    print(_summary_line)


### 10.2: Feature Correlation Analysis


In [None]:
# Feature correlation matrix (top features only).
# Reads feature_importance, X_scaled and y from earlier cells.
top_n_features = 20
# Keep only features that are actual columns of X_scaled (the same guard the
# other analysis cells apply); indexing with a missing name raises KeyError.
top_features_list = [
    f for f in feature_importance.head(top_n_features)['Feature'].tolist()
    if f in X_scaled.columns
]
# Fewer than top_n_features may be available; report the number actually used
n_features_analyzed = len(top_features_list)

# Get correlation matrix for top features
corr_data = X_scaled[top_features_list].copy()
# .values attaches the target by position, independent of index labels
corr_data['target'] = y.values
corr_matrix = corr_data.corr()

# Plot correlation heatmap
plt.figure(figsize=(14, 12))
# The matrix is symmetric, so hide the upper triangle (diagonal included)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Mask upper triangle
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
            xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns)
plt.title(f'Feature Correlation Matrix (Top {n_features_analyzed} Features)', 
          fontsize=14, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("‚úÖ Feature correlation matrix generated!")
print(f"   Analyzed {n_features_analyzed} most important features")


In [None]:
# Correlation of every selected feature with the target variable.
# Reads selected_features, X_scaled and y from earlier cells.

# Series.corr pairs values by index label, not by position. Re-index the target
# onto X_scaled's index so rows are paired positionally, the same way the
# correlation-matrix cell attaches the target with y.values. When both indexes
# already match this changes nothing; when they differ it avoids NaN or
# wrongly paired correlations.
target_by_position = pd.Series(y.values, index=X_scaled.index)

target_corr = pd.DataFrame({
    'Feature': selected_features,
    'Correlation_with_Target': [X_scaled[f].corr(target_by_position) for f in selected_features]
}).sort_values('Correlation_with_Target', key=abs, ascending=False)  # strongest first, either sign

print("=" * 60)
print("TOP 20 FEATURES CORRELATED WITH TARGET")
print("=" * 60)
print(target_corr.head(20).to_string(index=False))

# Visualize the 15 strongest correlations: red = negative, green = non-negative
plt.figure(figsize=(10, 8))
top_corr = target_corr.head(15)
colors = ['red' if x < 0 else 'green' for x in top_corr['Correlation_with_Target']]
plt.barh(range(len(top_corr)), top_corr['Correlation_with_Target'], color=colors, alpha=0.7)
plt.yticks(range(len(top_corr)), top_corr['Feature'])
plt.xlabel('Correlation with Target (ASD)', fontsize=12)
plt.title('Top 15 Features: Correlation with ASD Target', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.gca().invert_yaxis()  # strongest correlation at the top
plt.tight_layout()
plt.show()

print("\n‚úÖ Target correlation analysis completed!")


### 10.3: Feature Distribution Analysis (ASD vs Control)


In [None]:
# Compare feature distributions between ASD and Control groups.
# Reads feature_importance, X_scaled and y from earlier cells.
top_5_features = feature_importance.head(5)['Feature'].tolist()
# Filter before plotting so the panels are filled without gaps and the number
# of used panels is known up front
plot_features = [f for f in top_5_features if f in X_scaled.columns]

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for idx, feature in enumerate(plot_features):
    asd_values = X_scaled[y == 1][feature]
    control_values = X_scaled[y == 0][feature]

    # density=True normalises each histogram, so groups of different size are comparable
    axes[idx].hist(control_values, bins=20, alpha=0.6, label='Control', color='blue', density=True)
    axes[idx].hist(asd_values, bins=20, alpha=0.6, label='ASD', color='red', density=True)
    axes[idx].set_title(f'{feature}', fontsize=11, fontweight='bold')
    axes[idx].set_xlabel('Feature Value (Normalized)', fontsize=10)
    axes[idx].set_ylabel('Density', fontsize=10)
    axes[idx].legend(fontsize=9)
    axes[idx].grid(True, alpha=0.3)

# Hide every unused panel. With five plottable features this is just the sixth
# panel; with fewer it also removes the panels that would otherwise stay as
# empty frames.
for unused_ax in axes[len(plot_features):]:
    unused_ax.axis('off')

plt.suptitle('Feature Distributions: ASD vs Control (Top 5 Features)', 
             fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("‚úÖ Feature distribution analysis completed!")


In [None]:
# Side-by-side box plots (Control vs ASD) for the first three rows of feature_importance
top_3_features = feature_importance.head(3)['Feature'].tolist()

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, feature in enumerate(top_3_features):
    # Leave the panel untouched when the feature is not a column of X_scaled
    if feature not in X_scaled.columns:
        continue

    _panel = axes[idx]
    # Class 0 (Control) first, class 1 (ASD) second - matches the tick labels
    data_to_plot = [X_scaled[y == _cls][feature].values for _cls in (0, 1)]
    bp = _panel.boxplot(data_to_plot, labels=['Control', 'ASD'], patch_artist=True)

    # Fill colours: first box (Control) light blue, second box (ASD) light coral
    _control_box, _asd_box = bp['boxes'][0], bp['boxes'][1]
    _control_box.set_facecolor('lightblue')
    _asd_box.set_facecolor('lightcoral')

    _panel.set_title(f'{feature}', fontsize=12, fontweight='bold')
    _panel.set_ylabel('Feature Value (Normalized)', fontsize=10)
    _panel.grid(True, alpha=0.3, axis='y')

plt.suptitle('Box Plots: Top 3 Features (ASD vs Control)', 
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("‚úÖ Box plots generated!")


### 10.4: Advanced Feature Engineering


In [None]:
# Build X_enhanced: X_scaled plus up to two pairwise products and one squared term
print("üîß Creating interaction features...")

# Candidate features: first five rows of feature_importance that exist in X_scaled
top_5_for_interaction = [
    f for f in feature_importance.head(5)['Feature'].tolist()
    if f in X_scaled.columns
]
_n_candidates = len(top_5_for_interaction)

X_enhanced = X_scaled.copy()

# Names of every column added below, in creation order
interactions_created = []

# Product of the first and second candidates
if _n_candidates >= 2:
    feat1 = top_5_for_interaction[0]
    feat2 = top_5_for_interaction[1]
    _new_column = f'{feat1}_x_{feat2}'
    X_enhanced[_new_column] = X_scaled[feat1] * X_scaled[feat2]
    interactions_created.append(_new_column)
    print(f"   ‚úÖ Created: {feat1}_x_{feat2}")

# Product of the first and third candidates
if _n_candidates >= 3:
    feat1 = top_5_for_interaction[0]
    feat3 = top_5_for_interaction[2]
    _new_column = f'{feat1}_x_{feat3}'
    X_enhanced[_new_column] = X_scaled[feat1] * X_scaled[feat3]
    interactions_created.append(_new_column)
    print(f"   ‚úÖ Created: {feat1}_x_{feat3}")

# Squared term for the first candidate
if _n_candidates >= 1:
    top_feat = top_5_for_interaction[0]
    _new_column = f'{top_feat}_squared'
    X_enhanced[_new_column] = X_scaled[top_feat] ** 2
    interactions_created.append(_new_column)
    print(f"   ‚úÖ Created: {top_feat}_squared")

print(f"\n‚úÖ Created {len(interactions_created)} interaction features")
print(f"   Total features now: {X_enhanced.shape[1]} (was {X_scaled.shape[1]})")


In [None]:
# Cross-validate logistic regression on X_enhanced and compare with the baseline lr_scores
print("üß™ Testing enhanced features with interaction terms...")

lr_enhanced = LogisticRegression(
    penalty='l2',
    C=0.5,
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)

# Same splitter, groups and scoring as the baseline run
lr_enhanced_scores = cross_validate(
    lr_enhanced,
    X_enhanced,
    y,
    groups=groups,
    cv=gkf,
    scoring=scoring,
    return_train_score=True,
)

# Print mean and standard deviation of each test metric, enhanced model first
_reported_metrics = ['test_accuracy', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc']
for _heading, _cv_result in (
    (f"\nüìä Enhanced Model (with interactions) Results:", lr_enhanced_scores),
    (f"\nüìä Original Model (without interactions) Results:", lr_scores),
):
    print(_heading)
    for metric in _reported_metrics:
        scores = _cv_result[metric]
        print(f"   {metric.replace('test_', '').upper()}: {scores.mean():.3f} ¬± {scores.std():.3f}")

# Differences of mean test scores: enhanced minus original
recall_improvement = lr_enhanced_scores['test_recall'].mean() - lr_scores['test_recall'].mean()
auc_improvement = lr_enhanced_scores['test_roc_auc'].mean() - lr_scores['test_roc_auc'].mean()

print(f"\nüìà Improvement:")
print(f"   Recall: {recall_improvement:+.3f}")
print(f"   ROC-AUC: {auc_improvement:+.3f}")

# A gain above 0.01 in either metric counts as an improvement
_shows_gain = recall_improvement > 0.01 or auc_improvement > 0.01
print(
    f"\n‚úÖ Enhanced features show improvement! Consider using them."
    if _shows_gain
    else f"\n‚ö†Ô∏è  Enhanced features show minimal improvement. Original features may be sufficient."
)
