# 🧠 SenseAI ASD Screening - ML Model Training

This notebook trains ML models for ASD detection using age-stratified cognitive assessment data.

**Age Groups:**
- Age 2-3: Parental Questionnaire
- Age 3.5-5: Frog Jump Game
- Age 5.5-6+: DCCS Game

## How to Use:
1. Upload `SAMPLE_DATASETS/` folder to Google Drive
2. Run all cells in order
3. Download trained models


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn -q


In [None]:
# Import libraries
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

print('‚úÖ Libraries loaded successfully!')


## 📁 Load Datasets

Adjust the `base_path` to match your Google Drive folder location.


In [None]:
# Set your Google Drive path
base_path = '/content/drive/MyDrive/SAMPLE_DATASETS/'


def load_cohort(file_stem, data_dir=base_path):
    """Load the ASD and control CSV files of one assessment.

    Args:
        file_stem: Shared file-name prefix, e.g. ``'age_5_6_dccs'``. The files
            ``<file_stem>_asd.csv`` and ``<file_stem>_control.csv`` are read.
        data_dir: Folder that holds the CSV files. It is joined with ``Path``,
            so it works with or without a trailing slash.

    Returns:
        A tuple ``(asd, control, combined)`` of DataFrames. ``combined`` stacks
        the ASD rows above the control rows and gets a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is
            missing (usually a wrong ``base_path``).
    """
    folder = Path(data_dir)
    asd = pd.read_csv(folder / f'{file_stem}_asd.csv')
    control = pd.read_csv(folder / f'{file_stem}_control.csv')
    return asd, control, pd.concat([asd, control], ignore_index=True)


# Load DCCS (Age 5.5-6+) datasets
df_dccs_asd, df_dccs_control, df_dccs = load_cohort('age_5_6_dccs')

# Load Frog Jump (Age 3.5-5) datasets
df_fj_asd, df_fj_control, df_frog_jump = load_cohort('age_3_5_frog_jump')

# Load Questionnaire (Age 2-3) datasets
df_q_asd, df_q_control, df_questionnaire = load_cohort('age_2_3_questionnaire')

print(f'📊 DCCS (Age 5.5-6+): {len(df_dccs)} samples (ASD: {len(df_dccs_asd)}, Control: {len(df_dccs_control)})')
print(f'🐸 Frog Jump (Age 3.5-5): {len(df_frog_jump)} samples (ASD: {len(df_fj_asd)}, Control: {len(df_fj_control)})')
print(f'📝 Questionnaire (Age 2-3): {len(df_questionnaire)} samples (ASD: {len(df_q_asd)}, Control: {len(df_q_control)})')


## 🎨 Train DCCS Model (Age 5.5-6+)

The DCCS (Dimensional Change Card Sort) model uses perseverative errors, switch cost, and post-switch accuracy as its primary ASD markers.


In [None]:
# Predictor columns for the DCCS (Age 5.5-6+) model.
# The order is kept fixed: it defines the column order seen by the scaler and
# by every model trained on it.
dccs_features = [
    'age_months',
    # accuracy columns
    'pre_switch_accuracy',
    'post_switch_accuracy',
    'mixed_accuracy',
    'overall_accuracy',
    # timing columns (names end in _ms)
    'avg_rt_ms',
    'switch_cost_ms',
    # perseveration / error columns
    'perseverative_errors',
    'perseverative_error_rate',
    'max_consecutive_perseverations',
    'total_rule_switch_errors',
    # remaining behavioural columns
    'longest_streak',
    'attention_level',
    'engagement_level',
]

# Target first, then the feature matrix
y_dccs = df_dccs['asd_label']  # Binary: 0=Control, 1=ASD
X_dccs = df_dccs[dccs_features]

# Hold out 20% of the rows, keeping the class ratio equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X_dccs,
    y_dccs,
    stratify=y_dccs,
    random_state=42,
    test_size=0.2,
)

# Standardise features: the statistics are learned from the training split
# only and then applied unchanged to both splits.
scaler_dccs = StandardScaler().fit(X_train)
X_train_scaled = scaler_dccs.transform(X_train)
X_test_scaled = scaler_dccs.transform(X_test)

print(f'Training samples: {len(X_train)} (ASD: {sum(y_train==1)}, Control: {sum(y_train==0)})')
print(f'Test samples: {len(X_test)} (ASD: {sum(y_test==1)}, Control: {sum(y_test==0)})')


In [None]:
# Train multiple models and compare.
# Every estimator with a random component gets a fixed seed so that
# Restart & Run All reproduces the same numbers.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is deprecated in XGBoost and unused by current
    # releases (it only triggers a warning), so it is no longer passed.
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
    # probability=True runs an internal randomized cross-validation, so the
    # predicted probabilities are only reproducible with a seed.
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

# Cross-validation needs at least two folds and no more folds than there are
# samples in the smallest training class.
MAX_CV_FOLDS = 5
n_folds = min(MAX_CV_FOLDS, int(y_train.value_counts().min()))
use_cv = n_folds >= 2

results = {}         # model name -> accuracy on the held-out test split
cv_results = {}      # model name -> mean cross-validated accuracy (training split)
trained_models = {}  # model name -> estimator fitted on the full training split

print('🚀 Training DCCS Models...\n')
for name, model in models.items():
    if use_cv:
        # cross_val_score fits clones, so `model` itself is untouched here.
        # NOTE: the folds reuse the scaler fitted on the whole training split;
        # wrap scaler + model in a Pipeline for strictly leak-free CV scores.
        fold_scores = cross_val_score(
            model, X_train_scaled, y_train, cv=n_folds, scoring='accuracy'
        )
        cv_results[name] = fold_scores.mean()

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    trained_models[name] = model

    if use_cv:
        print(f'✅ {name}: {accuracy:.2%} (CV: {cv_results[name]:.2%})')
    else:
        print(f'✅ {name}: {accuracy:.2%}')

# Best model.
# Selecting on the test split and then reporting that same test score gives an
# optimistically biased estimate, so the choice is made on the cross-validated
# training score; the test accuracy is reported for information only.
if use_cv:
    best_model_name = max(cv_results, key=cv_results.get)
    print(
        f'\n🏆 Best Model: {best_model_name} '
        f'(CV: {cv_results[best_model_name]:.2%}, test: {results[best_model_name]:.2%})'
    )
else:
    # Too few samples per class for cross-validation: fall back to the test
    # accuracy and say so, because this estimate is optimistic.
    best_model_name = max(results, key=results.get)
    print(f'\n🏆 Best Model: {best_model_name} ({results[best_model_name]:.2%})')
    print('   (selected on the test split - too few samples for cross-validation)')


In [None]:
# Visualize model comparison (test accuracy per model).
fig, ax = plt.subplots(figsize=(12, 6))
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']
model_names = list(results.keys())
accuracies = list(results.values())

bars = ax.bar(model_names, accuracies, color=colors)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
# No emoji in the title: Matplotlib's default font has no glyph for it, so it
# would render as an empty box (plus a "glyph missing" warning).
ax.set_title('DCCS Model Comparison (Age 5.5-6+ ASD Detection)', fontsize=14)

# Zoom in on the 0.5-1.0 range when every model clears 50%; otherwise lower
# the axis so that no bar is cut off.
y_min = min(0.5, max(0.0, min(accuracies) - 0.05))
ax.set_ylim(y_min, 1.05)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Write each accuracy just above its bar.
for bar, acc in zip(bars, accuracies):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        acc + 0.02,
        f'{acc:.1%}',
        ha='center',
        fontsize=11,
        fontweight='bold',
    )

fig.tight_layout()
plt.show()


## 📊 Feature Importance Analysis

Identifying which cognitive markers are most predictive of ASD.


In [None]:
# Feature importance from Random Forest.
# `feature_importances_` follows the column order the model was fitted with,
# which is the order of `dccs_features`.
rf_model = trained_models['Random Forest']
feature_importance = pd.DataFrame({
    'Feature': dccs_features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=True)

# Plot horizontal bar chart (ascending sort puts the strongest feature on top).
fig, ax = plt.subplots(figsize=(12, 8))
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(feature_importance)))
ax.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
# No emoji in the title: Matplotlib's default font has no glyph for it, so it
# would render as an empty box.
ax.set_title('Feature Importance for ASD Detection (DCCS)', fontsize=14)
fig.tight_layout()
plt.show()

# Top 5 features: the last five rows of the ascending table, strongest first.
print('\n🎯 Top 5 Most Important Features:')
top_features = feature_importance.tail(5).iloc[::-1]
for feature, importance in top_features.itertuples(index=False, name=None):
    print(f"  • {feature}: {importance:.4f}")


## 💾 Save Model


In [None]:
# Save the best model and scaler.
# Both artefacts are needed at inference time: new samples must go through
# the same scaler before they are passed to the model.
MODEL_FILE = 'dccs_asd_model.pkl'
SCALER_FILE = 'dccs_scaler.pkl'

best_model = trained_models[best_model_name]
joblib.dump(best_model, MODEL_FILE)
joblib.dump(scaler_dccs, SCALER_FILE)
print(f'✅ Model saved: {MODEL_FILE}')
print(f'✅ Scaler saved: {SCALER_FILE}')

# Download to local (uncomment in Colab)
# from google.colab import files
# files.download('dccs_asd_model.pkl')
# files.download('dccs_scaler.pkl')


## 🔮 Predict New Child


In [None]:
# Example: Predict for a new child
new_child = {
    'age_months': 70,
    'pre_switch_accuracy': 87.5,
    'post_switch_accuracy': 45.0,  # Low - ASD indicator
    'mixed_accuracy': 50.0,
    'overall_accuracy': 58.0,
    'avg_rt_ms': 1400,
    'switch_cost_ms': 480,  # High - ASD indicator
    'perseverative_errors': 6,  # High - ASD indicator
    'perseverative_error_rate': 50.0,
    'max_consecutive_perseverations': 3,
    'total_rule_switch_errors': 9,
    'longest_streak': 5,
    'attention_level': 2,
    'engagement_level': 3
}

# Fail early, with a readable message, if a training feature is missing.
missing_features = [name for name in dccs_features if name not in new_child]
if missing_features:
    raise ValueError(f'new_child is missing required features: {missing_features}')

# Convert to DataFrame using the exact column order the scaler was fitted on,
# instead of relying on the insertion order of the dict above.
new_child_df = pd.DataFrame([new_child], columns=dccs_features)
new_child_scaled = scaler_dccs.transform(new_child_df)

# Predict
prediction = best_model.predict(new_child_scaled)
probability = best_model.predict_proba(new_child_scaled)

# The columns of predict_proba follow `best_model.classes_`; look the labels
# up rather than assuming that column 0 is Control and column 1 is ASD.
class_labels = list(best_model.classes_)
asd_probability = probability[0][class_labels.index(1)]
control_probability = probability[0][class_labels.index(0)]

# NOTE(review): this is a screening signal from a statistical model, not a
# clinical diagnosis - consider rewording the 'Diagnosis' label below.
print('🔮 PREDICTION RESULT')
print('=' * 40)
print(f'Diagnosis: {"🔴 ASD RISK" if prediction[0] == 1 else "🟢 No ASD Concern"}')
print(f'Confidence: {max(probability[0]):.1%}')
print(f'ASD Probability: {asd_probability:.1%}')
print(f'Control Probability: {control_probability:.1%}')
