In [1]:
# ==============================
# STEP 0: INSTALL & IMPORTS
# ==============================

import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, BatchNormalization
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [2]:
# ==============================
# STEP 1: LOAD DATA
# ==============================

FILE_PATH = "ABIDEII_Composite_Phenotypic.csv"

df = pd.read_csv(FILE_PATH, encoding="latin1", engine="python")
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (1114, 348)


Unnamed: 0,SITE_ID,SUB_ID,NDAR_GUID,DX_GROUP,PDD_DSM_IV_TR,ASD_DSM_5,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,...,ADI_R_C3_TOTAL,ADI_R_C4_REPETITIVE_USE_OBJECTS,ADI_R_C4_HIGHER,ADI_R_C4_UNUSUAL_SENSORY_INTERESTS,ADI_R_C4_TOTAL,ADI_R_D_AGE_PARENT_NOTICED,ADI_R_D_AGE_FIRST_SINGLE_WORDS,ADI_R_D_AGE_FIRST_PHRASES,ADI_R_D_AGE_WHEN_ABNORMALITY,ADI_R_D_INTERVIEWER_JUDGMENT
0,ABIDEII-BNI_1,29006,,1,,,48.0,1,1.0,,...,,,,,,,,,,
1,ABIDEII-BNI_1,29007,,1,,,41.0,1,1.0,,...,,,,,,,,,,
2,ABIDEII-BNI_1,29008,,1,,,59.0,1,1.0,,...,,,,,,,,,,
3,ABIDEII-BNI_1,29009,,1,,,57.0,1,1.0,,...,,,,,,,,,,
4,ABIDEII-BNI_1,29010,,1,,,45.0,1,1.0,,...,,,,,,,,,,


In [3]:
# ==============================
# STEP 2: TARGET & FEATURE SPLIT
# ==============================

TARGET = "DX_GROUP"   # 1 = ASD, 2 = Control

# Binary classification (not regression)
y = df[TARGET].map({1: 1, 2: 0})

drop_cols = ["DX_GROUP", "SUB_ID", "FILE_ID", "SITE_ID"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns])

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

X shape: (1114, 345)
y distribution:
 DX_GROUP
0    593
1    521
Name: count, dtype: int64


In [4]:
# ==============================
# STEP 3: PREPROCESSING
# ==============================

num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Numerical imputation
if not num_cols.empty:
    for col in num_cols:
        median_val = X[col].median()
        if pd.isna(median_val):
            X[col] = X[col].fillna(0)
        else:
            X[col] = X[col].fillna(median_val)

# Impute categorical columns
if not cat_cols.empty:
    for col in cat_cols:
        col_values = X[col].astype(str).values.reshape(-1, 1)
        X.loc[:, col] = pd.Series(cat_imputer.fit_transform(col_values).ravel(), index=X.index)

    for col in cat_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

# Scaling
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Reshape for LSTM (samples, timesteps, features)
X = X.values.reshape(X.shape[0], X.shape[1], 1)

print("LSTM input shape:", X.shape)

LSTM input shape: (1114, 345, 1)


In [5]:
# ==============================
# STEP 4: TRAIN / TEST SPLIT
# ==============================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)

Train: (891, 345, 1)
Test: (223, 345, 1)


In [6]:
# ==============================
# STEP 5: BUILD LSTM MODEL
# ==============================

model = Sequential([
    LSTM(128, return_sequences=True, input_shape=X.shape[1:]),
    BatchNormalization(),
    Dropout(0.3),

    LSTM(64),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation="relu"),
    Dropout(0.3),

    Dense(1, activation="sigmoid")  # Binary classification
])

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [7]:
# ==============================
# STEP 6: SETUP CALLBACKS
# ==============================

callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=8,
        restore_best_weights=True
    ),
    ModelCheckpoint(
        "best_lstm_asd_model.h5",
        monitor="val_accuracy",
        save_best_only=True,
        verbose=1
    )
]

print("✅ Callbacks configured!")

✅ Callbacks configured!


In [8]:
# ==============================
# STEP 7: TRAIN MODEL
# ==============================

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 474ms/step - accuracy: 0.5167 - loss: 0.7457
Epoch 1: val_accuracy improved from None to 0.59218, saving model to best_lstm_asd_model.h5





Epoch 1: finished saving model to best_lstm_asd_model.h5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 633ms/step - accuracy: 0.5183 - loss: 0.7329 - val_accuracy: 0.5922 - val_loss: 0.6903
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step - accuracy: 0.4860 - loss: 0.7489
Epoch 2: val_accuracy did not improve from 0.59218
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 483ms/step - accuracy: 0.5239 - loss: 0.7297 - val_accuracy: 0.5922 - val_loss: 0.6901
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 394ms/step - accuracy: 0.5081 - loss: 0.7177
Epoch 3: val_accuracy improved from 0.59218 to 0.60894, saving model to best_lstm_asd_model.h5





Epoch 3: finished saving model to best_lstm_asd_model.h5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 464ms/step - accuracy: 0.5337 - loss: 0.7086 - val_accuracy: 0.6089 - val_loss: 0.6899
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 434ms/step - accuracy: 0.5394 - loss: 0.6992
Epoch 4: val_accuracy did not improve from 0.60894
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 487ms/step - accuracy: 0.5197 - loss: 0.7050 - val_accuracy: 0.4413 - val_loss: 0.6893
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 504ms/step - accuracy: 0.4846 - loss: 0.7173
Epoch 5: val_accuracy did not improve from 0.60894
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 559ms/step - accuracy: 0.4775 - loss: 0.7152 - val_accuracy: 0.4413 - val_loss: 0.6886
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 427ms/step - accuracy: 0.5075 - loss: 0.7044
Epoch 6: val_accuracy




Epoch 30: finished saving model to best_lstm_asd_model.h5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 264ms/step - accuracy: 0.5576 - loss: 0.6771 - val_accuracy: 0.6145 - val_loss: 0.6466
Epoch 31/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step - accuracy: 0.5164 - loss: 0.6826
Epoch 31: val_accuracy did not improve from 0.61453
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 269ms/step - accuracy: 0.5309 - loss: 0.6825 - val_accuracy: 0.6145 - val_loss: 0.6487
Epoch 32/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 370ms/step - accuracy: 0.5532 - loss: 0.6715
Epoch 32: val_accuracy did not improve from 0.61453
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 400ms/step - accuracy: 0.5492 - loss: 0.6702 - val_accuracy: 0.6145 - val_loss: 0.6515
Epoch 33/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288ms/step - accuracy: 0.5355 - loss: 0.6533
Epoch 33: val_accu

In [9]:
# ==============================
# STEP 8: EVALUATION
# ==============================

y_pred_prob = model.predict(X_test).ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred) * 100
roc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {acc:.2f}%")
print(f"ROC-AUC: {roc:.4f}")

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 115ms/step
Accuracy: 56.50%
ROC-AUC: 0.5302

Classification Report:

              precision    recall  f1-score   support

           0       0.55      1.00      0.71       119
           1       1.00      0.07      0.13       104

    accuracy                           0.57       223
   macro avg       0.78      0.53      0.42       223
weighted avg       0.76      0.57      0.44       223



In [10]:
# ==============================
# STEP 9: SAVE MODEL
# ==============================

model.save("final_lstm_asd_model.keras")
print("✅ LSTM model saved as final_lstm_asd_model.keras")

✅ LSTM model saved as final_lstm_asd_model.keras
