In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report

# SMOTE Balancing Function
def apply_smote(X, y):
    """Oversample ``(X, y)`` with SMOTE and return the resampled pair.

    The sampler is created with ``random_state=42`` so repeated calls on the
    same input give the same result.

    Args:
        X: Feature matrix accepted by ``SMOTE.fit_resample``.
        y: Class labels aligned row-for-row with ``X``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.
    """
    sampler = SMOTE(random_state=42)
    balanced_X, balanced_y = sampler.fit_resample(X, y)
    return balanced_X, balanced_y

# Function to downsample the larger datasets
def downsample(X, y, target_size):
    """Resample ``X`` and ``y`` together to exactly ``target_size`` rows.

    When ``target_size`` does not exceed the number of rows in ``X`` the rows
    are drawn *without* replacement, so the reduced set contains no
    duplicates. Sampling with replacement is used only when more rows are
    requested than exist, because ``resample`` cannot otherwise return that
    many.

    Args:
        X: Feature matrix (anything ``len()`` and ``sklearn.utils.resample``
            accept).
        y: Labels aligned row-for-row with ``X``.
        target_size: Number of rows to return.

    Returns:
        Tuple ``(X_resampled, y_resampled)``; the same row indices are applied
        to both, so features and labels stay aligned.
    """
    # Bootstrapping is only needed when we are asked to grow the dataset.
    needs_replacement = target_size > len(X)
    X_downsampled, y_downsampled = resample(
        X,
        y,
        replace=needs_replacement,
        n_samples=target_size,
        random_state=42,
    )
    return X_downsampled, y_downsampled

# 1. Load and Preprocess Image Data from train/test/val directories
def load_image_data(image_dir):
    """Extract ResNet50 features for every image under ``image_dir`` and balance them.

    Images are read with ``flow_from_directory`` (one sub-folder per class,
    ``class_mode='binary'``), resized to 224x224 and passed through an
    ImageNet-pretrained ResNet50 without its classification head. The
    resulting feature maps are flattened to one vector per image and
    oversampled with SMOTE.

    Args:
        image_dir: Directory containing one sub-directory per class.

    Returns:
        Tuple ``(image_features, labels)`` after SMOTE balancing.
    """
    # Imported locally so the fix is self-contained in this function.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # The ImageNet weights expect ResNet's own input preprocessing; a plain
    # 1/255 rescale feeds the network values outside the distribution it was
    # trained on.
    datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    image_data = datagen.flow_from_directory(
        image_dir,
        target_size=(224, 224),
        batch_size=32,
        class_mode='binary',
        shuffle=False,  # keep prediction order aligned with image_data.classes
    )

    # Use pretrained ResNet50 for feature extraction
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Extract features from images
    features = base_model.predict(image_data)

    image_features = features.reshape(features.shape[0], -1)  # Flatten features
    labels = image_data.classes  # Get labels from directory structure

    # Apply SMOTE
    image_features, labels = apply_smote(image_features, labels)

    return image_features, labels


# 2. Load and Preprocess EEG Data from CSV
def load_eeg_data(csv_file):
    """Load the EEG CSV, standardize its features and balance the classes.

    The strings ``'NEUTRAL'`` and ``'NEGATIVE'`` are encoded as 0 and
    ``'POSITIVE'`` as 1. Every column except ``'label'`` is treated as a
    feature, standardized with ``StandardScaler`` and then oversampled with
    SMOTE.

    Args:
        csv_file: Path to a CSV file with a ``'label'`` column.

    Returns:
        Tuple ``(eeg_features, labels)`` after scaling and SMOTE balancing.
        The feature matrix does not include the label column.
    """
    eeg_data = pd.read_csv(csv_file)
    encode = {'NEUTRAL': 0, 'POSITIVE': 1, 'NEGATIVE': 0}
    # infer_objects() restores numeric dtypes on pandas versions where
    # replace() no longer downcasts object columns automatically.
    eeg_data = eeg_data.replace(encode).infer_objects()

    labels = eeg_data['label'].values  # Extract labels

    scaler = StandardScaler()
    eeg_data_scaled = scaler.fit_transform(eeg_data.drop('label', axis=1))

    # Balance the *scaled feature matrix*. Passing the raw frame here would
    # discard the scaling and leave the label column among the features.
    eeg_data_balanced, labels = apply_smote(eeg_data_scaled, labels)

    return eeg_data_balanced, labels

# 3. Load and Preprocess AQ10 Data from CSV
def load_aq10_data(csv_file):
    """Load the AQ-10 questionnaire CSV, clean it, scale it and balance the classes.

    Processing steps:
      * map yes/no strings (either case) to 1/0 and ``'?'``/``'others'`` to
        ``'Others'`` across the whole frame;
      * drop the ``autism``, ``age_desc``, ``country_of_res``,
        ``used_app_before`` and ``result`` columns;
      * encode ``gender`` as ``m -> 0``, ``f -> 1``;
      * one-hot encode ``ethnicity`` and ``relation`` as integer columns;
      * treat ``age`` values outside the 1.5*IQR fences as outliers, then fill
        missing ages with 0;
      * standardize every column except ``Class/ASD`` and apply SMOTE.

    Args:
        csv_file: Path to the AQ-10 CSV file.

    Returns:
        Tuple ``(aq10_features, labels)`` after scaling and SMOTE balancing.
    """
    aq10_data = pd.read_csv(csv_file)

    value_map = {
        'YES': 1, 'NO': 0,
        'yes': 1, 'no': 0,
        '?': 'Others', 'others': 'Others',
    }
    # infer_objects() restores numeric dtypes on pandas versions where
    # replace() no longer downcasts object columns automatically.
    aq10_data = aq10_data.replace(value_map).infer_objects()

    aq10_data = aq10_data.drop(
        columns=['autism', 'age_desc', 'country_of_res', 'used_app_before', 'result']
    )

    # Assign the result back: an inplace replace on a column selection does
    # not reliably modify the parent frame.
    aq10_data['gender'] = aq10_data['gender'].replace({'m': 0, 'f': 1})

    aq10_data = pd.get_dummies(aq10_data, columns=['ethnicity', 'relation'], dtype=int)

    # Outlier handling for age: compute the IQR fences once on the raw column.
    age = aq10_data['age']
    q1 = age.quantile(0.25)
    q3 = age.quantile(0.75)
    iqr = q3 - q1
    upper_tail = q3 + 1.5 * iqr
    lower_tail = q1 - 1.5 * iqr
    # Mean of the in-range ages, used for values below the lower fence. The
    # earlier code took np.mean of the single outlier value, which left it
    # unchanged.
    typical_age = age[(age >= lower_tail) & (age <= upper_tail)].mean()
    age = age.mask(age > upper_tail, upper_tail)
    age = age.mask(age < lower_tail, typical_age)
    aq10_data['age'] = age.fillna(0)

    scaler = StandardScaler()
    aq10_data_scaled = scaler.fit_transform(aq10_data.drop('Class/ASD', axis=1))

    labels = aq10_data['Class/ASD'].values  # Extract labels

    aq10_data_balanced, labels = apply_smote(aq10_data_scaled, labels)  # Apply SMOTE

    return aq10_data_balanced, labels

# 4. Equalize the Dataset Sizes
def equalize_datasets(*datasets):
    """Resample every ``(features, labels)`` pair to the size of the smallest one.

    Args:
        *datasets: Any number of ``(features, labels)`` pairs.

    Returns:
        A list with one ``(features, labels)`` pair per input, each holding as
        many rows as the smallest input dataset. An empty list is returned
        when no datasets are given.
    """
    if not datasets:
        return []

    # Get the minimum dataset size
    min_size = min(len(data[0]) for data in datasets)

    # Downsample each dataset to the minimum size
    return [downsample(data[0], data[1], min_size) for data in datasets]

# 5. Combine Features
def combine_features(image_features, eeg_features, aq10_features):
    """Join the three modality feature matrices column-wise.

    Args:
        image_features: 2-D array of image features.
        eeg_features: 2-D array of EEG features with the same number of rows.
        aq10_features: 2-D array of AQ-10 features with the same number of rows.

    Returns:
        A single array whose columns are the image, EEG and AQ-10 columns, in
        that order.
    """
    modality_blocks = (image_features, eeg_features, aq10_features)
    return np.concatenate(modality_blocks, axis=1)

# 6. Neural Network Training
def train_combined_model(X_train, y_train, X_test, y_test):
    """Build, compile and fit a dense binary classifier on the combined features.

    The network is 512 -> 256 -> 128 ReLU units with 0.5 dropout after the
    first two hidden layers, followed by a single sigmoid output. It is
    trained for 50 epochs with batch size 32 using Adam and binary
    cross-entropy; ``(X_test, y_test)`` is passed as validation data.

    Args:
        X_train: Training feature matrix; its second dimension sets the input size.
        y_train: Training labels.
        X_test: Feature matrix used as validation data during fitting.
        y_test: Labels used as validation data during fitting.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_inputs = X_train.shape[1]
    network = Sequential([
        Dense(512, activation='relu', input_shape=(n_inputs,)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),  # Binary classification (0 or 1)
    ])

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    network.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
    )

    return network

# 7. Full Pipeline with Equalized Datasets
def autism_prediction_pipeline(image_dir, eeg_csv_file, aq10_csv_file):
    """Run the full multimodal pipeline and print a classification report.

    For the ``train`` and then the ``test`` image sub-directory, the image,
    EEG and AQ-10 data are loaded, equalized in size and concatenated
    column-wise. A dense network is trained on the train split (with the test
    split as validation data) and evaluated on the test split.

    NOTE(review): the labels for each split are taken from the image dataset
    only, while each modality is resampled independently, so a combined row
    does not necessarily pair samples that share a label.
    NOTE(review): the EEG and AQ-10 CSVs are loaded from the same files for
    both splits, so those modalities are not held out from training.

    Args:
        image_dir: Directory containing ``train`` and ``test`` image folders.
        eeg_csv_file: Path to the EEG CSV file.
        aq10_csv_file: Path to the AQ-10 CSV file.

    Returns:
        None. The classification report is printed.
    """
    def assemble_split(split_name):
        # Load the three modalities in the same order for every split.
        image_part = load_image_data(f'{image_dir}/{split_name}')
        eeg_part = load_eeg_data(eeg_csv_file)
        aq10_part = load_aq10_data(aq10_csv_file)

        # Ensure all datasets are of equal size
        equalized = equalize_datasets(
            (image_part[0], image_part[1]),
            (eeg_part[0], eeg_part[1]),
            (aq10_part[0], aq10_part[1]),
        )

        # Combine features; labels come from the image dataset.
        split_features = combine_features(equalized[0][0], equalized[1][0], equalized[2][0])
        split_labels = equalized[0][1]
        return split_features, split_labels

    combined_train_features, combined_train_labels = assemble_split('train')
    combined_test_features, combined_test_labels = assemble_split('test')

    # Train the model
    model = train_combined_model(
        combined_train_features,
        combined_train_labels,
        combined_test_features,
        combined_test_labels,
    )

    # Evaluate the model: threshold the sigmoid output at 0.5
    probabilities = model.predict(combined_test_features)
    predicted_classes = (probabilities > 0.5).astype(int)

    print(classification_report(combined_test_labels, predicted_classes))

# Example Usage: run the whole pipeline on the notebook's data folders.
autism_prediction_pipeline(
    image_dir='../Notebooks/Image/output',
    eeg_csv_file='../Notebooks/EEG/preprocessed_dataset.csv',
    aq10_csv_file='../Notebooks/Behavioural/autism_data.csv',
)


Found 2352 images belonging to 2 classes.


  self._warn_if_super_not_called()


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 1s/step


  eeg_data = eeg_data.replace(encode)
  aq10_data = aq10_data.replace({'YES':1, 'NO':0, '?':'Others', 'others':'Others'})
  aq10_data = aq10_data.replace({'yes':1, 'no':0, '?':'Others', 'others':'Others'})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  aq10_data['gender'].replace('m',0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the

Found 294 images belonging to 2 classes.


  self._warn_if_super_not_called()


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step


  eeg_data = eeg_data.replace(encode)
  aq10_data = aq10_data.replace({'YES':1, 'NO':0, '?':'Others', 'others':'Others'})
  aq10_data = aq10_data.replace({'yes':1, 'no':0, '?':'Others', 'others':'Others'})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  aq10_data['gender'].replace('m',0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the

Epoch 1/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 437ms/step - accuracy: 0.5158 - loss: 22048810205184.0000 - val_accuracy: 0.4975 - val_loss: 15177742286848.0000
Epoch 2/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 417ms/step - accuracy: 0.5175 - loss: 14608279535616.0000 - val_accuracy: 0.5007 - val_loss: 23376221437952.0000
Epoch 3/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 418ms/step - accuracy: 0.5234 - loss: 10987811897344.0000 - val_accuracy: 0.5098 - val_loss: 11506198511616.0000
Epoch 4/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 417ms/step - accuracy: 0.5174 - loss: 18747655979008.0000 - val_accuracy: 0.5074 - val_loss: 4988482879488.0000
Epoch 5/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 412ms/step - accuracy: 0.4991 - loss: 14456650203136.0000 - val_accuracy: 0.5147 - val_loss: 9162962501632.0000
Epoch 6/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3