<a href="https://colab.research.google.com/github/RayirthDinesh/Autism-Spectrum-Disorder-Prediction/blob/main/ASDPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [None]:
#imports
from google.colab import files
from keras.regularizers import l2
from numpy import mean
from numpy import std
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Data Collection

In [None]:
# Load the training data.

# Original source of the data set (train.csv):
#https://www.kaggle.com/competitions/autismdiagnosis/data - Download train.csv

# Raw-file URL from which train.csv is read directly (no manual upload needed)
url = "https://raw.githubusercontent.com/RayirthDinesh/Autism-Spectrum-Disorder-Prediction/refs/heads/main/train.csv"

# Download and parse the CSV file into a pandas DataFrame
autism_data = pd.read_csv(url)



**Collect information about the data**


1.   Check for number of samples and features
2.   Number of Null Values




In [None]:
# DataFrame.shape is (rows, columns): rows are the samples, columns the features
num_samples, num_features = autism_data.shape

print(f'The dataset has {num_samples} samples and {num_features} features.')

# Preview the first rows
autism_data.head()

In [None]:
# Check for null values: info() lists each column's dtype and non-null count,
# so a count below the number of rows reveals missing entries
autism_data.info()

In [None]:
# Statistical summary of the dataset (count, mean, std, min, quartiles, max),
# transposed so that each row describes one column of the data
autism_data.describe().T

In [None]:
# Frequency of every category in each categorical column.
# Each entry pairs the printed heading with the DataFrame column it describes.
_sections = [
    ('Relation', 'relation'),
    ('Ethnicity', 'ethnicity'),
    ('Jaundice', 'jaundice'),
    ('Austim', 'austim'),
    ('Gender', 'gender'),
    ('Country of Residence', 'contry_of_res'),
    ('Used App Before', 'used_app_before'),
]

for _position, (_heading, _column) in enumerate(_sections):
    # Every section after the first is separated from the previous one by two blank lines
    _separator = '' if _position == 0 else '\n\n'
    print(f'{_separator}*** {_heading} ***')
    print(autism_data[_column].value_counts())

# Preprocessing Data

**Preprocess data**
1. Convert categorical features to numerical
2. Run PCA to check how many components are needed to explain the variance of the feature space
3. Remove ID column - unique for each row

In [None]:
#categorical features to numerical
def preprocess_data(data, categorical_features):
    """Replace each listed categorical column of ``data`` with integer codes.

    Every column named in ``categorical_features`` is encoded independently
    with a LabelEncoder fitted on that column alone. The DataFrame is
    modified in place and the same object is returned.
    """
    for column in categorical_features:
        # A fresh fit per column: codes of one column never depend on another
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

# Columns that hold categories and must be converted to integer codes
categorical_features = [
    'gender', 'ethnicity', 'jaundice', 'austim',
    'contry_of_res', 'used_app_before', 'age_desc', 'relation'
]

# Resulting encodings:
#gender: f = 0, m = 1
#ethnicity = (needs a more appropriate mapping in case the test data has an ethnicity that isn't listed in this dataset)
#jaundice = no = 0, yes = 1
#austim = no = 0, yes = 1
#contry_of_res = (needs a more appropriate mapping in case the test data has a country that isn't listed in this dataset)
#used_app_before = no = 0, yes = 1
#age_desc = 18 and more = 0 (all values are 0)
#relation = ? = 0, Health care professional = 1, Others = 2, Parent = 3, Relative = 4, self = 5

# Fold the '?' placeholder and the lowercase 'others' spelling into the single
# 'Others' category before encoding. The result is assigned back to the column:
# calling replace(inplace=True) on a column selection is a chained in-place
# operation that may act on a temporary copy and leave the DataFrame unchanged.
autism_data['ethnicity'] = autism_data['ethnicity'].replace({'?': 'Others', 'others': 'Others'})
autism_data = preprocess_data(autism_data, categorical_features)

autism_data.head()

In [None]:
#Run PCA
def runPCA():
    """Plot cumulative explained variance against the number of PCA components.

    Reads the notebook-level ``autism_data`` DataFrame, standardizes all of
    its columns, fits a PCA that keeps every component and shows the
    cumulative explained-variance curve. Returns nothing.
    """
    # NOTE(review): every column currently in autism_data is used, which
    # includes the Class/ASD target — confirm that this is intended.
    x = StandardScaler().fit_transform(autism_data.values)

    # Curve used to determine the elbow point (originally read off at 22;
    # re-check that value against the corrected axis below)
    pca = PCA().fit(x)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Entry i of the cumulative sum covers the first i + 1 components, so the
    # curve is plotted against 1..n. Plotting the values alone would place them
    # at x = 0..n-1 and shift every reading by one against the tick labels.
    component_counts = np.arange(1, len(cumulative_variance) + 1)
    plt.plot(component_counts, cumulative_variance)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.xticks(component_counts)

    plt.show()

runPCA()

In [None]:
# Correlation of every feature with the Class/ASD target

# Pairwise correlation of all columns; only the target's column is kept.
# The list selector keeps it a one-column DataFrame, which the heatmap needs.
corr = autism_data.corr()
target_corr = corr.loc[:, ['Class/ASD']]

# Tall, narrow heatmap on a fixed -1..1 color scale
plt.figure(figsize=(5, 8))
sns.heatmap(data=target_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation with Target Variable')
plt.show()

# Based on the graph, the most correlated features are A3_Score, A6_Score and A9_Score;
# the least correlated are gender and used_app_before

In [None]:
# Remove columns that aren't useful:
#   ID              - unique on each row
#   age_desc        - always "18 and more"
#   used_app_before - very low correlation with the target (from the heatmap)
autism_data = autism_data.drop(columns=['ID', 'age_desc', 'used_app_before'])

# Re-plot the explained-variance curve for the reduced set of columns
runPCA()

**Obtain final X and Y vectors and split train and test data**

In [None]:
# Labels: the Class/ASD value of each sample
final_Y = autism_data['Class/ASD']

# Features: every remaining column except the target
final_X = autism_data.drop(columns=['Class/ASD'])

# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_Y, test_Y = train_test_split(
    final_X,
    final_Y,
    test_size=0.15,
    random_state=23,
)

# Data Visualization

**Data Visualization**

Includes:
1.   Gender distribution with Class/ASD
2.   Jaundice distribution with Class/ASD
3.   Austim distribution with Class/ASD
4.   Heatmap correlation of each feature with Class/ASD
5.   Heatmap correlation of each feature and target with each other





In [None]:
# Gender and Class/ASD distribution: one bar per encoded gender value (f = 0, m = 1),
# bar height is seaborn's aggregate of Class/ASD for that group (the mean by default)
sns.barplot(x='gender', y='Class/ASD', hue='gender', data = autism_data)

In [None]:
# Jaundice and Class/ASD distribution: one bar per encoded jaundice value (no = 0, yes = 1),
# bar height is seaborn's aggregate of Class/ASD for that group (the mean by default)
sns.barplot(x='jaundice', y='Class/ASD', hue='jaundice', data = autism_data)

In [None]:
# austim and Class/ASD distribution: one bar per encoded austim value (no = 0, yes = 1),
# bar height is seaborn's aggregate of Class/ASD for that group (the mean by default)
sns.barplot(x='austim', y='Class/ASD', hue= 'austim', data = autism_data)

In [None]:
# Confounding check: correlation of the features (and the target) with each other
plt.figure(figsize=(20, 15))

# Full pairwise correlation matrix, drawn on a fixed -1..1 color scale
corr = autism_data.corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

# Models


**Model #1: Logistic regression**

In [None]:
# Logistic regression trained on the fixed train/test split
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=23)
log_reg.fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_log = log_reg.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_log))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_log))

scorelog = accuracy_score(test_Y, pred_Y_log)
print(f'Logistic Regression Model Accuracy: {scorelog:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_log = log_reg.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_log, pos_label=log_reg.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

# Feature ranking by recursive feature elimination: one feature is removed per
# step until a single one is left, so rank 1 is the last feature standing
rfe = RFE(
    estimator=LogisticRegression(solver='lbfgs', max_iter=1000, random_state=23),
    n_features_to_select=1,
    step=1,
).fit(train_X, train_Y)
ranking = rfe.ranking_
feature_names = train_X.columns

print(" ")
for rank, feature in sorted(zip(ranking, feature_names)):
    print(f"Feature: {feature}, Rank: {rank}")
print(" ")

**Model #2: Random Forest**

In [None]:
# Random forest (200 trees, fixed seed) trained on the fixed train/test split
rf_clf = RandomForestClassifier(n_estimators=200, random_state=23)

# Fit the model on the training data
rf_clf.fit(train_X, train_Y)

# Make predictions on the test data
pred_Y_rf = rf_clf.predict(test_X)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_rf))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_rf))

scorerf = accuracy_score(test_Y, pred_Y_rf)
print('Random Forest Model Accuracy: {:.2%}'.format(scorerf))

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_rf = rf_clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_rf, pos_label=rf_clf.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                  estimator_name='processing estimator')
display.plot()
plt.show()

# Feature ranking by recursive feature elimination (rank 1 = last feature kept).
# The estimator uses the same hyper-parameters and seed as the model evaluated
# above: an unseeded default forest would give a ranking that changes from run
# to run and that describes a different model than the one being reported.
rfe_rf = RFE(estimator=RandomForestClassifier(n_estimators=200, random_state=23),
             n_features_to_select=1, step=1)
rfe_rf = rfe_rf.fit(train_X, train_Y)
ranking = rfe_rf.ranking_

feature_names = train_X.columns
print(" ")
for rank, feature in sorted(zip(ranking, feature_names)):
    print(f"Feature: {feature}, Rank: {rank}")
print(" ")

**Model #3: SVM (Support Vector Machine)**

In [None]:
# Linear-kernel SVM; probability=True enables predict_proba for the ROC curve
svm_clf = SVC(kernel='linear', probability=True, random_state=23).fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_svm = svm_clf.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_svm))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_svm))

scoresvm = accuracy_score(test_Y, pred_Y_svm)
print(f'SVM (Support Vector Machine) Model Accuracy: {scoresvm:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_svm = svm_clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_svm, pos_label=svm_clf.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

# Feature ranking by recursive feature elimination with a linear SVM:
# one feature is dropped per step until one remains (rank 1)
rfe_svm = RFE(
    estimator=SVC(kernel='linear'),
    n_features_to_select=1,
    step=1,
).fit(train_X, train_Y)
ranking = rfe_svm.ranking_
feature_names = train_X.columns

print(" ")
for rank, feature in sorted(zip(ranking, feature_names)):
    print(f"Feature: {feature}, Rank: {rank}")
print(" ")

**Model #4: Decision trees**

In [None]:
# Decision tree (fixed seed) trained on the fixed train/test split
dt_clf = DecisionTreeClassifier(random_state=23).fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_dt = dt_clf.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_dt))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_dt))

scoredt = accuracy_score(test_Y, pred_Y_dt)
print(f'Decision Trees Model Accuracy: {scoredt:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_dt = dt_clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_dt, pos_label=dt_clf.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

**Model #5: Ridge Classifier**

In [None]:
# Ridge classifier (fixed seed) trained on the fixed train/test split
r_clf = RidgeClassifier(random_state=23).fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_r = r_clf.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
# (no ROC curve is drawn for this model in this cell)
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_r))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_r))

scorer = accuracy_score(test_Y, pred_Y_r)
print(f'Ridge Classifier Model Accuracy: {scorer:.2%}')


**Model #6: kNeighbors Classifier**

In [None]:
# 3-nearest-neighbors classifier trained on the fixed train/test split
kN_clf = KNeighborsClassifier(n_neighbors=3).fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_KN = kN_clf.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_KN))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_KN))

scoreKN = accuracy_score(test_Y, pred_Y_KN)
print(f'KNeighbors Classifier Model Accuracy: {scoreKN:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_kN = kN_clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_kN, pos_label=kN_clf.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

# Feature scores for the kNeighbors classifier.
# The printed values are the univariate f_classif scores computed by SelectKBest
# (top 10 kept for the pipeline's classifier); they come from the selector,
# not from the nearest-neighbors model itself.
knn_model = KNeighborsClassifier()
selector = SelectKBest(score_func=f_classif, k=10)
pipeline = Pipeline(steps=[
    ('feature_selection', selector),
    ('classification', knn_model),
])
pipeline.fit(train_X, train_Y)

scores = selector.scores_
feature_names = train_X.columns

# Highest score first
print("Feature Scores:")
for score, feature in sorted(zip(scores, feature_names), reverse=True):
    print(f"Feature: {feature}, Score: {score}")
print(" ")

**Model #7: XG Boost**

In [None]:
# XGBoost classifier (200 boosting rounds, fixed seed) trained on the fixed train/test split
xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    random_state=23,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_clf.fit(train_X, train_Y)

# Class predictions for the held-out samples
pred_Y_xgb = xgb_clf.predict(test_X)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_Y, pred_Y_xgb))

print("\nClassification Report:")
print(classification_report(test_Y, pred_Y_xgb))

scorexgb = accuracy_score(test_Y, pred_Y_xgb)
print(f'xgBoost Classifier Model Accuracy: {scorexgb:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_xgb = xgb_clf.predict_proba(test_X)[:, 1]
fpr, tpr, _ = metrics.roc_curve(test_Y, pred_prob_xgb, pos_label=xgb_clf.classes_[1])
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

# Feature scores for the XGBoost classifier.
# The printed values are the univariate f_classif scores computed by SelectKBest
# (top 10 kept for the pipeline's classifier); they come from the selector,
# not from the boosted model itself.
xgb_model = xgb.XGBClassifier()
selector = SelectKBest(score_func=f_classif, k=10)
pipeline = Pipeline(steps=[
    ('feature_selection', selector),
    ('classification', xgb_model),
])
pipeline.fit(train_X, train_Y)

scores = selector.scores_
feature_names = train_X.columns

# Highest score first
print("Feature Scores:")
for score, feature in sorted(zip(scores, feature_names), reverse=True):
    print(f"Feature: {feature}, Score: {score}")
print(" ")



# Average Accuracy of each Model

**Average Model Accuracy test:**

In [None]:
# Average accuracy of each of the 7 models over repeated random train/test splits.
#
# Every run draws a new split seed, standardizes the features with statistics
# from the training part only, then fits and scores all models on that SAME
# scaled split so their accuracies are directly comparable.

n_runs = 15

# Per-model list of accuracies (fractions in [0, 1]), one entry per run
run_scores = {
    'Random Forest': [],
    'Logistic Regression': [],
    'SVM': [],
    'Decision Tree': [],
    'Ridge Classifier': [],
    'KN Classifier': [],
    'XG Boost': [],
}

for _run in range(n_runs):
    randomState = random.randint(1, 100)
    train_X_test, test_X_test, train_Y_test, test_Y_test = train_test_split(
        final_X, final_Y, test_size=0.15, random_state=randomState)

    # Scale the data (fit on the training part only to avoid leaking test statistics)
    scaler = StandardScaler()
    train_X_test = scaler.fit_transform(train_X_test)
    test_X_test = scaler.transform(test_X_test)

    # Fresh, untrained models for this run; every seeded model uses the run's seed
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=200, random_state=randomState),
        'Logistic Regression': LogisticRegression(solver='liblinear', random_state=randomState, max_iter=1000),
        'SVM': SVC(kernel='linear', random_state=randomState),
        'Decision Tree': DecisionTreeClassifier(random_state=randomState),
        'Ridge Classifier': RidgeClassifier(random_state=randomState),
        'KN Classifier': KNeighborsClassifier(n_neighbors=3),
        'XG Boost': xgb.XGBClassifier(n_estimators=200, random_state=randomState,
                                      use_label_encoder=False, eval_metric='logloss'),
    }

    for model_name, clf in models.items():
        # All models, the ridge classifier included, are trained and evaluated
        # on this run's split; scoring predictions for a different split
        # against test_Y_test would compare unrelated samples.
        clf.fit(train_X_test, train_Y_test)
        run_predictions = clf.predict(test_X_test)
        # Keep the exact accuracy; truncating to whole percents before
        # averaging would bias every mean downwards.
        run_scores[model_name].append(accuracy_score(test_Y_test, run_predictions))

#Print Results
# Mean accuracy per model as a fraction in [0, 1]
accuracies = {
    model_name: sum(model_scores) / len(model_scores)
    for model_name, model_scores in run_scores.items()
}

# Best model first
sorted_accuracies = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)
for rank, (model, accuracy) in enumerate(sorted_accuracies, start=1):
    print(f'{rank}. {model} Model Accuracy: {accuracy:.2%}')


# Neural Networks

**Scikit-learn Neural Network Classification model:**

In [None]:
# Scikit-learn multi-layer perceptron evaluated on 5 random train/test splits.
# ScoresNN collects the accuracy of each run as a fraction in [0, 1].
ScoresNN = []

for i in range(5):
    randomState = random.randint(1, 100)
    X_trainNN, X_testNN, Y_trainNN, Y_testNN = train_test_split(
        final_X, final_Y, test_size=0.15, random_state=randomState)

    # Standardize with training statistics only
    scaler = StandardScaler()
    X_trainNN = scaler.fit_transform(X_trainNN)
    X_testNN = scaler.transform(X_testNN)

    # Two hidden layers of 100 units; the network is seeded with the run's seed
    # so a run can be reproduced from its randomState
    modelNN = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, activation='relu',
                            solver='adam', learning_rate_init=0.001, alpha=0.001,
                            batch_size=10, random_state=randomState)

    modelNN.fit(X_trainNN, Y_trainNN)
    predictNN = modelNN.predict(X_testNN)

    # Store the exact accuracy; truncating it to a whole percent would bias the average
    scoreNN = accuracy_score(Y_testNN, predictNN)
    ScoresNN.append(scoreNN)

    # ROC curve built from the predicted probability of the second class in classes_
    pred_prob_NN = modelNN.predict_proba(X_testNN)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(Y_testNN, pred_prob_NN, pos_label=modelNN.classes_[1])
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                      estimator_name='processing estimator')
    display.plot()
    plt.show()

# Reported as a percentage, like the accuracies of the other models
average_accuracy = sum(ScoresNN) / len(ScoresNN)
print(f'Average Accuracy: {average_accuracy:.2%}')


**TensorFlow Neural Network Classification model:**

In [None]:
# Keras feed-forward network evaluated on 5 random train/test splits.
# ScoresNNtf collects the test accuracy of each run as a fraction in [0, 1].
ScoresNNtf = []

for i in range(5):
    randomState = random.randint(1, 100)
    X_trainNNtf, X_testNNtf, Y_trainNNtf, Y_testNNtf = train_test_split(
        final_X, final_Y, test_size=0.15, random_state=randomState)

    # Standardize with training statistics only
    scaler = StandardScaler()
    X_trainNNtf = scaler.fit_transform(X_trainNNtf)
    X_testNNtf = scaler.transform(X_testNNtf)

    # One-hot encode the labels to match the 2-unit softmax output layer
    Y_trainNNtf = to_categorical(Y_trainNNtf)
    Y_testNNtf = to_categorical(Y_testNNtf)

    # 480 -> 32 -> 2 network with L2 weight regularization on the hidden layers
    modelNNtf = Sequential()
    modelNNtf.add(Dense(480, activation='relu', kernel_regularizer=l2(0.01), input_shape=(int(X_trainNNtf.shape[1]),)))
    modelNNtf.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
    modelNNtf.add(Dense(2, activation='softmax'))

    # One-hot targets with a softmax head call for categorical cross-entropy;
    # binary cross-entropy is meant for independent sigmoid outputs
    modelNNtf.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

    modelNNtf.fit(X_trainNNtf, Y_trainNNtf, epochs=200, batch_size=10, verbose=0)

    # evaluate() returns the loss followed by the compiled metrics, so index 1 is the accuracy
    evaluate = modelNNtf.evaluate(X_testNNtf, Y_testNNtf)

    ScoresNNtf.append(evaluate[1])

    # ROC curve from the predicted probability of class 1 against the class-1 column of the one-hot labels
    pred_prob_NNtf = modelNNtf.predict(X_testNNtf)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(Y_testNNtf[:, 1], pred_prob_NNtf)
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                      estimator_name='processing estimator')
    display.plot()
    plt.show()

# Reported as a percentage, like the accuracies of the other models
average_accuracy = sum(ScoresNNtf) / len(ScoresNNtf)
print(f'Average Accuracy: {average_accuracy:.2%}')


# Testing Best Models w/ different Features

**Test Models with only the test Scores**

In [None]:
#Create new dataframe with only Autism test scores data (testing to see if it performs better with only cognitive scores)
# NOTE(review): the selection is positional (first ten columns). It presumably yields the
# A*_Score columns only because the ID column was dropped in an earlier cell - verify the
# column order before re-running the cells in a different order.
dfAutismScores = autism_data.iloc[:,:10]
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed
dfAutismScores.head()
# Same test size and seed as the split on the full feature set, so both experiments hold out the same share of samples
train_XScores, test_XScores, train_YScores, test_YScores = train_test_split(dfAutismScores, final_Y, test_size=0.15, random_state=23)

In [None]:
# Logistic regression trained on the score-only features
log_regScores = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=23)
log_regScores.fit(train_XScores, train_YScores)

# Class predictions for the held-out samples
pred_Y_log_Scores = log_regScores.predict(test_XScores)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_YScores, pred_Y_log_Scores))

print("\nClassification Report:")
print(classification_report(test_YScores, pred_Y_log_Scores))

scorelog_Scores = accuracy_score(test_YScores, pred_Y_log_Scores)
print(f'Logistic Regression Model Accuracy: {scorelog_Scores:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_logScores = log_regScores.predict_proba(test_XScores)[:, 1]
fpr, tpr, _ = metrics.roc_curve(
    test_YScores,
    pred_prob_logScores,
    pos_label=log_regScores.classes_[1],
)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()

In [None]:
# Random forest (200 trees, fixed seed) trained on the score-only features
rf_clfScores = RandomForestClassifier(n_estimators=200, random_state=23)
rf_clfScores.fit(train_XScores, train_YScores)

# Class predictions for the held-out samples
pred_Y_rfScores = rf_clfScores.predict(test_XScores)

# Confusion matrix, per-class metrics and overall accuracy
print("Confusion Matrix:")
print(confusion_matrix(test_YScores, pred_Y_rfScores))

print("\nClassification Report:")
print(classification_report(test_YScores, pred_Y_rfScores))

scorerfScores = accuracy_score(test_YScores, pred_Y_rfScores)
print(f'Random Forest Model Accuracy: {scorerfScores:.2%}')

# ROC curve built from the predicted probability of the second class in classes_
pred_prob_rfScores = rf_clfScores.predict_proba(test_XScores)[:, 1]
fpr, tpr, _ = metrics.roc_curve(
    test_YScores,
    pred_prob_rfScores,
    pos_label=rf_clfScores.classes_[1],
)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='processing estimator',
)
display.plot()
plt.show()


**Having only Autism cognitive scores data yields worse results from the same models**