# ML Olympiad - Autism Prediction Challenge - By: Kaggle

<h3>Objective: </h3>
Improve Autism Screening by creating predicting the likelihood of having this condition.

<h3>Causes and Challenges:</h3>
It is mostly influenced by a combination of genetic and environmental factors. Because autism is a spectrum disorder, each person with autism has a distinct set of strengths and challenges. The ways in which people with autism learn, think and problem-solve can range from highly skilled to severely challenged. Research has made clear that high quality early intervention can improve learning, communication and social skills, as well as underlying brain development. Yet the diagnostic process can take several years.

In [139]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_curve
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import tensorflow as tf
import keras 

In [97]:
at = pd.read_csv('train.csv')
print(at.shape)
at.head()

(800, 22)


Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,1,1,1,1,0,1,1,...,f,White-European,no,no,United States,no,7.819715,18 and more,Self,0
1,2,0,0,0,0,0,0,0,0,0,...,f,South Asian,no,no,Australia,no,10.544296,18 and more,?,0
2,3,1,1,1,1,1,1,0,0,1,...,f,White-European,no,no,United Kingdom,no,13.167506,18 and more,Self,1
3,4,0,0,0,1,0,0,0,0,0,...,f,South Asian,no,no,New Zealand,no,1.530098,18 and more,?,0
4,5,0,0,0,0,1,0,0,0,1,...,m,Black,no,yes,Italy,no,7.949723,18 and more,Self,0


In [98]:
at.drop(['ID'], axis = 1, inplace = True)

In [99]:
at.isnull().sum()

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jaundice           0
austim             0
contry_of_res      0
used_app_before    0
result             0
age_desc           0
relation           0
Class/ASD          0
dtype: int64

In [100]:
# We observe that even though there are no nulls we have '?' in place of missing values

at.columns[at.isin({'?'}).sum() > 0] # Columns with ? in rows

Index(['ethnicity', 'relation'], dtype='object')

In [101]:
# Lets convert ? to Null

at = at.replace({'?' : np.nan})

In [102]:
at.isnull().sum() # We can now see the missing value count

A1_Score             0
A2_Score             0
A3_Score             0
A4_Score             0
A5_Score             0
A6_Score             0
A7_Score             0
A8_Score             0
A9_Score             0
A10_Score            0
age                  0
gender               0
ethnicity          151
jaundice             0
austim               0
contry_of_res        0
used_app_before      0
result               0
age_desc             0
relation            77
Class/ASD            0
dtype: int64

In [103]:
print('Ethnicity NaNs :\n ',at['ethnicity'].value_counts()) # we can replace 
print('\n Relation NaNs :\n ',at['relation'].value_counts()) # we can replace self for nulls in Relation col

Ethnicity NaNs :
  White-European     211
Asian              134
Middle Eastern     116
Black               45
Latino              44
South Asian         35
Others              24
Pasifika            18
Hispanic            16
Turkish              4
others               2
Name: ethnicity, dtype: int64

 Relation NaNs :
  Self                        617
Parent                       49
Relative                     43
Health care professional      7
Others                        7
Name: relation, dtype: int64


In [104]:
# Replace the Nulls with the above observed logical values

at['relation'] = at['relation'].fillna('Self')
at['ethnicity'] = at['ethnicity'].fillna('others')

In [105]:
at.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         800 non-null    int64  
 1   A2_Score         800 non-null    int64  
 2   A3_Score         800 non-null    int64  
 3   A4_Score         800 non-null    int64  
 4   A5_Score         800 non-null    int64  
 5   A6_Score         800 non-null    int64  
 6   A7_Score         800 non-null    int64  
 7   A8_Score         800 non-null    int64  
 8   A9_Score         800 non-null    int64  
 9   A10_Score        800 non-null    int64  
 10  age              800 non-null    float64
 11  gender           800 non-null    object 
 12  ethnicity        800 non-null    object 
 13  jaundice         800 non-null    object 
 14  austim           800 non-null    object 
 15  contry_of_res    800 non-null    object 
 16  used_app_before  800 non-null    object 
 17  result          

In [106]:
col = at.columns

for i in col:
    if(at[i].dtype == 'object'):
        print(at[i].value_counts())
        print('++++++++++++++++++++++++++++++++++++++++')

f    415
m    385
Name: gender, dtype: int64
++++++++++++++++++++++++++++++++++++++++
White-European     211
others             153
Asian              134
Middle Eastern     116
Black               45
Latino              44
South Asian         35
Others              24
Pasifika            18
Hispanic            16
Turkish              4
Name: ethnicity, dtype: int64
++++++++++++++++++++++++++++++++++++++++
no     604
yes    196
Name: jaundice, dtype: int64
++++++++++++++++++++++++++++++++++++++++
no     683
yes    117
Name: austim, dtype: int64
++++++++++++++++++++++++++++++++++++++++
United States           148
United Arab Emirates     94
New Zealand              93
India                    74
United Kingdom           58
                       ... 
Indonesia                 1
Tonga                     1
Iraq                      1
Finland                   1
Niger                     1
Name: contry_of_res, Length: 61, dtype: int64
++++++++++++++++++++++++++++++++++++++++
no     765
ye

In [107]:
# Apply Label Encoder to all Object type columns 

le = LabelEncoder()

at[at.select_dtypes(include = ['object']).columns] = at[at.select_dtypes(include = ['object']).columns].apply(le.fit_transform)

In [108]:
at.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         800 non-null    int64  
 1   A2_Score         800 non-null    int64  
 2   A3_Score         800 non-null    int64  
 3   A4_Score         800 non-null    int64  
 4   A5_Score         800 non-null    int64  
 5   A6_Score         800 non-null    int64  
 6   A7_Score         800 non-null    int64  
 7   A8_Score         800 non-null    int64  
 8   A9_Score         800 non-null    int64  
 9   A10_Score        800 non-null    int64  
 10  age              800 non-null    float64
 11  gender           800 non-null    int32  
 12  ethnicity        800 non-null    int32  
 13  jaundice         800 non-null    int32  
 14  austim           800 non-null    int32  
 15  contry_of_res    800 non-null    int32  
 16  used_app_before  800 non-null    int32  
 17  result          

In [109]:
# Lets Balance the Classes to get proper Result based on both kinds of data

at_label = pd.DataFrame(at[at['Class/ASD'] == 1]) 
at_label2 = pd.DataFrame(at[at['Class/ASD'] == 1]) 

at = pd.concat([at, at_label, at_label2])

In [110]:
at_x = at.drop(['Class/ASD'], axis =1 )
at_y = at['Class/ASD']

at_x_train, at_x_test, at_y_train, at_y_test = train_test_split(at_x, at_y, test_size=.2)
at_x.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation
0,1,0,1,1,1,1,0,1,1,1,18.605397,0,9,0,0,58,0,7.819715,0,4
1,0,0,0,0,0,0,0,0,0,1,13.829369,0,7,0,0,6,0,10.544296,0,4
2,1,1,1,1,1,1,0,0,1,1,14.679893,0,9,0,0,57,0,13.167506,0,4
3,0,0,0,1,0,0,0,0,0,0,61.035288,0,7,0,0,39,0,1.530098,0,4
4,0,0,0,0,1,0,0,0,1,1,14.256686,1,1,0,1,32,0,7.949723,0,4


In [111]:
model_name_lst = []

AccuracyScore = []
PrecisionScore = []
Recall_Score = []
F1_score = []
Roc_Scr = []


def model_scores(model_name, model_obj):
    
    pred_y = model_obj.predict(at_x_test)
    
#     print(pred_y)
    
#     ConfusionMatrix = confusion_matrix(pr_test_y, pr_pred_y) 
#     print('ConfusionMatrix')
#     print(ConfusionMatrix,'\n')

    AccuracyScore.append(np.round((model_obj.score(at_x_test, at_y_test))*100))

    PrecisionScore.append(((precision_score(at_y_test, pred_y))*100).round(2))

    Recall_Score.append(((recall_score(at_y_test, pred_y))*100).round(2))

    F1_score.append(((f1_score(at_y_test, pred_y))*100).round(2))

    Roc_Scr.append(((roc_auc_score(at_y_test, pred_y))*100).round(2))

    model_name_lst.append(model_name)


<h3>Logistic Regression classification</h3>

In [112]:
logreg = LogisticRegression(max_iter  =500)
logreg.fit(at_x_train, at_y_train)

pred_y = logreg.predict(at_x_test)

model_scores('Logistic Reg', logreg)

<h3>Decision Tree Classifier</h3>

In [113]:
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(at_x_train, at_y_train)
model_scores('Decision Tree', dt)

<h3>Random Forest Classifier</h3>

In [114]:
rf = RandomForestClassifier(n_estimators=100, max_depth=30, min_samples_split=2)
rf.fit(at_x_train, at_y_train)
model_scores('Random Forest', rf)

<h3>K nearest Neigbours Classifier</h3>

In [115]:
kn = KNeighborsClassifier(n_neighbors=22)
kn.fit(at_x_train, at_y_train)
model_scores('KNearestNeigbour', kn)

<h3>Support Vector Classifier</h3>

In [116]:
svc = SVC(kernel = 'poly', degree=10)
svc.fit(at_x_train, at_y_train)
model_scores('SVC',svc)

<h3>XG Boost Classifier</h3>

In [117]:
xgbr = XGBClassifier(verbosity = 0)
xgbr.fit(at_x_train, at_y_train)
model_scores('XGBoost', xgbr) 



In [118]:
Scores = pd.DataFrame({'Accuracy Score' : AccuracyScore, 'Precision Score': PrecisionScore, 'Recall Score': Recall_Score, 'F1 score': F1_score, 'Roc Scr':Roc_Scr}, index= [model_name_lst])
df = Scores.sort_values('Precision Score', ascending=False)
df

Unnamed: 0,Accuracy Score,Precision Score,Recall Score,F1 score,Roc Scr
Random Forest,96.0,94.66,97.64,96.12,95.55
XGBoost,97.0,94.07,100.0,96.95,96.26
SVC,77.0,93.98,61.42,74.29,78.37
Logistic Reg,89.0,89.76,89.76,89.76,88.81
Decision Tree,90.0,86.11,97.64,91.51,89.47
KNearestNeigbour,76.0,85.29,68.5,75.98,77.24


<h2> Now lets try using Deep Learning Techniques to predict Autism </h2>

In [119]:
# Noramlize the Independent Variable to get better results

at_x_train = tf.keras.utils.normalize(at_x_train)
at_x_test = tf.keras.utils.normalize(at_x_test)

# Convert Dataframe to Array in order to convert use Tensorflow
at_x_train = np.array(at_x_train)
at_x_test = np.array(at_x_test)
at_y_train = np.array(at_y_train)

In [120]:
model = tf.keras.models.Sequential() # modelis initialized
model.add(tf.keras.layers.Dense(128, activation= tf.nn.relu)) # 1st Hidden Layer
model.add(tf.keras.layers.Dense(128, activation= tf.nn.relu)) # 2st Hidden Layer
model.add(tf.keras.layers.Dense(128, activation= tf.nn.relu)) # 3st Hidden Layer
model.add(tf.keras.layers.Dense(2, activation= tf.nn.softmax)) # 10 is not a hyper parameter
# number of layers in target variable

In [121]:
# 'adam' optimizer (like the GD)

model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [123]:
model.fit(at_x_train, at_y_train, epochs=50, validation_split=.2, batch_size = 64)

In [124]:
y_pred = model.predict(at_x_test)
y_pred = np.argmax(y_pred, axis =1)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

In [125]:
from sklearn.metrics import classification_report
tab = confusion_matrix(at_y_test, y_pred)
print('Confusion Matrix : \n Actual Values -> Predicted Values\n \n',tab)
print('\n',classification_report(at_y_test, y_pred))

Confusion Matrix : 
 Actual Values -> Predicted Values
 
 [[ 92  15]
 [ 15 112]]

               precision    recall  f1-score   support

           0       0.86      0.86      0.86       107
           1       0.88      0.88      0.88       127

    accuracy                           0.87       234
   macro avg       0.87      0.87      0.87       234
weighted avg       0.87      0.87      0.87       234



In [126]:
df

Unnamed: 0,Accuracy Score,Precision Score,Recall Score,F1 score,Roc Scr
Random Forest,96.0,94.66,97.64,96.12,95.55
XGBoost,97.0,94.07,100.0,96.95,96.26
SVC,77.0,93.98,61.42,74.29,78.37
Logistic Reg,89.0,89.76,89.76,89.76,88.81
Decision Tree,90.0,86.11,97.64,91.51,89.47
KNearestNeigbour,76.0,85.29,68.5,75.98,77.24


In [127]:
# We've have managed to achieve High precision with Random Forest 

# Hence, we will now train our entire model with complete train.csv and predict on test.csv

In [128]:
rf1 = RandomForestClassifier(n_estimators=100, max_depth=30, min_samples_split=2)
rf1.fit(at_x, at_y)

RandomForestClassifier(max_depth=30)

In [129]:
svc = SVC(kernel = 'poly', degree=10)
svc.fit(at_x, at_y)

SVC(degree=10, kernel='poly')

In [130]:
xgbr = XGBClassifier(verbosity = 0)
xgbr.fit(at_x, at_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [131]:
logreg = LogisticRegression(max_iter  = 500)
logreg.fit(at_x, at_y)

LogisticRegression(max_iter=500)

In [132]:
tf = pd.read_csv('test.csv')

tf1 = tf.copy()
tf = tf.drop('ID', axis=1)

In [133]:
tf.columns[tf.isin({'?'}).sum() > 0] 

Index(['ethnicity', 'relation'], dtype='object')

In [134]:
tf = tf.replace({'?' : np.nan})

In [135]:
tf['relation'] = tf['relation'].fillna('Self')
tf['ethnicity'] = tf['ethnicity'].fillna('others')

In [136]:
# Apply Label Encoder to all Object type columns 

le = LabelEncoder()

tf[tf.select_dtypes(include = ['object']).columns] = tf[tf.select_dtypes(include = ['object']).columns].apply(le.fit_transform)

In [137]:
tf.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation
0,0,0,0,0,0,0,0,1,0,0,13.445319,1,0,0,0,40,0,-0.914467,0,4
1,1,0,0,0,1,0,0,0,0,0,25.057229,0,10,1,0,29,0,4.553447,0,2
2,1,0,0,1,0,1,0,1,0,0,28.799885,0,0,0,0,6,0,-1.581115,0,4
3,1,1,1,0,1,0,1,0,1,1,16.501526,0,4,1,0,8,0,11.77921,0,3
4,1,0,0,1,1,0,0,1,1,0,54.223869,0,9,0,0,5,0,10.717321,0,4


In [138]:
y_pred = logreg.predict(tf)

y_pred

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1], dtype=int64)

In [94]:
import csv 
    
# field names 
fields = ['ID', 'Class/ASD'] 
    
# name of csv file 
filename = "Predicted_Values_LogReg.csv"
    
# writing to csv file 
with open(filename, 'a', newline='') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile, dialect='excel') 
        
    # writing the fields 
    csvwriter.writerow(fields) 
        
    # writing the data rows 
    for w in range(0, len(y_pred)):
        csvwriter.writerow([w+1, y_pred[w]])