# Collecting some baselines from various models

In [224]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

%matplotlib inline

#### Display metrics using show_results()

In [225]:
def show_results(test_type, accuracy, cross_val, conf_mat):
    print()
    print(test_type)
    print('Accuracy:\t\t\t', accuracy)
    print('Cross Validation Results:\t', cross_val)
    print('Cross Validation Mean:\t\t', np.mean(cross_val))
    print('Confusion Matrix:\n', conf_mat)

### Load csv into dataframe 

In [226]:
df = pd.read_csv('Autism_Data.csv')
df.columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 
              'A7', 'A8', 'A9', 'A10', 'age', 'gender',
             'ethnicity', 'jundice', 'autism', 'country', 
             'app', 'result', 'age_category', 'relation',
             'app_prediction']
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,gender,ethnicity,jundice,autism,country,app,result,age_category,relation,app_prediction
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2,18 and more,?,NO


## Convert yes/no colums to 1/0 columns

In [227]:
gender_row = pd.Series(np.where(df['gender'] == 'm', 1, 0))
jund_row = pd.Series(np.where(df['jundice'] == 'yes', 1, 0))
autism_row = pd.Series(np.where(df['autism'] == 'yes', 1, 0))

df['gender'] = gender_row
df['jundice'] = jund_row
df['autism'] = autism_row

df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,gender,ethnicity,jundice,autism,country,app,result,age_category,relation,app_prediction
0,1,1,1,1,0,0,1,1,0,0,...,0,White-European,0,0,United States,no,6,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,1,Latino,0,1,Brazil,no,5,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,1,Latino,1,1,Spain,no,8,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,0,White-European,0,1,United States,no,6,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,0,?,0,0,Egypt,no,2,18 and more,?,NO


## Establish X, y datasets.  
### Set global paramaters for models found below. 

In [228]:
X = df[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'gender', 'jundice']]
y = df['autism']

test_size = 0.3
random_state = 1
alpha = 1e-5
cv = 3

# solver = {sgd, adam, lbfgs}
solver = 'lbfgs'

# hidden_layer_sizes = tuple, ith element is number of neurons in ith layer.
# best results with {(30, 20, 10), (20, 30, 20, 10)}
hidden_layer_sizes = (30, 20, 10)

orig_text = 'Original data'
smote_text = 'Preprocessed with SMOTE'

# Dicts to keep track of scores
orig_dict = {}
smote_dict = {}

## Multi Layer Perceptron Classifier (original data)

In [229]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

clf = MLPClassifier(solver=solver, 
                    alpha=alpha, 
                    hidden_layer_sizes=hidden_layer_sizes, 
                    random_state=1)

clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
orig_dict['MLP'] = accuracy
cross_val = cross_val_score(clf, X, y, cv=cv)
conf_mat = confusion_matrix(y_test, clf.predict(X_test))

show_results(orig_text, accuracy, cross_val, conf_mat)


Original data
Accuracy:			 0.8490566037735849
Cross Validation Results:	 [0.77118644 0.79059829 0.79059829]
Cross Validation Mean:		 0.7841276739581824
Confusion Matrix:
 [[175  19]
 [ 13   5]]


## Multi Layer Perceptron Classifier (data after SMOTE)

In [230]:
X_smote, y_smote = SMOTE().fit_sample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=101)

clf = MLPClassifier(solver=solver, 
                    alpha=alpha, 
                    hidden_layer_sizes=hidden_layer_sizes, 
                    random_state=1)

clf.fit(X_train, y_train)

smote_accuracy = clf.score(X_test, y_test)
smote_dict['MLP'] = smote_accuracy
cross_val = cross_val_score(clf, X_smote, y_smote, cv=cv)
conf_mat = confusion_matrix(y_test, clf.predict(X_test))

show_results(smote_text, accuracy, cross_val, conf_mat)


Preprocessed with SMOTE
Accuracy:			 0.8490566037735849
Cross Validation Results:	 [0.81707317 0.86764706 0.87009804]
Cross Validation Mean:		 0.8516060895903076
Confusion Matrix:
 [[159  40]
 [ 17 152]]


## Support Vector Machine (Original data)

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

clf_svm = svm.SVC()
clf_svm.fit(X_train, y_train)

svm_accuracy = clf_svm.score(X_test, y_test)
orig_dict['SVM'] = svm_accuracy
cross_val_svm = cross_val_score(clf_svm, X, y, cv=cv)
conf_mat_svm = confusion_matrix(y_test, clf_svm.predict(X_test))

show_results(orig_text, svm_accuracy, cross_val_svm, conf_mat_svm)


Original data
Accuracy:			 0.9150943396226415
Cross Validation Results:	 [0.86864407 0.87179487 0.87179487]
Cross Validation Mean:		 0.8707446037954513
Confusion Matrix:
 [[194   0]
 [ 18   0]]


## Support Vector Machine (with SMOTE)

In [232]:
X_smote, y_smote = SMOTE().fit_sample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=101)

clf_svm_smote = svm.SVC()
clf_svm_smote.fit(X_train, y_train)

svm_accuracy = clf_svm_smote.score(X_test, y_test)
smote_dict['SVM'] = svm_accuracy
cross_val_svm = cross_val_score(clf_svm_smote, X_smote, y_smote, cv=cv)
conf_mat_svm = confusion_matrix(y_test, clf_svm_smote.predict(X_test))

show_results(smote_text, svm_accuracy, cross_val_svm, conf_mat_svm)


Preprocessed with SMOTE
Accuracy:			 0.7336956521739131
Cross Validation Results:	 [0.76097561 0.78186275 0.72058824]
Cross Validation Mean:		 0.7544755300494183
Confusion Matrix:
 [[124  75]
 [ 23 146]]


## KNN (original data)

In [233]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

knn_accuracy = knn.score(X_test, y_test)
orig_dict['KNN'] = knn_accuracy
cross_val_knn = cross_val_score(knn, X, y, cv=cv)
conf_mat_knn = confusion_matrix(y_test, knn.predict(X_test))

show_results(orig_text, knn_accuracy, cross_val_knn, conf_mat_knn)


Original data
Accuracy:			 0.8820754716981132
Cross Validation Results:	 [0.83474576 0.85042735 0.85042735]
Cross Validation Mean:		 0.8452001545221884
Confusion Matrix:
 [[186   8]
 [ 17   1]]


## KNN (with SMOTE)

In [234]:
X_smote, y_smote = SMOTE().fit_sample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=101)

knn_smote = KNeighborsClassifier(n_neighbors=3)
knn_smote.fit(X_train, y_train)

knn_accuracy = knn_smote.score(X_test, y_test)
smote_dict['KNN'] = knn_accuracy
cross_val_knn = cross_val_score(knn_smote, X_smote, y_smote, cv=cv)
conf_mat_knn = confusion_matrix(y_test, knn_smote.predict(X_test))

show_results(smote_text, knn_accuracy, cross_val_knn, conf_mat_knn)


Preprocessed with SMOTE
Accuracy:			 0.7880434782608695
Cross Validation Results:	 [0.79512195 0.80392157 0.78921569]
Cross Validation Mean:		 0.796086402040491
Confusion Matrix:
 [[137  62]
 [ 16 153]]


## Guassian Process Classifier (original data)

In [235]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

gpc = GaussianProcessClassifier()
gpc.fit(X_train, y_train)

gpc_accuracy = gpc.score(X_test, y_test)
orig_dict['GPC'] = gpc_accuracy
cross_val_gpc = cross_val_score(gpc, X, y, cv=cv)
conf_mat_gpc = confusion_matrix(y_test, gpc.predict(X_test))

show_results(orig_text, gpc_accuracy, cross_val_gpc, conf_mat_gpc)


Original data
Accuracy:			 0.9150943396226415
Cross Validation Results:	 [0.86864407 0.87179487 0.87179487]
Cross Validation Mean:		 0.8707446037954513
Confusion Matrix:
 [[194   0]
 [ 18   0]]


## Guassian Process Classifier (with SMOTE)

In [236]:
X_smote, y_smote = SMOTE().fit_sample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=101)

gpc_smote = GaussianProcessClassifier()
gpc_smote.fit(X_train, y_train)

gpc_accuracy = gpc_smote.score(X_test, y_test)
smote_dict['GPC'] = gpc_accuracy
cross_val_gpc = cross_val_score(gpc_smote, X_smote, y_smote, cv=cv)
conf_mat_gpc = confusion_matrix(y_test, gpc_smote.predict(X_test))

show_results(smote_text, gpc_accuracy, cross_val_gpc, conf_mat_gpc)


Preprocessed with SMOTE
Accuracy:			 0.7853260869565217
Cross Validation Results:	 [0.77317073 0.80147059 0.80147059]
Cross Validation Mean:		 0.7920373027259684
Confusion Matrix:
 [[137  62]
 [ 17 152]]


## Logistic Regression (original data)

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

lgr = LogisticRegression()
lgr.fit(X_train, y_train)

lgr_acc = lgr.score(X_test, y_test)
orig_dict['lgr'] = lgr_acc
cross_val_lgr = cross_val_score(lgr, X, y, cv=cv)
conf_mat_lgr = confusion_matrix(y_test, lgr.predict(X_test))

show_results(orig_text, lgr_acc, cross_val_lgr, conf_mat_lgr)


Original data
Accuracy:			 0.9150943396226415
Cross Validation Results:	 [0.86864407 0.85470085 0.88034188]
Cross Validation Mean:		 0.8678956009464484
Confusion Matrix:
 [[193   1]
 [ 17   1]]


## Logistic Regression after SMOTE

In [238]:
X_smote, y_smote = SMOTE().fit_sample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=101)


lgr_smote = LogisticRegression()
lgr_smote.fit(X_smote, y_smote)

lgr_acc_smote = lgr_smote.score(X_test, y_test)
smote_dict['lgr'] = lgr_acc_smote
cross_val_lgr = cross_val_score(lgr_smote, X_smote, y_smote, cv=cv)
conf_mat_lgr = confusion_matrix(y_test, lgr_smote.predict(X_test))

show_results(smote_text, lgr_acc_smote, cross_val_lgr, conf_mat_lgr)


Preprocessed with SMOTE
Accuracy:			 0.6766304347826086
Cross Validation Results:	 [0.6902439  0.67647059 0.66176471]
Cross Validation Mean:		 0.6761597321855571
Confusion Matrix:
 [[123  76]
 [ 43 126]]


## Comparing results

In [239]:
print('without smote:')
for key, val in orig_dict.items():
    print(key, ':  ', val)

print('\n\nwith smote:')
for key, val in smote_dict.items():
    print(key, ':  ', val)


without smote:
MLP :   0.8490566037735849
SVM :   0.9150943396226415
KNN :   0.8820754716981132
GPC :   0.9150943396226415
lgr :   0.9150943396226415


with smote:
MLP :   0.845108695652174
SVM :   0.7336956521739131
KNN :   0.7880434782608695
GPC :   0.7853260869565217
lgr :   0.6766304347826086
