# Collecting some baselines from various models

In [20]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

%matplotlib inline

#### Display metrics using show_results()

In [21]:
def show_results(meanVar, conf_mat, SMOTE=False):
    """Print a short evaluation report for one model.

    Parameters
    ----------
    meanVar : sequence
        Index 0 is printed as the mean accuracy and index 1 as the
        cross-validation variance.
    conf_mat : object
        Printed as-is under the "Confusion Matrix" heading.
    SMOTE : bool, default False
        Selects the banner line: truthy prints 'Preprocessed with SMOTE',
        falsy prints 'Original data'.
    """
    # The banner tells the reader which variant of the data produced the scores.
    banner = 'Preprocessed with SMOTE' if SMOTE else 'Original data'
    print(banner)

    report_rows = (
        ('Mean Accuracy: \t\t', 0),
        ('Variance from Cross Validation: \t', 1),
    )
    for label, position in report_rows:
        print(label, meanVar[position])

    print('Confusion Matrix: \n', conf_mat)

### Load csv into dataframe 

In [22]:
# Read the raw data, then replace the file's own header names with short labels.
df = pd.read_csv('Autism_Data.csv')

# Ten question columns (A1..A10) followed by the remaining attributes.
# NOTE: 'jundice' is spelled this way on purpose -- later cells index by it.
df.columns = (
    [f'A{i}' for i in range(1, 11)]
    + ['age', 'gender', 'ethnicity', 'jundice', 'autism', 'country',
       'app', 'result', 'age_category', 'relation', 'app_prediction']
)

df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,gender,ethnicity,jundice,autism,country,app,result,age_category,relation,app_prediction
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2,18 and more,?,NO


## Convert yes/no columns to 1/0 columns

In [23]:
def _binary_flag(values, positive_label):
    """Return `values` as a 0/1 integer Series (1 where equal to `positive_label`).

    A column that is already numeric is returned unchanged, so re-running this
    cell does not compare the converted 0/1 values against the text label and
    silently reset every flag to 0.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    # Built from the column itself, so the result keeps the frame's index and
    # aligns correctly on assignment even if the index is not 0..n-1.
    return (values == positive_label).astype(int)


# Any value other than the positive label is encoded as 0.
gender_row = _binary_flag(df['gender'], 'm')
jund_row = _binary_flag(df['jundice'], 'yes')
autism_row = _binary_flag(df['autism'], 'yes')

df['gender'] = gender_row
df['jundice'] = jund_row
df['autism'] = autism_row

df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,gender,ethnicity,jundice,autism,country,app,result,age_category,relation,app_prediction
0,1,1,1,1,0,0,1,1,0,0,...,0,White-European,0,0,United States,no,6,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,1,Latino,0,1,Brazil,no,5,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,1,Latino,1,1,Spain,no,8,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,0,White-European,0,1,United States,no,6,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,0,?,0,0,Egypt,no,2,18 and more,?,NO


## Establish X, y datasets.  
### Set global parameters for models found below.

In [24]:
# Features: the ten question columns plus the binarised gender and jaundice
# flags. Target: the binarised `autism` column.
X = df[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'gender', 'jundice']]
y = df['autism']

# Shared settings used by the model cells below.
test_size = 0.3      # fraction of rows held out by train_test_split
random_state = 1     # seed for the original-data split
alpha = 1e-5         # passed to MLPClassifier(alpha=...)
cv = 3               # passed to cross_val_score(cv=...)

# solver = {sgd, adam, lbfgs}
solver = 'lbfgs'

# hidden_layer_sizes = tuple, ith element is number of neurons in ith layer.
# best results with {(30, 20, 10), (20, 30, 20, 10)}
hidden_layer_sizes = (30, 20, 10)

orig_text = 'Original data'
smote_text = 'Preprocessed with SMOTE'

# Dicts to keep track of scores: model name -> (mean CV accuracy, CV variance)
orig_dict = {}
smote_dict = {}

# For training on original data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state)

# For training on data after SMOTE.
# `fit_resample` is the current imbalanced-learn API (`fit_sample` was removed),
# and the sampler is seeded so the synthetic rows are identical on every run.
smote_random_state = 101
X_smote, y_smote = SMOTE(random_state=smote_random_state).fit_resample(X, y)

# NOTE(review): oversampling happens BEFORE the split, so synthetic minority
# rows interpolated from rows that end up in the test set can land in the
# training set. Hold-out and cross-validation scores on the SMOTE data are
# therefore optimistic; resampling only the training portion would avoid this.
sX_train, sX_test, sy_train, sy_test = train_test_split(
    X_smote, y_smote, test_size=test_size, random_state=smote_random_state)

## Multi Layer Perceptron Classifier (original data)

In [25]:
# MLP baseline on the original (imbalanced) data.
# The seed comes from the shared `random_state` setting instead of a literal,
# so changing the configuration cell actually affects this model.
mlp = MLPClassifier(solver=solver,
                    alpha=alpha,
                    hidden_layer_sizes=hidden_layer_sizes,
                    random_state=random_state)

# This fit is only used for the hold-out confusion matrix below.
mlp.fit(X_train, y_train)

# Cross-validated accuracy over the full original data set.
mlp_cross_val = cross_val_score(mlp, X, y, cv=cv)
mlp_conf_mat = confusion_matrix(y_test, mlp.predict(X_test))

# Store (mean accuracy, variance across folds) for the final comparison.
orig_dict['MLP'] = (np.mean(mlp_cross_val), np.var(mlp_cross_val))
show_results(orig_dict['MLP'], mlp_conf_mat)

Original data
Mean Accuracy: 		 0.7869887488531556
Variance from Cross Validation: 	 0.0002768878717148167
Confusion Matrix: 
 [[175  19]
 [ 13   5]]


## Multi Layer Perceptron Classifier (data after SMOTE)

In [26]:
# MLP with the same architecture, trained on the SMOTE-resampled data.
# The seed comes from the shared `random_state` setting instead of a literal,
# so changing the configuration cell actually affects this model.
# (The variable name `mpl_smote` is kept as-is for compatibility.)
mpl_smote = MLPClassifier(solver=solver,
                          alpha=alpha,
                          hidden_layer_sizes=hidden_layer_sizes,
                          random_state=random_state)

# This fit is only used for the hold-out confusion matrix below.
mpl_smote.fit(sX_train, sy_train)

# Cross-validated accuracy over the full resampled data set.
# NOTE: this reuses (and overwrites) the names from the original-data MLP cell.
mlp_cross_val = cross_val_score(mpl_smote, X_smote, y_smote, cv=cv)
mlp_conf_mat = confusion_matrix(sy_test, mpl_smote.predict(sX_test))

# Store (mean accuracy, variance across folds) for the final comparison.
smote_dict['MLP'] = (np.mean(mlp_cross_val), np.var(mlp_cross_val))
show_results(smote_dict['MLP'], mlp_conf_mat, SMOTE=True)

Preprocessed with SMOTE
Mean Accuracy: 		 0.863868962219034
Variance from Cross Validation: 	 0.0013991708027609982
Confusion Matrix: 
 [[163  36]
 [  5 164]]


## Support Vector Machine (Original data)

In [27]:
# SVC with default hyper-parameters, fitted on the original training split
# (fit returns the estimator itself, so construction and fitting are chained).
clf_svm = svm.SVC().fit(X_train, y_train)

# Accuracy per fold on the full original data, and hold-out confusion matrix.
svm_cross_val = cross_val_score(clf_svm, X, y, cv=cv)
svm_conf_mat = confusion_matrix(y_test, clf_svm.predict(X_test))

# Record (mean accuracy, variance across folds) and print the report.
orig_dict['SVM'] = (svm_cross_val.mean(), svm_cross_val.var())
show_results(orig_dict['SVM'], svm_conf_mat)

Original data
Mean Accuracy: 		 0.8707446037954513
Variance from Cross Validation: 	 2.2061257412136467e-06
Confusion Matrix: 
 [[194   0]
 [ 18   0]]


## Support Vector Machine (with SMOTE)

In [28]:
# SVC with default hyper-parameters, fitted on the SMOTE training split
# (fit returns the estimator itself, so construction and fitting are chained).
clf_svm_smote = svm.SVC().fit(sX_train, sy_train)

# Accuracy per fold on the full resampled data, and hold-out confusion matrix.
svm_cross_val = cross_val_score(clf_svm_smote, X_smote, y_smote, cv=cv)
svm_conf_mat = confusion_matrix(sy_test, clf_svm_smote.predict(sX_test))

# Record (mean accuracy, variance across folds) and print the report.
smote_dict['SVM'] = (svm_cross_val.mean(), svm_cross_val.var())
show_results(smote_dict['SVM'], svm_conf_mat, SMOTE=True)

Preprocessed with SMOTE
Mean Accuracy: 		 0.7324964131994262
Variance from Cross Validation: 	 0.0006906604737130739
Confusion Matrix: 
 [[127  72]
 [ 39 130]]


## KNN (original data)

In [29]:
# 3-nearest-neighbours classifier fitted on the original training split
# (fit returns the estimator itself, so construction and fitting are chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Accuracy per fold on the full original data, and hold-out confusion matrix.
knn_cross_val = cross_val_score(knn, X, y, cv=cv)
knn_conf_mat = confusion_matrix(y_test, knn.predict(X_test))

# Record (mean accuracy, variance across folds) and print the report.
orig_dict['KNN'] = (knn_cross_val.mean(), knn_cross_val.var())
show_results(orig_dict['KNN'], knn_conf_mat)

Original data
Mean Accuracy: 		 0.8452001545221884
Variance from Cross Validation: 	 5.464715406188476e-05
Confusion Matrix: 
 [[186   8]
 [ 17   1]]


## KNN (with SMOTE)

In [30]:
# 3-nearest-neighbours classifier fitted on the SMOTE training split
# (fit returns the estimator itself, so construction and fitting are chained).
knn_smote = KNeighborsClassifier(n_neighbors=3).fit(sX_train, sy_train)

# Accuracy per fold on the full resampled data, and hold-out confusion matrix.
knn_cross_val = cross_val_score(knn_smote, X_smote, y_smote, cv=cv)
knn_conf_mat = confusion_matrix(sy_test, knn_smote.predict(sX_test))

# Record (mean accuracy, variance across folds) and print the report.
smote_dict['KNN'] = (knn_cross_val.mean(), knn_cross_val.var())
show_results(smote_dict['KNN'], knn_conf_mat, SMOTE=True)

Preprocessed with SMOTE
Mean Accuracy: 		 0.7969193368404272
Variance from Cross Validation: 	 6.774270429757122e-05
Confusion Matrix: 
 [[131  68]
 [  5 164]]


## Gaussian Process Classifier (original data)

In [31]:
# Gaussian process classifier with default settings, fitted on the original
# training split (fit returns the estimator itself, hence the chained call).
gpc = GaussianProcessClassifier().fit(X_train, y_train)

# Accuracy per fold on the full original data, and hold-out confusion matrix.
gpc_cross_val = cross_val_score(gpc, X, y, cv=cv)
gpc_conf_mat = confusion_matrix(y_test, gpc.predict(X_test))

# Record (mean accuracy, variance across folds) and print the report.
orig_dict['GPC'] = (gpc_cross_val.mean(), gpc_cross_val.var())
show_results(orig_dict['GPC'], gpc_conf_mat)

Original data
Mean Accuracy: 		 0.8707446037954513
Variance from Cross Validation: 	 2.2061257412136467e-06
Confusion Matrix: 
 [[194   0]
 [ 18   0]]


## Gaussian Process Classifier (with SMOTE)

In [32]:
# Gaussian process classifier with default settings, fitted on the SMOTE
# training split (fit returns the estimator itself, hence the chained call).
gpc_smote = GaussianProcessClassifier().fit(sX_train, sy_train)

# Accuracy per fold on the full resampled data, and hold-out confusion matrix.
gpc_cross_val = cross_val_score(gpc_smote, X_smote, y_smote, cv=cv)
gpc_conf_mat = confusion_matrix(sy_test, gpc_smote.predict(sX_test))

# Record (mean accuracy, variance across folds) and print the report.
smote_dict['GPC'] = (gpc_cross_val.mean(), gpc_cross_val.var())
show_results(smote_dict['GPC'], gpc_conf_mat, SMOTE=True)

Preprocessed with SMOTE
Mean Accuracy: 		 0.7952813645783516
Variance from Cross Validation: 	 6.399275470985418e-05
Confusion Matrix: 
 [[138  61]
 [ 26 143]]


## Logistic Regression (original data)

In [33]:
# Logistic regression with default settings, fitted on the original training
# split (fit returns the estimator itself, hence the chained call).
lgr = LogisticRegression().fit(X_train, y_train)

# Accuracy per fold on the full original data, and hold-out confusion matrix.
lgr_cross_val = cross_val_score(lgr, X, y, cv=cv)
lgr_conf_mat = confusion_matrix(y_test, lgr.predict(X_test))

# Record (mean accuracy, variance across folds) and print the report.
orig_dict['LGR'] = (lgr_cross_val.mean(), lgr_cross_val.var())
show_results(orig_dict['LGR'], lgr_conf_mat)

Original data
Mean Accuracy: 		 0.8678956009464484
Variance from Cross Validation: 	 0.00010985713396685145
Confusion Matrix: 
 [[193   1]
 [ 17   1]]


## Logistic Regression after SMOTE

In [34]:
# Logistic regression with default settings, fitted on the SMOTE training
# split (fit returns the estimator itself, hence the chained call).
lgr_smote = LogisticRegression().fit(sX_train, sy_train)

# Accuracy per fold on the full resampled data, and hold-out confusion matrix.
lgr_cross_val = cross_val_score(lgr_smote, X_smote, y_smote, cv=cv)
lgr_conf_mat = confusion_matrix(sy_test, lgr_smote.predict(sX_test))

# Record (mean accuracy, variance across folds) and print the report.
smote_dict['LGR'] = (lgr_cross_val.mean(), lgr_cross_val.var())
show_results(smote_dict['LGR'], lgr_conf_mat, SMOTE=True)

Preprocessed with SMOTE
Mean Accuracy: 		 0.6941415590626495
Variance from Cross Validation: 	 0.00018268181985558552
Confusion Matrix: 
 [[116  83]
 [ 42 127]]


## Comparing results

In [35]:
# Print both score tables: a heading, then one "<model> :   (mean, variance)"
# line per entry, in the order the models were added to each dict.
for heading, scores in (('without smote:', orig_dict),
                        ('\n\nwith smote:', smote_dict)):
    print(heading)
    for key, val in scores.items():
        print(key, ':  ', val)


without smote:
MLP :   (0.7869887488531556, 0.0002768878717148167)
SVM :   (0.8707446037954513, 2.2061257412136467e-06)
KNN :   (0.8452001545221884, 5.464715406188476e-05)
GPC :   (0.8707446037954513, 2.2061257412136467e-06)
LGR :   (0.8678956009464484, 0.00010985713396685145)


with smote:
MLP :   (0.863868962219034, 0.0013991708027609982)
SVM :   (0.7324964131994262, 0.0006906604737130739)
KNN :   (0.7969193368404272, 6.774270429757122e-05)
GPC :   (0.7952813645783516, 6.399275470985418e-05)
LGR :   (0.6941415590626495, 0.00018268181985558552)
