In [1]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
X_train = pd.read_csv("data/clean-data/Xtrain-clean-autism-screening.csv", index_col=0)
y_train = pd.read_csv("data/clean-data/ytrain-clean-autism-screening.csv", index_col=0)
X_test = pd.read_csv("data/clean-data/Xtest-clean-autism-screening.csv", index_col=0)
y_test = pd.read_csv("data/clean-data/ytest-clean-autism-screening.csv", index_col=0)


In [3]:
numeric_features = ["age", 
                    "result"]

one_hot_features = ["gender", 
                    "ethnicity", 
                    "jaundice", 
                    "country_of_res", 
                    "used_app_before", 
                    "age_desc", 
                    "relation",
                    "Class/ASD"]

other_columns = list(X_train.columns[0:10])




In [4]:
preprocessor = ColumnTransformer(sparse_threshold=0,
    transformers=[
        ("scale", 
         StandardScaler(), 
         numeric_features),
        ("one_hot", 
         OneHotEncoder(drop=None, 
                       handle_unknown="ignore"), 
         one_hot_features)
    ])

In [5]:
X_train_temp = pd.DataFrame(preprocessor.fit_transform(X_train), 
            index = X_train.index,
            columns = (numeric_features + 
                       list(preprocessor
                           .named_transformers_["one_hot"]
                           .get_feature_names(one_hot_features)))
                      )

    
X_test_temp = pd.DataFrame(preprocessor.transform(X_test),
                     index = X_test.index,
                     columns = X_train_temp.columns)

X_train = X_train_temp.join(X_train[other_columns])
X_test = X_test_temp.join(X_test[other_columns])

In [6]:
le = LabelEncoder()

y_train = le.fit_transform(y_train.to_numpy().ravel())
y_test = le.transform(y_test.to_numpy().ravel())

In [25]:
lr = LogisticRegression()
print("LR score on train set:", lr.fit(X_train, y_train).score(X_train, y_train))
print("LR score on test set:", lr.score(X_test, y_test))

dt = DecisionTreeClassifier()
print("DT score on train set:", dt.fit(X_train, y_train).score(X_train, y_train))
print("DT score on test set:", dt.score(X_test, y_test))

rf = RandomForestClassifier()
print("RF score on train set:", rf.fit(X_train, y_train).score(X_train, y_train))
print("RF score on test set:", rf.score(X_test, y_test))

svm = SVC()
print("SVM score on train set:", svm.fit(X_train, y_train).score(X_train, y_train))
print("SVM score on test set:", svm.score(X_test, y_test))

knn = KNeighborsClassifier()
print("KNN score on train set:", knn.fit(X_train, y_train).score(X_train, y_train))
print("KNN score on test set:", knn.score(X_test, y_test))

LR score on train set: 0.8497191011235955
LR score on test set: 0.8539325842696629
DT score on train set: 1.0
DT score on test set: 0.7415730337078652
RF score on train set: 1.0
RF score on test set: 0.848314606741573
SVM score on train set: 0.8455056179775281
SVM score on test set: 0.8426966292134831
KNN score on train set: 0.8581460674157303
KNN score on test set: 0.8314606741573034


In [45]:
## Pipeline

estimators = [lr, dt, rf, svm, knn]

# pipe_steps = [('poly', PolynomialFeatures(...)),
#               ('lr', LogisticRegression(...)),
#               ('dt', DecisionTreeClassifier(...)),
#               ('rf', RandomForestClassifier(...)),
#               ('svm', SVC(...)),
#               ('knn', KNeighborsClassifier(...))]
             
# pipe = Pipeline(steps = pipe_steps)

# 'linear' kernel takes an insanely long time??
# https://datascience.stackexchange.com/questions/989/svm-using-scikit-learn-runs-endlessly-and-never-completes-execution

params = [{'C':[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
         'max_iter': [50, 100, 200, 250, 300]},
         {'max_depth': [1, 5, 10, 15, 20, 25, None],
         'max_features': [3, 5, 10, 20, 25, 50, 100, None]},
         {'min_impurity_decrease': [0, 0.25, 0.5],
         'max_features': [3, 5, 10, 20, 50, 100, 'auto']},
         {'C':[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
         'gamma':[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
         {'n_neighbors': [2, 5, 10, 15, 20, 50, 100],
         'algorithm': ['auto', 'brute']}]

for i in range(len(estimators)):
    search = GridSearchCV(estimator=estimators[i], 
                          param_grid=params[i],
                          cv = 10,
                          n_jobs=-1)
    
    search_object = search.fit(X_train, y_train)
    
    print(search_object.best_estimator_)
    print(search_object.best_params_)
    print(search_object.best_score_, "\n\n")

LogisticRegression(C=1e-06, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
{'C': 1e-06, 'max_iter': 50}
0.8455007824726135 


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=3, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
{'max_depth': 1, 'max_features': 3}
0.8455007824726135 


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=

In [47]:
rf = RandomForestClassifier(max_features=3, min_impurity_decrease=0)
rf.fit(X_train, y_train).score(X_train, y_train)


1.0

In [48]:
rf.score(X_test, y_test)

0.8314606741573034