Necessary Imports + Util func definition

In [11]:
import json, os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC     # try rbf
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, \
    train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, \
    f1_score

def get_name(estimator):
    """Return a readable name for an estimator.

    For anything whose class is not called ``Pipeline`` this is simply the
    class name. For a ``Pipeline`` the names of its steps are resolved
    recursively (so nested pipelines are flattened) and joined with ``" + "``.
    """
    label = estimator.__class__.__name__
    if label != "Pipeline":
        return label
    # Each step is a (step_name, estimator) pair; only the estimator matters.
    return " + ".join(get_name(step[1]) for step in estimator.steps)

Data & Preprocessing

In [19]:
# Load the raw table and separate the target column from the predictors.
data_diabetes = pd.read_csv('../data/diabetes.csv')

label = data_diabetes['Outcome']
features = data_diabetes.drop(columns='Outcome')

# Limit outliers column by column: anything outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is clipped to the nearest fence.
# NOTE(review): the fences are computed on the full table, i.e. before the
# train/test split below, so the held-out rows influence the preprocessing.
for col in features.columns:
    series = features[col]
    Q1, Q3 = (series.quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    features[col] = series.clip(lower=lower, upper=upper)

# Hold out 10% of the rows for the final test.
# NOTE(review): this split is not stratified, unlike the repeated-split
# experiment further down (which passes stratify=label) -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.1,
    random_state=0,
)

Models + Hyperparam Grids

In [20]:
# Every entry is (pipeline, param_grid). All pipelines standardise the inputs
# first; grid keys use make_pipeline's "<lowercased class name>__<param>" form.
estimators = [
    # Logistic regression
    (
        make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
        dict(logisticregression__C=np.logspace(-1, 1, 3)),
    ),
    # k-nearest neighbours
    (
        make_pipeline(StandardScaler(), KNeighborsClassifier()),
        dict(kneighborsclassifier__n_neighbors=np.arange(5, 15, 1)),
    ),
    # Random forest
    (
        make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0)),
        dict(
            randomforestclassifier__n_estimators=[10, 20, 50, 100, 150, 200],
            randomforestclassifier__min_samples_split=[2, 3, 5],
        ),
    ),
    # Gradient boosting (scikit-learn's GradientBoostingClassifier, not XGBoost)
    (
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(random_state=0),
        ),
        dict(
            gradientboostingclassifier__n_estimators=[50, 100, 200, 400],
            gradientboostingclassifier__min_samples_split=[2, 3, 5],
            gradientboostingclassifier__subsample=[.2, .5, 1.],
            gradientboostingclassifier__max_depth=[3, 5, 7, 10],
        ),
    ),
    # Support vector machine, linear and RBF kernels
    (
        make_pipeline(StandardScaler(), SVC(random_state=0)),
        dict(
            svc__C=[2, 1, 0.5, 0.2],
            svc__kernel=['linear', 'rbf'],
        ),
    ),
    # Shallow neural network: two hidden layers of 16 units each
    # NOTE(review): per the scikit-learn docs, `learning_rate` only takes effect
    # with solver='sgd'; the solver is left at its default here -- confirm intent.
    (
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                hidden_layer_sizes=[16, 16],
                learning_rate_init=0.1,
                learning_rate='adaptive',
                max_iter=2000,
                random_state=0,
            ),
        ),
        dict(
            mlpclassifier__alpha=[.001, .1, 1, 2, 4],
            mlpclassifier__batch_size=[8, 16, 32, 64, 128],
        ),
    ),
]

# Display names: the classifier's class name with the scaler prefix removed.
names = [
    get_name(pipeline).replace("StandardScaler + ", "")
    for pipeline, _grid in estimators
]
names

['LogisticRegression',
 'KNeighborsClassifier',
 'RandomForestClassifier',
 'GradientBoostingClassifier',
 'SVC',
 'MLPClassifier']

Training

In [21]:
# Tune every pipeline with an exhaustive grid search on the training split
# (10 stratified folds, a single repeat) and keep each refitted winner.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
best_estimators = []

for name, (estimator, param_grid) in zip(names, estimators):
    grid = GridSearchCV(estimator, param_grid, cv=cv).fit(X_train, y_train)
    best_estimators.append(grid.best_estimator_)
    print(name + ' done')
    

LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done


Save models + cross-val scores + test performance

**Important:** The cell below is for re-running only. Be careful not to overwrite the saved models themselves; otherwise, retraining will cost you about 9 minutes of your life.

In [None]:
# Load trained models. Used only when re-running the notebook.
# Prerequisite: re-run the cells that define `names` and the train/test split.
#
# The models are loaded by name, in the order of `names`, instead of iterating
# over os.listdir(): directory listings come back in arbitrary order (and
# include any stray file in the folder), which would pair each model with the
# wrong name in the zip() below and break the `named_steps` lookup.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you created.
best_estimators = [joblib.load(f'../ml/models/{name}.pkl') for name in names]

# 10-fold stratified CV repeated 200 times -> 2000 accuracy values per model.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=200, random_state=0)
est_cls = {
    'LogisticRegression': LogisticRegression,
    'KNeighborsClassifier': KNeighborsClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier,
    'SVC': SVC,
    'MLPClassifier': MLPClassifier,
}
cv_scores = {}
test_scores = {}

# The class lookup must line up one-to-one with the model names.
assert list(est_cls.keys()) == names

for name, estimator in zip(names, best_estimators):
    # Rebuild an unfitted pipeline with the tuned hyper-parameters; the
    # classifier step is keyed by its lower-cased class name.
    hyperparams = estimator.named_steps[name.lower()].get_params()
    base = make_pipeline(
        StandardScaler(),
        est_cls[name](**hyperparams),
    )
    cv_scores[name] = cross_val_score(
        base, X_train, y_train, cv=cv, scoring='accuracy'
    )

    # Hold-out performance of the already-fitted, loaded model.
    y_pred = estimator.predict(X_test)
    test_scores[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
    }

    print(name +' done')

# Written to the same ../ml/scores/ folder as the first-run export cell, so a
# re-run refreshes the same files instead of targeting a different directory.
df_cv_scores = pd.DataFrame(cv_scores)
df_cv_scores.to_csv('../ml/scores/kfoldcv_scores.csv', index=False)

df_test_scores = pd.DataFrame.from_dict(test_scores, orient='index')
df_test_scores.to_csv('../ml/scores/test_performance.csv', index_label='Model')

In [None]:
# 10-fold stratified CV repeated 200 times -> 2000 accuracy values per model.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=200, random_state=0)

# Class lookup keyed by class name, in the same order as `names`.
est_cls = {
    cls.__name__: cls
    for cls in (
        LogisticRegression,
        KNeighborsClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
        SVC,
        MLPClassifier,
    )
}
assert list(est_cls) == names

# Hold-out metrics reported for every model, in column order.
metric_fns = {
    'Accuracy': accuracy_score,
    'Recall': recall_score,
    'Precision': precision_score,
    'F1-Score': f1_score,
}

cv_scores = {}
test_scores = {}

for name, estimator in zip(names, best_estimators):
    # Persist the tuned, fitted pipeline first so it survives a kernel restart.
    joblib.dump(estimator, f'../ml/models/{name}.pkl')

    # Fresh, unfitted copy of the tuned pipeline for the repeated CV estimate;
    # the classifier step is keyed by its lower-cased class name.
    tuned_step = estimator.named_steps[name.lower()]
    base = make_pipeline(
        StandardScaler(),
        est_cls[name](**tuned_step.get_params()),
    )
    cv_scores[name] = cross_val_score(
        base, X_train, y_train, cv=cv, scoring='accuracy'
    )

    # Hold-out performance of the fitted model itself.
    y_pred = estimator.predict(X_test)
    test_scores[name] = {
        metric: score_fn(y_test, y_pred)
        for metric, score_fn in metric_fns.items()
    }

    print(name +' done')


In [28]:
# One column per model, one row per CV fold/repeat.
df_cv_scores = pd.DataFrame(cv_scores)
# One row per model, one column per hold-out metric.
df_test_scores = pd.DataFrame.from_dict(test_scores, orient='index')

# Persist both tables next to the saved models.
df_cv_scores.to_csv('../ml/scores/kfoldcv_scores.csv', index=False)
df_test_scores.to_csv('../ml/scores/test_performance.csv', index_label='Model')

In [28]:
# Repeat the full tune-then-test procedure on several stratified train/test
# splits (one per seed) to see how much the hold-out metrics depend on the
# particular split. `scores[i][model_name]` holds the metrics for seed i.
N_ROUNDS = 10

# The splitter has a fixed integer seed, so it is built once and shared by
# every round. It gets its own name so the notebook-level `cv` is untouched.
round_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)

scores = []
for i in range(N_ROUNDS):
    print(f'----------- Round {i+1} ------------')

    # Round-local names on purpose: reusing X_train / X_test / y_train / y_test
    # here would silently replace the main split that the model-saving and
    # model-reloading cells rely on when they are re-run afterwards.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, label, test_size=0.1, stratify=label, random_state=i
    )

    round_scores = {}
    for name, (estimator, param_grid) in zip(names, estimators):
        grid = GridSearchCV(
            estimator=estimator, param_grid=param_grid, cv=round_cv
        )
        grid.fit(X_tr, y_tr)

        y_pred = grid.best_estimator_.predict(X_te)
        round_scores[name] = {
            'Accuracy': accuracy_score(y_te, y_pred),
            'Recall': recall_score(y_te, y_pred),
            'Precision': precision_score(y_te, y_pred),
            'F1-Score': f1_score(y_te, y_pred),
        }
        print(name +' done')

    scores.append(round_scores)


----------- Round 1 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done
----------- Round 2 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done
----------- Round 3 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done
----------- Round 4 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done
----------- Round 5 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SVC done
MLPClassifier done
----------- Round 6 ------------
LogisticRegression done
KNeighborsClassifier done
RandomForestClassifier done
GradientBoostingClassifier done
SV

In [29]:
# Print one metrics table per split seed (rows: models, columns: metrics).
for seed, iter_scores in enumerate(scores):
    df = pd.DataFrame.from_dict(iter_scores, orient='index')
    print('Seed', seed)
    print('-'*50, df, sep='\n')

# Keep the raw per-seed metrics as JSON for later inspection.
with open('../misc/final_test_scores.json', 'w') as file:
    file.write(json.dumps(scores, indent=4))

Seed 0
--------------------------------------------------
                            Accuracy    Recall  Precision  F1-Score
LogisticRegression          0.818182  0.629630   0.809524  0.708333
KNeighborsClassifier        0.779221  0.555556   0.750000  0.638298
RandomForestClassifier      0.857143  0.666667   0.900000  0.765957
GradientBoostingClassifier  0.870130  0.814815   0.814815  0.814815
SVC                         0.818182  0.629630   0.809524  0.708333
MLPClassifier               0.805195  0.740741   0.714286  0.727273
Seed 1
--------------------------------------------------
                            Accuracy    Recall  Precision  F1-Score
LogisticRegression          0.688312  0.407407   0.578947  0.478261
KNeighborsClassifier        0.701299  0.481481   0.590909  0.530612
RandomForestClassifier      0.701299  0.444444   0.600000  0.510638
GradientBoostingClassifier  0.675325  0.481481   0.541667  0.509804
SVC                         0.675325  0.370370   0.555556  0.444444
