In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
df = pd.read_csv("pre_processed_dataset.csv")
print("Dataframe shape: {}".format(df.shape))
df.head()

Dataframe shape: (47971, 52)


Unnamed: 0,person_id,status,Creatine kinase [Enzymatic activity/volume] in Serum or Plasma_high,Kappa light chains/Lambda light chains [Mass Ratio] in Serum by Immunoassay_date,Systolic blood pressure_value,Systolic blood pressure_high,Calcidiol [Mass/volume] in Serum or Plasma_value,Erythrocyte sedimentation rate by Westergren method_value,Prostate specific Ag [Mass/volume] in Serum or Plasma_date,Testosterone Free [Mass/volume] in Serum or Plasma_value,...,"Glomerular filtration rate/1.73 sq M.predicted among non-blacks [Volume Rate/Area] in Serum, Plasma or Blood by Creatinine-based formula (MDRD)_date","Glomerular filtration rate/1.73 sq M.predicted among non-blacks [Volume Rate/Area] in Serum, Plasma or Blood by Creatinine-based formula (MDRD)_value",Base deficit in Arterial blood_date,Creatinine [Mass/volume] in Blood_value,Phosphate [Mass/volume] in Serum or Plasma_value,Oxygen content in Arterial blood_value,Lactate [Moles/volume] in Venous blood_value,Lactate [Moles/volume] in Venous blood_high,Hematocrit [Volume Fraction] of Blood by Automated count_value,Hematocrit [Volume Fraction] of Blood by Automated count_high
0,0,0.0,0.843077,2424,0.835366,0.0,0.664,0.904762,2464,0.6,...,2312,1.0,852,0.0,0.491803,0.497561,0.0,0.0,0.0,0.0
1,1,0.0,1.0,3290,0.847561,0.75,0.656,0.0,2596,0.082353,...,2153,1.0,2052,0.0,0.836066,0.0,0.0,0.0,0.541667,0.9
2,2,0.0,1.0,3323,0.0,0.0,0.1536,0.035714,2461,0.611765,...,2739,1.0,0,0.59375,0.704918,0.902439,0.185185,1.0,0.0,0.0
3,3,0.0,1.0,3284,0.890244,0.985714,0.5584,0.27381,1804,0.0,...,0,0.0,1664,0.28125,0.491803,0.678049,0.259259,0.76,0.0,0.0
4,4,0.0,1.0,3004,0.676829,0.807143,0.1248,0.0,1853,0.176471,...,2813,0.916667,0,0.21875,0.0,0.0,0.074074,1.0,0.0,0.0


In [9]:
classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]
"""names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]"""
names = ["Nearest Neighbors",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

In [10]:
features = df.columns.tolist()[2:]
outcome = 'status'

In [11]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, shuffle=True, test_size=0.3, stratify=df[outcome])

In [12]:
df[outcome] = df[outcome].astype(int)

In [13]:
from sklearn.metrics import precision_recall_fscore_support
scores = {}
for name, clf in zip(names, classifiers):
    print("Training " + name)
    clf.fit(train[features], train[outcome])
    train_score =  clf.score(train[features], train[outcome])
    train_precision_recall_metrics = precision_recall_fscore_support(train[outcome], clf.predict(train[features]), average='binary')
    test_score = clf.score(test[features], test[outcome])
    test_precision_recall_metrics = precision_recall_fscore_support(test[outcome], clf.predict(test[features]), average='binary')
    scores[name] = {'test accuracy' : test_score, 'test precision' : test_precision_recall_metrics[0], 'test recall': test_precision_recall_metrics[1], 'train accuracy' : train_score, 'train precision' : train_precision_recall_metrics[0], 'train recall' : train_precision_recall_metrics[1]}
    print("Test Accuracy is {}\t Test Precision is {}\t Test Recall is {}".format(test_score, test_precision_recall_metrics[0], test_precision_recall_metrics[1]))
    print("Train Accuracy is {}\t Train Precision is {}\t Train Recall is {}".format(train_score, train_precision_recall_metrics[0], train_precision_recall_metrics[1]))
    print("-----------")

Training Nearest Neighbors
Test Accuracy is 0.876876042245692	 Test Precision is 0.06699751861042183	 Test Recall is 0.018973998594518624
Train Accuracy is 0.9105988862086423	 Train Precision is 0.6751381215469613	 Train Recall is 0.18409159385357035
-----------
Training Decision Tree
Test Accuracy is 0.9004307948860478	 Test Precision is 0.0	 Test Recall is 0.0
Train Accuracy is 0.9017838530033652	 Train Precision is 0.7837837837837838	 Train Recall is 0.008737571557698103
-----------
Training Random Forest


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy is 0.9011256253474152	 Test Precision is 0.0	 Test Recall is 0.0
Train Accuracy is 0.9011584621340719	 Train Precision is 0.0	 Train Recall is 0.0
-----------
Training Neural Net
Test Accuracy is 0.5116731517509727	 Test Precision is 0.10655622630913941	 Test Recall is 0.533380182712579
Train Accuracy is 0.5129694154084398	 Train Precision is 0.10716653607377494	 Train Recall is 0.5357035251581802
-----------
Training AdaBoost


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy is 0.9011256253474152	 Test Precision is 0.0	 Test Recall is 0.0
Train Accuracy is 0.9011584621340719	 Train Precision is 0.0	 Train Recall is 0.0
-----------
Training Naive Bayes
Test Accuracy is 0.9010561423012785	 Test Precision is 0.0	 Test Recall is 0.0
Train Accuracy is 0.9011584621340719	 Train Precision is 0.0	 Train Recall is 0.0
-----------
Training QDA
Test Accuracy is 0.8978599221789884	 Test Precision is 0.14925373134328357	 Test Recall is 0.007027406886858749
Train Accuracy is 0.8989547038327527	 Train Precision is 0.30526315789473685	 Train Recall is 0.017475143115396205
-----------
