# Imports

In [1]:
import numpy as np
import pandas as pd
import pickle


# ML liberies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv("diabetes_binary.csv")
df.head(5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# Dropping Duplicates
df.drop_duplicates(inplace = True)

df.shape

(229474, 22)

In [4]:
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Selection

In [5]:
f_score = chi2(X,y)
f_score

(array([8.09854824e+03, 4.86931274e+03, 4.89041400e+01, 1.55077362e+04,
        2.53826098e+02, 2.15667838e+03, 5.82214570e+03, 6.17563886e+02,
        5.46888971e+01, 8.20988463e+01, 9.37401148e+02, 7.94973139e+00,
        8.36628298e+01, 7.67173283e+03, 1.14195847e+04, 9.79887617e+04,
        7.87549618e+03, 1.37837135e+02, 8.53990634e+03, 4.79112939e+02,
        3.37709926e+03]),
 array([0.00000000e+000, 0.00000000e+000, 2.68782813e-012, 0.00000000e+000,
        3.80487746e-057, 0.00000000e+000, 0.00000000e+000, 2.53282236e-136,
        1.41201767e-013, 1.29448151e-019, 7.26845737e-206, 4.80945144e-003,
        5.86782002e-020, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 7.91057145e-032, 0.00000000e+000, 3.33247161e-106,
        0.00000000e+000]))

In [6]:
p_val = pd.Series(f_score[1],index=X_train.columns)
p_val.sort_values(ascending = True , inplace=True)
p_val

HighBP                   0.000000e+00
Age                      0.000000e+00
DiffWalk                 0.000000e+00
PhysHlth                 0.000000e+00
MentHlth                 0.000000e+00
GenHlth                  0.000000e+00
Income                   0.000000e+00
Stroke                   0.000000e+00
BMI                      0.000000e+00
HighChol                 0.000000e+00
HeartDiseaseorAttack     0.000000e+00
HvyAlcoholConsump       7.268457e-206
PhysActivity            2.532822e-136
Education               3.332472e-106
Smoker                   3.804877e-57
Sex                      7.910571e-32
NoDocbcCost              5.867820e-20
Veggies                  1.294482e-19
Fruits                   1.412018e-13
CholCheck                2.687828e-12
AnyHealthcare            4.809451e-03
dtype: float64

# Model Selection

In [7]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
accu_list= []

algo = [ LogisticRegression(),
        RandomForestClassifier(max_depth=5,n_estimators=1000, class_weight='balanced'),
        KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2),
        GaussianNB(),
        ExtraTreeClassifier(max_depth=5),
        DecisionTreeClassifier(max_depth=5),
        LGBMClassifier(n_estimators=500),
        XGBClassifier(max_depth=4, n_estimators=500)]

names = ['Logistic Regression','Random Forest Classifier',
        'K Neighbors Classifier','Guassian','ExtraTree','Decision_Tree','Lightgbm', 'Xgboost']



for name in algo:
    model = name
    model.fit(X_train,y_train)
    y_pred = model.predict_proba(X_test)[:,1]
    score=roc_auc_score(y_test,y_pred)
    accu_list.append(score)

In [9]:
evaluation = pd.DataFrame({'Model': names,
                           'Score': accu_list})

evaluation = evaluation.sort_values(by='Score', ascending=False)
evaluation

Unnamed: 0,Model,Score
6,Lightgbm,0.816432
7,Xgboost,0.81372
0,Logistic Regression,0.809601
1,Random Forest Classifier,0.805465
5,Decision_Tree,0.790524
3,Guassian,0.769968
4,ExtraTree,0.715666
2,K Neighbors Classifier,0.711613


In [10]:
for name in algo:
    pred = name.predict(X_test)
    print(name," accuracy is : ",accuracy_score(y_test,pred))
    print(name,"confusion matrix-")
    print(confusion_matrix(y_test,pred))
    print()

LogisticRegression()  accuracy is :  0.8502501350903798
LogisticRegression() confusion matrix-
[[47473  1106]
 [ 7485  1305]]

RandomForestClassifier(class_weight='balanced', max_depth=5, n_estimators=1000)  accuracy is :  0.7034286809949625
RandomForestClassifier(class_weight='balanced', max_depth=5, n_estimators=1000) confusion matrix-
[[33587 14992]
 [ 2022  6768]]

KNeighborsClassifier()  accuracy is :  0.8318429814011051
KNeighborsClassifier() confusion matrix-
[[45900  2679]
 [ 6968  1822]]

GaussianNB()  accuracy is :  0.7555299900643204
GaussianNB() confusion matrix-
[[38268 10311]
 [ 3714  5076]]

ExtraTreeClassifier(max_depth=5)  accuracy is :  0.8467813627568896
ExtraTreeClassifier(max_depth=5) confusion matrix-
[[48579     0]
 [ 8790     0]]

DecisionTreeClassifier(max_depth=5)  accuracy is :  0.851104254911189
DecisionTreeClassifier(max_depth=5) confusion matrix-
[[47988   591]
 [ 7951   839]]

LGBMClassifier(n_estimators=500)  accuracy is :  0.8527427704858024
LGBMClassif

**lightgbm** has the roc score .81 and accuracy score .85 so choosing lightgbm as the most accurate classifire to pickle.