In [47]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, precision_score
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

In [3]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
X = df.drop('Outcome',axis=1).copy()
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
y = df['Outcome'].copy()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

## LightGBM

In [36]:
d_train=lgb.Dataset(X_train, label=y_train)

In [37]:
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='binary' #Binary target feature
params['metric']='auc' #metric for binary classification
params['max_depth']=10
params['num_leaves']=31
params['min_data_in_leaf']=20
params['max_depth']=-1

In [59]:
clf=lgb.train(params,d_train,100) #train the model on 100 epocs

[LightGBM] [Info] Number of positive: 217, number of negative: 397
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 663
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.353420 -> initscore=-0.604039
[LightGBM] [Info] Start training from score -0.604039


In [39]:
y_pred=clf.predict(X_test).round(0).astype(int)

In [40]:
y_pred

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0])

In [41]:
precision_score(y_pred,y_test,average=None).mean()

0.7361507709880069

In [42]:
confusion_mat = confusion_matrix(y_test, y_pred, labels=[1,0])
confusion_mat

array([[33, 18],
       [18, 85]], dtype=int64)

In [43]:
TP = confusion_mat[0,0]; TN=confusion_mat[1,1]; FN = confusion_mat[1,0]; FP = confusion_mat[0,1]

In [44]:
sensitivity = TP/(TP+FN)
print(f"{round(sensitivity*100,2)}% of the cases where the patient was diabetic and was correctly identified as diabetic")

64.71% of the cases where the patient was diabetic and was correctly identified as diabetic


In [45]:
specificity = TN/(TN+FP)
print(f"{round(specificity*100,2)}% of the cases without diabetes that were correctly identified")

82.52% of the cases without diabetes that were correctly identified


## Tuning hyper-parameters

In [51]:
params

{'learning_rate': 0.03,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'max_depth': -1,
 'num_leaves': 31,
 'min_data_in_leaf': 20}

In [122]:
tune_params = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'max_depth': [-1,1,3],
    'num_leaves': [300, 400, 500, 800],
    'min_data_in_leaf': [3,4, 5]
}
tune_params

{'learning_rate': [0.001, 0.01, 0.05, 0.1],
 'max_depth': [-1, 1, 3],
 'num_leaves': [300, 400, 500, 800],
 'min_data_in_leaf': [3, 4, 5]}

In [123]:
{'num_leaves': 400, 'min_data_in_leaf': 4, 'max_depth': -1, 'learning_rate': 0.05}

{'num_leaves': 400,
 'min_data_in_leaf': 4,
 'max_depth': -1,
 'learning_rate': 0.05}

In [124]:
clf2 = lgb.LGBMClassifier(max_depth=-1, verbose=-1, silent = True, n_jobs=4, n_estimators= 5000)

In [137]:
gs = RandomizedSearchCV(estimator=clf2, param_distributions= tune_params, n_iter=10, scoring='accuracy', cv=4, verbose=True) 

In [138]:
gs.fit(X_train, y_train)
print(f"Best score {gs.best_score_} params: {gs.best_params_}")

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  3.0min finished
Best score 0.7377238774297599 params: {'num_leaves': 400, 'min_data_in_leaf': 3, 'max_depth': -1, 'learning_rate': 0.1}


In [139]:
final_y_pred = gs.predict(X_test)

In [140]:
precision_score(final_y_pred,y_test,average=None).mean()

0.7213972967827907

In [141]:
confusion_mat2 = confusion_matrix(y_test, final_y_pred, labels=[1,0])
confusion_mat2

array([[31, 20],
       [17, 86]], dtype=int64)

In [142]:
TP = confusion_mat2[0,0]; TN=confusion_mat2[1,1]; FN = confusion_mat2[1,0]; FP = confusion_mat2[0,1]
sensitivity = TP/(TP+FN)
print(f"{round(sensitivity*100,2)}% of the cases where the patient was diabetic and was correctly identified as diabetic")
specificity = TN/(TN+FP)
print(f"{round(specificity*100,2)}% of the cases without diabetes that were correctly identified")

64.58% of the cases where the patient was diabetic and was correctly identified as diabetic
81.13% of the cases without diabetes that were correctly identified
