In [1]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, precision_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from evolutionary_search import EvolutionaryAlgorithmSearchCV

In [2]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Feature selection has selected 5 features

In [3]:
X = df.drop('Outcome',axis=1).copy()
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
y = df['Outcome'].copy()

In [5]:
voting_classifier = joblib.load('voting_classifier')

In [6]:
voting_classifier.predict(X.iloc[1].values.reshape(1,-1))

array([0], dtype=int64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [8]:
voting_classifier.score(X_test, y_test)

0.8181818181818182

## LightGBM

In [9]:
d_train=lgb.Dataset(X_train, label=y_train)

In [10]:
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='binary' #Binary target feature
params['metric']='binary_logloss' #metric for binary classification
params['max_depth']=10

In [11]:
clf=lgb.train(params,d_train,100) #train the model on 100 epocs

[LightGBM] [Info] Number of positive: 205, number of negative: 409
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333876 -> initscore=-0.690705
[LightGBM] [Info] Start training from score -0.690705


In [12]:
y_pred=clf.predict(X_test).round(0).astype(int)

In [13]:
y_pred

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0])

In [14]:
precision_score(y_pred,y_test,average=None).mean()

0.7356532356532357

In [15]:
confusion_mat = confusion_matrix(y_test, y_pred, labels=[1,0])
confusion_mat

array([[38, 25],
       [12, 79]], dtype=int64)

In [16]:
TP = confusion_mat[0,0]; TN=confusion_mat[1,1]; FN = confusion_mat[1,0]; FP = confusion_mat[0,1]

In [17]:
sensitivity = TP/(TP+FN)
print(f"{round(sensitivity*100,2)}% of the cases where the patient was diabetic and was correctly identified as diabetic")

76.0% of the cases where the patient was diabetic and was correctly identified as diabetic


In [18]:
specificity = TN/(TN+FP)
print(f"{round(specificity*100,2)}% of the cases without diabetes that were correctly identified")

75.96% of the cases without diabetes that were correctly identified


## Using evolutionary algorithm to optimise hyper-parameters

Previouslt used parameters:

In [19]:
params

{'learning_rate': 0.03,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'binary_logloss',
 'max_depth': 10}

New parameters to optimise

In [None]:
paramgrid = {'boosting': ['gbdt', 'dart', 'goss', 'rf'],
             'num_leaves': np.linspace(1, 100, num=5),
             'learning_rate': np.linspace(.0001, .1, num=3),
             'n_estimators': [50, 100, 200],
             'objective': ['binary'],
             'is_unbalance': [True],
             'min_data_in_leaf': np.linspace(20, 30, num=5).astype(int),
             'metric': ['auc'],
             'bagging_fraction': np.linspace(0, 0.9, num=5),
             'bagging_freq': [2],  
             'silent': [True],    
             'zero_as_missing': [True, False],} 

In [21]:
cv = EvolutionaryAlgorithmSearchCV(estimator=lgb.LGBMClassifier(), params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=2,
                                   population_size=10,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)

In [None]:
# cv.fit(X,y)