In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('diabetes.csv')

In [5]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
data.shape

(768, 9)

In [8]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [10]:
#convert both the columns in one-hot encoding

data['Glucose'] = np.where(data['Glucose'] == 0, data['Glucose'].median(), data['Glucose'])
data['Insulin'] = np.where(data['Insulin'] == 0, data['Insulin'].median(), data['Insulin'])

In [11]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,30.5,33.6,0.627,50,1
1,1,85.0,66,29,30.5,26.6,0.351,31,0
2,8,183.0,64,0,30.5,23.3,0.672,32,1
3,1,89.0,66,23,94.0,28.1,0.167,21,0
4,0,137.0,40,35,168.0,43.1,2.288,33,1


In [12]:
data.dtypes

Pregnancies                   int64
Glucose                     float64
BloodPressure                 int64
SkinThickness                 int64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [13]:
X = data.iloc[:, :-1]
y = data.iloc[:,-1]

In [14]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35,30.5,33.6,0.627,50
1,1,85.0,66,29,30.5,26.6,0.351,31
2,8,183.0,64,0,30.5,23.3,0.672,32
3,1,89.0,66,23,94.0,28.1,0.167,21
4,0,137.0,40,35,168.0,43.1,2.288,33


In [15]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state = 42)

In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

In [18]:
rfc

RandomForestClassifier(n_estimators=10)

In [19]:
prediction = rfc.predict(X_test)

In [20]:
prediction

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
      dtype=int64)

In [21]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [22]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print("CONFUSION MATRIX : {}".format(confusion_matrix(y_test, prediction)))
print("CLASSIFICATION REPORT : {}".format(classification_report(y_test, prediction)))
print("ACCURACY SCORE : {}".format(accuracy_score(y_test, prediction)))

CONFUSION MATRIX : [[81 18]
 [25 30]]
CLASSIFICATION REPORT :               precision    recall  f1-score   support

           0       0.76      0.82      0.79        99
           1       0.62      0.55      0.58        55

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.69       154
weighted avg       0.71      0.72      0.72       154

ACCURACY SCORE : 0.7207792207792207


In [23]:
model = RandomForestClassifier(n_estimators=500, criterion = 'gini',
                              max_features = 'sqrt', min_samples_leaf=10, random_state = 100).fit(X_train, y_train)

In [24]:
prediction = model.predict(X_test)

In [25]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))
print(accuracy_score(y_test, prediction))

[[83 16]
 [21 34]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82        99
           1       0.68      0.62      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.76      0.76       154

0.7597402597402597


In [26]:
from sklearn.model_selection import RandomizedSearchCV

#no. of trees in random forest

n_estimators = [int(x) for x in np.linspace(start=200, stop = 2000, num = 10)]

# no. of features at every split

max_features = ['auto', 'sqrt', 'log2']

#max no. of levels in tree

max_depth = [int(x) for x in np.linspace(10, 1000, 10)]

#min no. of samples required to split the node 

min_samples_split = [1,2,3,4,5,7,9]

#min no. of samples required at each leaf node

min_samples_leaf = [1,2,4,6,8]

#create the random grid

random_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'criterion' : ['entropy', 'gini']
    
}

In [27]:
rf = RandomForestClassifier()
rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                          n_iter = 100, cv=3, verbose = 2, random_state = 100, n_jobs=-1)

In [28]:
rf_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 2, 3, 4, 5, 7,
                                                              9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   

In [29]:
rf_cv.best_params_

{'n_estimators': 1600,
 'min_samples_split': 7,
 'min_samples_leaf': 6,
 'max_features': 'log2',
 'max_depth': 1000,
 'criterion': 'gini'}

In [30]:
rf_cv_best_params = rf_cv.best_estimator_

In [31]:
y_pred = rf_cv_best_params.predict(X_test)

In [32]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[81 18]
 [17 38]]
              precision    recall  f1-score   support

           0       0.83      0.82      0.82        99
           1       0.68      0.69      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

0.7727272727272727


In [33]:
rf_cv_best_params

RandomForestClassifier(max_depth=1000, max_features='log2', min_samples_leaf=6,
                       min_samples_split=7, n_estimators=1600)

In [34]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_cv.best_params_['criterion']],
    'max_depth': [rf_cv.best_params_['max_depth']],
    'max_features': [rf_cv.best_params_['max_features']],
    'min_samples_leaf': [rf_cv.best_params_['min_samples_leaf'],
                         rf_cv.best_params_['min_samples_leaf'] + 2,
                         rf_cv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_cv.best_params_['min_samples_split'] - 2,
                          rf_cv.best_params_['min_samples_split'] - 1,
                          rf_cv.best_params_['min_samples_split'],
                          rf_cv.best_params_['min_samples_split'] + 1,
                          rf_cv.best_params_['min_samples_leaf'] + 2],
    'n_estimators': [rf_cv.best_params_['n_estimators'] - 200,
                    rf_cv.best_params_['n_estimators'] - 100,
                    rf_cv.best_params_['n_estimators'],
                    rf_cv.best_params_['n_estimators'] + 100,
                    rf_cv.best_params_['n_estimators'] + 200,
                    rf_cv.best_params_['n_estimators'] - 600,]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [1000], 'max_features': ['log2'], 'min_samples_leaf': [6, 8, 10], 'min_samples_split': [5, 6, 7, 8, 8], 'n_estimators': [1400, 1500, 1600, 1700, 1800, 1000]}


In [36]:
rf =  RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 90 candidates, totalling 900 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [1000],
                         'max_features': ['log2'],
                         'min_samples_leaf': [6, 8, 10],
                         'min_samples_split': [5, 6, 7, 8, 8],
                         'n_estimators': [1400, 1500, 1600, 1700, 1800, 1000]},
             verbose=2)

In [37]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=1000, max_features='log2', min_samples_leaf=8,
                       min_samples_split=6, n_estimators=1700)

In [38]:
best_grid = grid_search.best_estimator_

In [39]:
y_pred = best_grid.predict(X_test)
print("<-------------------Confusion metrics results is ------------->\n : {}".format(confusion_matrix(y_test, y_pred)))
print("<------------------Classification report is---------------> \n: {}".format(classification_report(y_test, y_pred)))
print("<------------------ Accuracy score----------------> : {}".format(accuracy_score(y_test, y_pred)))

<-------------------Confusion metrics results is ------------->
 : [[81 18]
 [17 38]]
<------------------Classification report is---------------> 
:               precision    recall  f1-score   support

           0       0.83      0.82      0.82        99
           1       0.68      0.69      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

<------------------ Accuracy score----------------> : 0.7727272727272727
