In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('C:\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
import numpy as np
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [4]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [5]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [6]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [8]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction = classifier.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[92 15]
 [27 20]]
0.7272727272727273
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       107
           1       0.57      0.43      0.49        47

    accuracy                           0.73       154
   macro avg       0.67      0.64      0.65       154
weighted avg       0.71      0.73      0.71       154



In [10]:
### Manual Hyperparameter Tuning
model=RandomForestClassifier(n_estimators=300,criterion='gini',
                             max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[92 15]
 [27 20]]
0.7272727272727273
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       107
           1       0.57      0.43      0.49        47

    accuracy                           0.73       154
   macro avg       0.67      0.64      0.65       154
weighted avg       0.71      0.73      0.71       154



In [11]:
##Randomized search CV

In [12]:
import numpy as n
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ["auto", "sqrt", "log2"]
max_depth = [int(x) for x in np.linspace(100,1000,10)]
min_samples_split = [1,3,4,5,7,9]
min_samples_leaf = [1,2,4,6,8]
random_grid = {'n_estimators' : n_estimators,
               'max_features' : max_features,
               'max_depth' : max_depth,
               'min_samples_split' : min_samples_split,
               'min_samples_leaf' : min_samples_leaf,
               'criterion' : ['entropy', 'gini']
}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [14]:
rf = RandomForestClassifier()
rf_randomCV = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=100,n_jobs=-1)
rf_randomCV.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [100, 200, 300, 400, 500,
                                                      600, 700, 800, 900,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 3, 4, 5, 7, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [15]:
rf_randomCV.best_params_

{'n_estimators': 1600,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 500,
 'criterion': 'entropy'}

In [17]:
best_random_grid = rf_randomCV.best_estimator_

In [20]:
rf_randomCV.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=500, max_features='log2',
                       min_samples_split=4, n_estimators=1600)

In [19]:
from sklearn.metrics import accuracy_score
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[94 13]
 [13 34]]
Accuracy Score 0.8311688311688312
Classification report:               precision    recall  f1-score   support

           0       0.88      0.88      0.88       107
           1       0.72      0.72      0.72        47

    accuracy                           0.83       154
   macro avg       0.80      0.80      0.80       154
weighted avg       0.83      0.83      0.83       154



In [21]:
## GridSearchCV

In [22]:
rf_randomCV.best_params_

{'n_estimators': 1600,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 500,
 'criterion': 'entropy'}

In [23]:
from sklearn.model_selection import GridSearchCV

In [28]:


param_grid = {
'criterion': [rf_randomCV.best_params_['criterion']],
    'max_depth': [rf_randomCV.best_params_['max_depth']],
    'max_features': [rf_randomCV.best_params_['max_features']],
    'min_samples_leaf': [rf_randomCV.best_params_['min_samples_leaf'], 
                         rf_randomCV.best_params_['min_samples_leaf']+2, 
                         rf_randomCV.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomCV.best_params_['min_samples_split'] - 2,
                          rf_randomCV.best_params_['min_samples_split'] - 1,
                          rf_randomCV.best_params_['min_samples_split'], 
                          rf_randomCV.best_params_['min_samples_split'] +1,
                          rf_randomCV.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomCV.best_params_['n_estimators'] - 200, rf_randomCV.best_params_['n_estimators'] - 100, 
                     rf_randomCV.best_params_['n_estimators'], 
                     rf_randomCV.best_params_['n_estimators'] + 100, rf_randomCV.best_params_['n_estimators'] + 200]
}
print(param_grid)


{'criterion': ['entropy'], 'max_depth': [500], 'max_features': ['log2'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [2, 3, 4, 5, 6], 'n_estimators': [1400, 1500, 1600, 1700, 1800]}


In [29]:

#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed: 19.1min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [500],
                         'max_features': ['log2'],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 3, 4, 5, 6],
                         'n_estimators': [1400, 1500, 1600, 1700, 1800]},
             verbose=2)

In [30]:
grid_search.best_estimator_


RandomForestClassifier(criterion='entropy', max_depth=500, max_features='log2',
                       min_samples_leaf=5, min_samples_split=3,
                       n_estimators=1500)

In [31]:
best_grid=grid_search.best_estimator_


In [32]:
best_grid

RandomForestClassifier(criterion='entropy', max_depth=500, max_features='log2',
                       min_samples_leaf=5, min_samples_split=3,
                       n_estimators=1500)

In [33]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[97 10]
 [12 35]]
Accuracy Score 0.8571428571428571
Classification report:               precision    recall  f1-score   support

           0       0.89      0.91      0.90       107
           1       0.78      0.74      0.76        47

    accuracy                           0.86       154
   macro avg       0.83      0.83      0.83       154
weighted avg       0.86      0.86      0.86       154

