In [48]:
import pandas as pd
df=pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
import numpy as np
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [50]:
#### Independent And Dependent features
x=df.drop('Outcome',axis=1)
y=df['Outcome']

In [51]:
#### Train Test Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)

In [52]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [53]:
n_estimators=[int(x) for x in np.linspace(200,2000,10)]

In [54]:
max_features=['auto','sqrt','log2']

In [55]:
max_depth=[int(x) for x in np.linspace(10,1000,10)]

In [56]:
min_samples_split=[1,3,4,5,7,9]

In [57]:
min_samples_leaf=[1,2,4,6,8]

In [58]:
random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'criterion':['entropy','gini']
}

In [59]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_split': [1, 3, 4, 5, 7, 9],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'criterion': ['entropy', 'gini']}

In [60]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,\
                           random_state=100,n_jobs=-1)
randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.7min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 3, 4, 5, 7, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [61]:
randomcv.best_params_

{'n_estimators': 400,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 890,
 'criterion': 'entropy'}

In [62]:
R=randomcv.best_estimator_

In [63]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
y_pred=R.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.8376623376623377
[[95 12]
 [13 34]]
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       107
           1       0.74      0.72      0.73        47

    accuracy                           0.84       154
   macro avg       0.81      0.81      0.81       154
weighted avg       0.84      0.84      0.84       154



# grid search cv

In [64]:
randomcv.best_params_

{'n_estimators': 400,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 890,
 'criterion': 'entropy'}

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [randomcv.best_params_['criterion']],
    'max_depth': [randomcv.best_params_['max_depth']],
    'max_features': [randomcv.best_params_['max_features']],
    'min_samples_leaf': [randomcv.best_params_['min_samples_leaf'], 
                         randomcv.best_params_['min_samples_leaf']+2, 
                         randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [randomcv.best_params_['min_samples_split'] - 2,
                          randomcv.best_params_['min_samples_split'] - 1,
                          randomcv.best_params_['min_samples_split'], 
                          randomcv.best_params_['min_samples_split'] +1,
                          randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [randomcv.best_params_['n_estimators'] - 200, randomcv.best_params_['n_estimators'] - 100, 
                     randomcv.best_params_['n_estimators'], 
                     randomcv.best_params_['n_estimators'] + 100, randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['entropy'], 'max_depth': [890], 'max_features': ['auto'], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [2, 3, 4, 5, 6], 'n_estimators': [200, 300, 400, 500, 600]}


In [81]:
#### Fit the grid_search to the data
r=RandomForestClassifier()
gridcv=GridSearchCV(estimator=r,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
gridcv.fit(x_train,y_train)


Fitting 3 folds for each of 75 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:  1.6min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [890],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 3, 4, 5, 6],
                         'n_estimators': [200, 300, 400, 500, 600]},
             verbose=2)

In [85]:
gridcv.best_params_

{'criterion': 'entropy',
 'max_depth': 890,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 300}

In [86]:
grid=gridcv.best_estimator_
grid

RandomForestClassifier(criterion='entropy', max_depth=890, min_samples_split=3,
                       n_estimators=300)

In [87]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
yp=grid.predict(x_test)
print(accuracy_score(y_test,yp))
print(confusion_matrix(y_test,yp))
print(classification_report(y_test,yp
                           ))

0.8246753246753247
[[95 12]
 [15 32]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       107
           1       0.73      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.80      0.78      0.79       154
weighted avg       0.82      0.82      0.82       154



# Automated hyperparameter tuning

In [91]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [93]:

! pip install tenserflow

ERROR: Could not find a version that satisfies the requirement tenserflow (from versions: none)
ERROR: No matching distribution found for tenserflow
