# Hyperparameter Optimization Techinique

### First Model Creation 

In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### load dataset

In [97]:
df=pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [98]:
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])
df['BloodPressure'] = np.where(df['BloodPressure']==0,df['BloodPressure'].median(),df['BloodPressure'])
df['BMI'] = np.where(df['BMI']==0,df['BMI'].median(),df['BMI'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148.0,72.0,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66.0,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64.0,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [99]:
x=df.drop('Class',axis=1)
y=df['Class']

In [100]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,30.5,33.6,0.627,50
1,1,85.0,66.0,29.0,30.5,26.6,0.351,31
2,8,183.0,64.0,23.0,30.5,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [101]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Class, dtype: int64

##### Perform train test split

In [102]:
from sklearn.model_selection import train_test_split,GridSearchCV
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=21)

In [103]:
from sklearn.ensemble import RandomForestClassifier
rf_cls = RandomForestClassifier(n_estimators=10).fit(x_train,y_train)
prediction = rf_cls.predict(x_test)

In [104]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[85  9]
 [36 24]]
0.7077922077922078
              precision    recall  f1-score   support

           0       0.70      0.90      0.79        94
           1       0.73      0.40      0.52        60

    accuracy                           0.71       154
   macro avg       0.71      0.65      0.65       154
weighted avg       0.71      0.71      0.68       154



#### The main parameter used by a RandomForestClassifier are :

- Criterion = used to evaluate quality of split
- max_depth = maximum number of levels allowed in each tree
- max_featured = maximum number of features considered when splitting node
- min_samples_leaf = minimum samples which  can be stored in a tree leaf
- min_samples_split = minimun samples necessary in a node to cause node splitting
- n_estimators = number of trees in ensemble

### -  Manual Hyperparameter Tuning
- selecting based on what i want without caring of model

In [105]:
rf_cls = RandomForestClassifier(n_estimators=200,criterion='gini',max_depth=6,
                                max_features=5,min_samples_leaf=3).fit(x_train,y_train)
prediction = rf_cls.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[83 11]
 [29 31]]
0.7402597402597403
              precision    recall  f1-score   support

           0       0.74      0.88      0.81        94
           1       0.74      0.52      0.61        60

    accuracy                           0.74       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.74      0.74      0.73       154



## Now, we use Hyperparameter tuning Methods 

##### (A,B,C,D)
- 1. GridSearchCV  -----> it check every area randomly(C,A,D,B)
- 2. RandomizedSearchCV -----> it predict some area (B,C)

for better result we use RandomizedSearchCV first then we apply GridSearchCV

### 1. RandomizedSearchCV

In [120]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(200,2000,10)]
max_features = ['auto','sqrt','log2']
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [1,2,3,4,5,6,7,8,9,10]
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10]

In [121]:
#Create random Grid
random_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'criterion':['entropy','gini']
              }
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'criterion': ['entropy', 'gini']}


In [122]:
rf = RandomForestClassifier()
rf_randomCV = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                                random_state=100,n_jobs=-1)
rf_randomCV.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
   

In [123]:
rf_randomCV.best_params_

{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 3,
 'max_features': 'sqrt',
 'max_depth': 450,
 'criterion': 'entropy'}

In [124]:
rf_randomCV_grid = rf_randomCV.best_estimator_

In [125]:
rf_randomCV_grid

RandomForestClassifier(criterion='entropy', max_depth=450, max_features='sqrt',
                       min_samples_leaf=3, min_samples_split=10,
                       n_estimators=1600)

In [126]:
prediction = rf_randomCV_grid.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[86  8]
 [28 32]]
0.7662337662337663
              precision    recall  f1-score   support

           0       0.75      0.91      0.83        94
           1       0.80      0.53      0.64        60

    accuracy                           0.77       154
   macro avg       0.78      0.72      0.73       154
weighted avg       0.77      0.77      0.75       154



###### Before Hyperparameter tuning accuracy is "0.7402" and after Hyperparameter tuning accuracy is "0.7532"

### 2. Now, apply GridSearchCV

In [127]:
rf_randomCV.best_params_

{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 3,
 'max_features': 'sqrt',
 'max_depth': 450,
 'criterion': 'entropy'}

In [129]:
param_grid = {
    'criterion':[rf_randomCV.best_params_['criterion']],
    'max_depth':[rf_randomCV.best_params_['max_depth']],
    'max_features':[rf_randomCV.best_params_['max_features']],
    'min_samples_leaf':[rf_randomCV.best_params_['min_samples_leaf'],
                       rf_randomCV.best_params_['min_samples_leaf']+2,
                       rf_randomCV.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_randomCV.best_params_['min_samples_split']-2,
                        rf_randomCV.best_params_['min_samples_split']-1,
                        rf_randomCV.best_params_['min_samples_split'],
                        rf_randomCV.best_params_['min_samples_split']+1,
                        rf_randomCV.best_params_['min_samples_split']+2],
    'n_estimators':[rf_randomCV.best_params_['n_estimators']-200,
                   rf_randomCV.best_params_['n_estimators']-100,
                   rf_randomCV.best_params_['n_estimators'],
                   rf_randomCV.best_params_['n_estimators']+100,
                   rf_randomCV.best_params_['n_estimators']+200],
}
print(param_grid)

{'criterion': ['entropy'], 'max_depth': [450], 'max_features': ['sqrt'], 'min_samples_leaf': [3, 5, 7], 'min_samples_split': [8, 9, 10, 11, 12], 'n_estimators': [1400, 1500, 1600, 1700, 1800]}



#### Fit GridSearchCV to data

In [130]:
rf =RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  7.8min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [450],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 5, 7],
                         'min_samples_split': [8, 9, 10, 11, 12],
                         'n_estimators': [1400, 1500, 1600, 1700, 1800]},
             verbose=2)

In [131]:
rf_grid = grid_search.best_estimator_

In [132]:
rf_grid

RandomForestClassifier(criterion='entropy', max_depth=450, max_features='sqrt',
                       min_samples_leaf=3, min_samples_split=8,
                       n_estimators=1600)

In [133]:
prediction = rf_grid.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[86  8]
 [29 31]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.75      0.91      0.82        94
           1       0.79      0.52      0.63        60

    accuracy                           0.76       154
   macro avg       0.77      0.72      0.72       154
weighted avg       0.77      0.76      0.75       154



## Result

#### Before Hyperparameter tuning accuracy is "0.71"
#### Manual Hyperparameter tuning accuracy is "0.74" 
#### RandomSearchCV tuning accuracy is "0.77" 
#### GribSearchCV accuracy is "0.77"

Thanks You ever