RF Hyperparameters


In [27]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [28]:
df=pd.read_csv('heart1.csv')

In [29]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [42]:
df.shape

(303, 14)

In [43]:
X=df.iloc[:,0:13]
y=df.iloc[:,-1]

In [44]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [45]:
print(x_train.shape)
print(x_test.shape)

(242, 13)
(61, 13)


In [46]:
rf=RandomForestClassifier()
lr=LogisticRegression()
svc=SVC()
gb=GradientBoostingClassifier()

In [47]:
rf.fit(x_train,y_train)

RandomForestClassifier()

In [48]:
y_pred=rf.predict(x_test)

In [49]:
accuracy_score(y_test,y_pred)

0.8524590163934426

In [50]:
lr.fit(x_train,y_train)
y_pred1=lr.predict(x_test)
accuracy_score(y_test,y_pred1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [51]:
gb.fit(x_train,y_train)
y_pred2=gb.predict(x_test)
accuracy_score(y_test,y_pred2)

0.7704918032786885

In [52]:
svc.fit(x_train,y_train)
y_pred3=svc.predict(x_test)
accuracy_score(y_test,y_pred3)

0.7049180327868853

Now, Let's tune the random forest by passing hyperparameters

In [53]:
rf=RandomForestClassifier(max_samples=0.75,random_state=42)
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.9016393442622951

# Grid_Search CV

In [54]:
# Number of trees in random forest
n_estimators=[20,60,100,120]

#Number of features to consider at every split
max_features =[0.2,0.6,1.0]

#Maximum number of levels in tree
max_depth=[2,8,None]

#Number of samples
max_samples=[0.5,0.75,1.0]

In [55]:
param_grid={'n_estimators':n_estimators,
           'max_features':max_features,
           'max_depth':max_depth,
           'max_samples':max_samples
           }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [56]:
rf=RandomForestClassifier()

In [57]:
from sklearn.model_selection import GridSearchCV

rf_grid=GridSearchCV(estimator=rf,
                    param_grid=param_grid,
                    cv=5,
                    verbose=2,
                    n_jobs=-1)

In [58]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


 0.81802721 0.81819728        nan        nan        nan        nan
 0.79319728 0.81394558 0.81394558 0.82636054 0.80552721 0.79328231
 0.80561224 0.81819728        nan        nan        nan        nan
 0.78086735 0.80153061 0.80595238 0.80986395 0.79770408 0.77678571
 0.78920068 0.80986395        nan        nan        nan        nan
 0.82227891 0.81794218 0.81411565 0.81403061 0.83886054 0.81377551
 0.82219388 0.80161565        nan        nan        nan        nan
 0.79744898 0.80569728 0.81811224 0.80569728 0.78511905 0.80161565
 0.78928571 0.80569728        nan        nan        nan        nan
 0.79311224 0.79345238 0.80569728 0.81411565 0.78545918 0.79744898
 0.78511905 0.79336735        nan        nan        nan        nan
 0.80144558 0.79744898 0.81394558 0.82219388 0.80986395 0.82219388
 0.81377551 0.80986395        nan        nan        nan        nan
 0.80586735 0.82219388 0.80569728 0.80561224 0.79328231 0.79736395
 0.79744898 0.80178571        nan        nan        nan       

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 8, None],
                         'max_features': [0.2, 0.6, 1.0],
                         'max_samples': [0.5, 0.75, 1.0],
                         'n_estimators': [20, 60, 100, 120]},
             verbose=2)

In [60]:
rf_grid.best_params_

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 20}

In [61]:
rf_grid.best_score_

0.8388605442176871

# RandomSearchCV

In [71]:
# Number of trees in random forest
n_estimators=[20,60,100,120]

#Number of features to consider at every split
max_features =[0.2,0.6,1.0]

#Maximum number of levels in tree
max_depth=[2,8,None]

#Number of samples
max_samples=[0.5,0.75,1.0]

#Bootstrap samples
bootstrap=[True,False]

#Minimum number of samples required to split a node
min_samples_split=[2,5]

#Minimum number of samples required at each leaf node
min_samples_leaf=[1,2]

In [72]:
param_grid={'n_estimators':n_estimators,
           'max_features':max_features,
           'max_depth':max_depth,
           'max_samples':max_samples,
            'bootstrap':bootstrap,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf
           }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [73]:
from sklearn.model_selection import RandomizedSearchCV

In [74]:
rf_grid=RandomizedSearchCV(estimator=rf,
                          param_distributions=param_grid,
                          cv=5,verbose=2,n_jobs=-1)

In [75]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 0.78095238 0.75612245        nan        nan]


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 8, None],
                                        'max_features': [0.2, 0.6, 1.0],
                                        'max_samples': [0.5, 0.75, 1.0],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [20, 60, 100, 120]},
                   verbose=2)

In [76]:
rf_grid.best_params_

{'n_estimators': 120,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 0.75,
 'max_features': 1.0,
 'max_depth': 2,
 'bootstrap': True}

In [77]:
rf_grid.best_score_

0.8181972789115646