In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [23]:
# Read the heart-disease table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='heart.csv')
# Preview the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [24]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs. Every column except 'target' is a feature.
xtrain, xtest, ytrain, ytest = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.3,
    random_state=42,
)

In [25]:
# Baseline forest with default hyperparameters. It is seeded like the other
# forest in this notebook so the accuracy reported below is reproducible
# under Restart & Run All (an unseeded forest gives a different score each run).
rf = RandomForestClassifier(random_state=42)
rf.fit(xtrain, ytrain)

In [26]:
# Score the fitted forest on the held-out rows: predict labels, then report
# the fraction that match the true labels as the cell output.
ypred = rf.predict(X=xtest)
accuracy_score(y_true=ytest, y_pred=ypred)

0.8131868131868132

In [27]:
# Second forest: same seed, but max_samples=0.75 limits the rows drawn for
# each tree to 75% of the training set (per the scikit-learn docs for a float
# max_samples with bootstrapping enabled, which is the default).
rf = RandomForestClassifier(
    random_state=42,
    max_samples=0.75,
)
rf.fit(X=xtrain, y=ytrain)

In [28]:
# Re-evaluate with whichever forest `rf` currently refers to: predict on the
# held-out rows and display the test accuracy.
ypred = rf.predict(X=xtest)
accuracy_score(y_true=ytest, y_pred=ypred)

0.8351648351648352

## Grid Search CV

In [29]:
# --- Candidate values for the exhaustive grid search ---

# Forest sizes (number of trees) to try.
n_estimators = [20, 60, 100, 120]

# Share of the features examined at each split (floats are fractions).
max_features = [0.2, 0.6, 1.0]

# Depth limit per tree; None leaves the depth unrestricted.
max_depth = [2, 8, None]

# Share of the training rows drawn for each tree (floats are fractions).
max_samples = [0.5, 0.75, 1.0]

In [30]:
from sklearn.model_selection import GridSearchCV

# Search space: GridSearchCV evaluates every combination of these lists
# (4 * 3 * 3 * 3 = 108 candidates, each scored with 5-fold cross-validation).
parameters = {
    'n_estimators': n_estimators,
    'max_samples': max_samples,
    'max_depth': max_depth,
    'max_features': max_features,
}

# The forest is seeded so repeated runs pick the same best_params_ and
# best_score_; with an unseeded estimator the winner drifts between runs.
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    parameters,
    cv=5,
)

In [31]:
# Run the exhaustive search on the training split only; the test split stays
# untouched so it can still serve as an unbiased final check.
search.fit(X=xtrain, y=ytrain)

In [32]:
# Display the hyperparameter combination that achieved the best mean
# cross-validated score in the grid search.
search.best_params_

{'max_depth': None,
 'max_features': 0.2,
 'max_samples': 0.75,
 'n_estimators': 20}

In [33]:
# Mean cross-validated score of the best combination. No `scoring` was passed
# to GridSearchCV, so this uses the classifier's default scorer.
# NOTE(review): this is a CV score on the training split, not test-set accuracy.
search.best_score_

0.8442967884828351

## Randomized Search CV

In [34]:
# --- Candidate values for the randomized search ---
# The first four lists repeat the grid-search values; the last two are new.

# Forest sizes (number of trees) to try.
n_estimators = [20, 60, 100, 120]

# Share of the features examined at each split (floats are fractions).
max_features = [0.2, 0.6, 1.0]

# Depth limit per tree; None leaves the depth unrestricted.
max_depth = [2, 8, None]

# Share of the training rows drawn for each tree (floats are fractions).
max_samples = [0.5, 0.75, 1.0]

# Smallest node size (in samples) that may still be split further.
min_samples_split = [2, 5]

# Smallest number of samples allowed in a leaf.
min_samples_leaf = [1, 2]

In [35]:
# Collect the candidate lists under the estimator's argument names; the
# randomized search samples combinations from this mapping.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over param_grid with 5-fold cross-validation.
# - A fresh seeded forest is the template, so this cell no longer depends on
#   whichever model an earlier cell last bound to `rf`.
# - random_state on the search fixes which candidates are sampled; without it
#   a different set of combinations is tried on every run.
# - n_iter=10 spells out the default number of sampled candidates.
rf_grid = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    verbose=2,   # log each fit
    n_jobs=-1,   # use all available cores
    random_state=42,
)

In [37]:
# Run the randomized search on the training split only.
rf_grid.fit(X=xtrain, y=ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [38]:
# Display the sampled hyperparameter combination with the best mean
# cross-validated score.
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 0.75,
 'max_features': 0.2,
 'max_depth': 8}

In [39]:
# Mean cross-validated score of the best sampled combination. No `scoring`
# was passed, so this uses the classifier's default scorer.
# NOTE(review): this is a CV score on the training split, not test-set accuracy.
rf_grid.best_score_

0.839424141749723