In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv('/content/drive/MyDrive/RandomForest/heart.csv')

In [None]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
df.shape

(303, 14)

In [None]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)

(242, 13)
(61, 13)


In [None]:
rf = RandomForestClassifier(max_samples=0.75, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.9016393442622951

Cross Validation Score

In [None]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(max_samples=0.75), X, y, cv=10,scoring='accuracy' ))

0.8312903225806452

### **GridSearchCV** to find optimal hyperparameters

In [None]:
# Number of trees in random forest
n_estimators = [20, 60, 100, 120]

# Number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# Maximum number of levels in tree
max_depth = [2, 8, None]

# Number of samples (data rows)
max_samples = [0.5, 0.75, 1.0]

In [None]:
# Parameter Grid
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples
   }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [None]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator = rf, # the algo to be used
                       param_grid = param_grid, # the parameters to be tested
                       cv = 5, # traim each rf 5 times for cross validation
                       verbose = 2, #output would be shown at each train
                       n_jobs = -1) # Use all cores of my machine

In [None]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
rf_grid.best_params_ # the best parameters to use amongs those given

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 20}

In [None]:
rf_grid.best_score_ # The highest accuracy achieved

0.8386904761904763

### **RandomizedSearchCV**

Used Only when your dataset is huge. It will randomly select the hyperparamters given and test them and give out the best result obtained amongst different combinations used.

In [None]:
# Number of trees in random forest
n_estimators = [20, 60, 100, 120]

# Number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# Maximum number of levels in tree
max_depth = [2, 8, None]

# Number of samples (data rows)
max_samples = [0.5, 0.75, 1.0]

# Bootstrap samples
bootstrap = [True, False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [None]:
# Parameter Grid
param_distributions = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples,
    'bootstrap': bootstrap,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
   }

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestClassifier()
rf_ran_grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions,
                       cv = 5, verbose = 2, n_jobs = -1)

In [None]:
rf_ran_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

 0.79753401        nan 0.80986395 0.80170068]


In [None]:
rf_ran_grid.best_params_

{'n_estimators': 120,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 0.75,
 'max_features': 0.2,
 'max_depth': 2,
 'bootstrap': True}

In [None]:
rf_ran_grid.best_score_

0.8385204081632652

### Now Let's train the model with these hyperparamters

Also make use of OOB evaluation, using the left out samples to cross validate model

In [64]:
rf = RandomForestClassifier(n_estimators=120,max_features=0.2, max_samples=0.75,max_depth=2, bootstrap=True, min_samples_leaf=1, min_samples_split=2, oob_score=True)

In [65]:
rf.fit(X_train, y_train)

In [67]:
rf.oob_score_

0.8181818181818182

In [68]:
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8852459016393442