In [11]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, r2_score

In [12]:
df = pd.read_csv('heart.csv')

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [15]:
print(df.shape)

(303, 14)


In [16]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

With Default Parameters

In [22]:
model = RandomForestClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")

Accuracy Score : 0.8688524590163934


With HyperParameter Tuning

In [30]:
model = RandomForestClassifier(max_samples=0.75, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")

Accuracy Score : 0.9016393442622951


Grid Search CV

In [42]:
# Number of trees in random Forest
n_estimators = [20, 50, 100, 150, 200, 300]

# Number of features to consider at every split
max_features = [0.1, 0.2, 0.5, 0.8, 1.0]

# Maximum number if levels in decision tree
max_depth = [2, 8, 10, 15, 20, None]

# Number of samples
max_samples = [0.4, 0.5, 0.75, 0.8, 1.0]

In [43]:
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'max_samples' : max_samples
}

In [44]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

model_grid = GridSearchCV(estimator = model, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

model_grid.fit(X_train, y_train)

y_pred = model_grid.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Accuracy Score : 0.8852459016393442


In [45]:
model_grid.best_params_

{'max_depth': 15, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 200}

In [46]:
model_grid.best_score_

0.8510204081632653

Random Search CV

In [47]:
# Number of trees in random Forest
n_estimators = [20, 60, 100, 120]

# Number of features to consider at every split
max_features = [0.2, 0.4, 0.5, 0.6, 0.8, 1]

# Maximum number if levels in decision tree
max_depth = [2, 8, 10, None]

# Number of samples
max_samples = [0.4, 0.5, 0.75, 1.0]

# BootStrap Samples
bootstrap = [True, False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [48]:
param_grid = {
    'n_estimators' : n_estimators,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'max_features' : max_features,
    'bootstrap' : bootstrap,
    'max_samples' : max_samples
}

In [49]:
from sklearn.model_selection import RandomizedSearchCV

model_grid = RandomizedSearchCV(estimator = model, param_distributions=param_grid, cv=5, verbose=2, n_jobs=-1)

model_grid.fit(X_train, y_train)

y_pred = model_grid.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy Score : 0.8524590163934426


45 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\sklearn\ensemble\_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

        nan        nan        nan        nan]


In [50]:
model_grid.best_params_

{'n_estimators': 20,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 1.0,
 'max_features': 1,
 'max_depth': 8,
 'bootstrap': True}

In [51]:
model_grid.best_score_

0.8388605442176871