In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [94]:
df= pd.read_csv('https://github.com/campusx-official/100-days-of-machine-learning/raw/main/day65-random-forest/heart.csv')

In [95]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [96]:
df.shape

(303, 14)

In [97]:
X=df.iloc[:,0:13]
y= df.iloc[:,-1]

In [98]:
import warnings
warnings.filterwarnings("ignore")

In [99]:
X_train, X_test, y_train, y_test= train_test_split(X,y, random_state= 42, test_size=0.2)

In [100]:
print(X_train.shape)
print(X_test.shape)

(242, 13)
(61, 13)


In [101]:
rf= RandomForestClassifier(max_samples=0.75, random_state=42)
gb= GradientBoostingClassifier()
svc= SVC()
lr= LogisticRegression()

In [102]:
rf.fit(X_train, y_train)
y_pred= rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9016393442622951

In [103]:
gb.fit(X_train, y_train)
y_pred= gb.predict(X_test)
accuracy_score(y_test, y_pred)

0.7704918032786885

In [104]:
svc.fit(X_train, y_train)
y_pred= svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7049180327868853

In [105]:
lr.fit(X_train, y_train)
y_pred= lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.8852459016393442

In [106]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(max_samples=0.75), X,y, cv=10, scoring='accuracy'))

0.8381720430107527

**Grid Search CV**

Use Hyperparameter Tuning in Random Forest to further enhance the model

In [107]:
# No of trees in random forest
n_estimators= [20,60,100,120]

# Number of features to consider at every split
max_features= [0.2,0.6,1.0]

# Max number of levels in Tree
max_depth= [2,8, None]

# Number of Samples
max_samples= [0.5,0.75,1.0]


# 108 Different combinations would be tried out

In [108]:
param_grid= {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples


}

In [109]:
rf= RandomForestClassifier()

In [110]:
from sklearn.model_selection import GridSearchCV

rf_grid= GridSearchCV(estimator=rf,
                      param_grid=param_grid,
                      cv=5,
                      verbose=2,
                      n_jobs=-1
                      )

In [111]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [112]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 60}

In [113]:
rf_grid.best_score_

0.8346088435374149

**RandomSearch CV**

In [123]:
# No of trees in random forest
n_estimators= [20,60,100,120]

# Number of features to consider at every split
max_features= [0.2,0.6,1.0]

# Max number of levels in Tree
max_depth= [2,8, None]

# Number of Samples
max_samples= [0.5,0.75,1.0]

# Bootsrap Samples
bootstrap= [True, False]

# Minimum  number of samples required to split a node
min_samples_split= [2,5]

# Minimum number of samples required at each leaf node
min_samples_leaf= [1,2]

In [124]:
param_grid= {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples,
    'bootstrap':bootstrap,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf


}

In [125]:
rf= RandomForestClassifier()

In [128]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1)

In [129]:
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [132]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_samples': 0.5,
 'max_features': 0.2,
 'max_depth': None,
 'bootstrap': True}

In [134]:
rf_grid.best_score_

0.8348251028806585