In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
df = pd.read_csv('/content/heart.csv')

In [40]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [41]:
df.shape

(303, 14)

In [42]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [43]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [44]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


In [45]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [46]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(242, 13)
(61, 13)
(242,)
(61,)


In [47]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [48]:
rf.fit(X_train,y_train)
gb.fit(X_train,y_train)
svc.fit(X_train,y_train)
lr.fit(X_train,y_train)

In [49]:
y_pred_rf = rf.predict(X_test)
y_pred_gb = gb.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_lr = lr.predict(X_test)

In [50]:
acc_rf = rf.score(X_test,y_test)
acc_gb = gb.score(X_test,y_test)
acc_svc = svc.score(X_test,y_test)
acc_lr = lr.score(X_test,y_test)

In [51]:
print("rf_acc",acc_rf)
print("gb_acc",acc_gb)
print("svc_acc",acc_svc)
print("lr_acc",acc_lr)

rf_acc 0.8360655737704918
gb_acc 0.7704918032786885
svc_acc 0.7049180327868853
lr_acc 0.8852459016393442


In [52]:
from sklearn.model_selection import cross_val_score

print("rf cross_val_score",np.mean(cross_val_score(RandomForestClassifier(),X,y,cv=5,scoring='accuracy')))
print("gb cross_val_score",np.mean(cross_val_score(GradientBoostingClassifier(),X,y,cv=5,scoring='accuracy')))
print("svc cross_val_score",np.mean(cross_val_score(SVC(),X,y,cv=5,scoring='accuracy')))
print("lr cross_val_score",np.mean(cross_val_score(LogisticRegression(),X,y,cv=5,scoring='accuracy')))



rf cross_val_score 0.8248633879781421
gb cross_val_score 0.8083060109289617
svc cross_val_score 0.6434972677595628
lr cross_val_score 0.8249180327868852


## GridSearch CV

In [53]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# 108 diff random forest train

In [54]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [55]:
rf = RandomForestClassifier()

In [57]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(
    estimator = rf,
    param_grid = param_grid,
    cv = 5,
    verbose=2,
    n_jobs=-1
)

In [58]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [59]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 120}

In [60]:
rf_grid.best_score_

0.8552721088435373

## RandomSearch CV

In [61]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [62]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [64]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(
    estimator = rf,
    param_distributions = param_grid,
    cv = 5,
    verbose=2,
    n_jobs=-1
)

In [65]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [66]:
rf_grid.best_params_

{'n_estimators': 120,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 0.75,
 'max_features': 1.0,
 'max_depth': 2,
 'bootstrap': True}

In [67]:
rf_grid.best_score_

0.7975340136054422

## OOB Score

In [68]:
rf = RandomForestClassifier(oob_score=True)

In [69]:
rf.fit(X_train,y_train)

In [70]:
rf.oob_score_

0.8388429752066116