In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("heart.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [6]:
df.shape

(303, 14)

In [7]:
df.ndim

2

In [8]:
df.isnull().sum().sort_values(ascending=False)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [9]:
df.duplicated().sum()

1

In [10]:
df.nunique().sort_values(ascending=False)

chol        152
thalach      91
trestbps     49
age          41
oldpeak      40
ca            5
cp            4
thal          4
restecg       3
slope         3
sex           2
fbs           2
exang         2
target        2
dtype: int64

In [11]:
X = df.iloc[:, 0:-1]
y = df.iloc[:,-1]

In [12]:
print(X.shape)
print(y.shape)

(303, 13)
(303,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(242, 13)
(242,)
(61, 13)
(61,)


In [14]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy for Random Forest: ", accuracy_score(y_test, y_pred)*100)

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print("Accuracy for Gradient Boosting: ", accuracy_score(y_test, y_pred)*100)

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("Accuracy for Support Vector Machine: ", accuracy_score(y_test, y_pred)*100)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Accuracy for Logistic Regression: ", accuracy_score(y_test, y_pred)*100)



Accuracy for Random Forest:  86.88524590163934
Accuracy for Gradient Boosting:  77.04918032786885
Accuracy for Support Vector Machine:  70.49180327868852
Accuracy for Logistic Regression:  88.52459016393442


In [15]:
from sklearn.model_selection import cross_val_score
cv_rf = np.mean(cross_val_score(RandomForestClassifier(),X,y,cv=10,scoring="accuracy"))
print("The Cross val score of Random Forest is ", cv_rf*100)

The Cross val score of Random Forest is  82.15053763440861


In [16]:
cv_lr = np.mean(cross_val_score(LogisticRegression(),X,y,cv=10,scoring="accuracy"))
print("The Cross val score of Random Forest is ", cv_lr*100)

The Cross val score of Random Forest is  83.16129032258065


#### After cross val score the accuracy of the two algos are nearly same

## Hyper parameter tuning of Random Forest

In [17]:
rf = RandomForestClassifier(max_samples=0.75, random_state=42) # max_sample is the no of rows for each base model
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy for Random Forest after tuning: ", accuracy_score(y_test, y_pred)*100)

Accuracy for Random Forest after tuning:  90.1639344262295


In [18]:
rf = RandomForestClassifier(max_samples=0.75, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9016393442622951

## There are 25 parameters for random forest, now how would i know what will be the optimum value for each hyperparamer
## For that Grid SearchCV is USed

In [19]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# NOw 108 diff combination of forest will be trained as per the combination given above
# Since there are only 4 paramaters given now it is a 4-dimension gird

In [20]:
param_grid = {"n_estimators": n_estimators,
              "max_features": max_features,
              "max_depth": max_depth,
              "max_samples":max_samples
             }

In [21]:
param_grid

{'n_estimators': [20, 60, 100, 120],
 'max_features': [0.2, 0.6, 1.0],
 'max_depth': [2, 8, None],
 'max_samples': [0.5, 0.75, 1.0]}

In [22]:
rg = RandomForestClassifier()
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator=rf, # algo name
                      param_grid = param_grid, # grid of hyperparam
                      cv=5, # Cross val score
                      verbose=2, # ouput will be showed
                      n_jobs = -1) # Using all cores of my laptop
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [23]:
rf_grid.best_params_ # THis will show which params to keep. this is the best score

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 20}

In [24]:
rf = RandomForestClassifier(max_depth=8, max_features=0.2, max_samples=0.75, n_estimators=20)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy for Random Forest: ", accuracy_score(y_test, y_pred)*100)

Accuracy for Random Forest:  85.24590163934425


# Random Search CV
If the datasets is big and if more hyperparameters are to be tuned then in that case gridsearccv might be slower which is why randomized search is used.
Here randlomly only 10 combination is used which is why this is faster but the result might not be the best

In [25]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [26]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [27]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator = rf, 
                       param_distributions = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)

In [28]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [29]:
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 0.5,
 'max_features': 0.6,
 'max_depth': 8,
 'bootstrap': True}

In [30]:
rf_grid.best_score_

0.8014455782312926