In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('new_heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# 0 - Typical Angina
# 1 -  Atypical Angina
# 2 - Non-Anginal Pain
# 3 - Asymptomatic

df['cp'].value_counts()

0    143
2     87
1     50
3     23
Name: cp, dtype: int64

In [4]:
# 1: if FastingBS > 120 mg/dl, 0: otherwise

df['fbs'].value_counts()

0    258
1     45
Name: fbs, dtype: int64

In [5]:
# 0 - Normal
# 1 - ST
# 2 - LVH

df['restecg'].value_counts()

1    152
0    147
2      4
Name: restecg, dtype: int64

In [6]:
# Up: upsloping, Flat: flat, Down: downsloping

df['slope'].value_counts()

2    142
1    140
0     21
Name: slope, dtype: int64

In [7]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [8]:
df.shape

(303, 14)

In [9]:
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [12]:
X = df.drop(['target'], axis=1)
y = df['target']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [14]:
rf = RandomForestClassifier()

rf.fit(X_train,y_train)

In [15]:
train_predictions = rf.predict(X_train)
test_predictioons = rf.predict(X_test)

In [16]:
training_accuracy = accuracy_score(y_train, train_predictions)
testing_accuracy = accuracy_score(y_test, test_predictioons)

In [17]:
print(training_accuracy)
print(testing_accuracy)

1.0
0.8032786885245902


# Using GridSearchCV

In [18]:
n_estimators = [20,60,100,120]
max_features = [0.2,0.6,1.0]
max_depth = [2,4,8,10]
max_samples = [0.5,0.75,1.0]

#There are a total of 144 combinations. Hence 144 random forests will be trained.

In [19]:
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth,
    'max_samples' : max_samples
}

In [20]:
rf_grid = GridSearchCV(estimator=rf,
                      param_grid=param_grid,
                      cv = 6,
                      verbose=2,
                      n_jobs=-1)

In [21]:
rf_grid.fit(X_train,y_train)

Fitting 6 folds for each of 144 candidates, totalling 864 fits


In [22]:
rf_grid.best_score_

0.8470528455284553

In [23]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 100}

In [29]:
new_rf = RandomForestClassifier(n_estimators=100,max_depth=2,max_features=0.2,max_samples=0.75)

In [30]:
new_rf.fit(X_train,y_train)

In [31]:
train_pred = new_rf.predict(X_train)
test_pred = new_rf.predict(X_test)

In [32]:
new_train_acc = accuracy_score(y_train, train_pred)
new_test_acc = accuracy_score(y_test, test_pred)

In [33]:
print("Training Score: ",new_train_acc)
print("Test Score: ", new_test_acc)

Training Score:  0.859504132231405
Test Score:  0.8360655737704918


# Using GridSearchCV, we have got the best hyperparameters for our model. The model is not overfitting like it did in the first case and is generalizing well. Hence using GridSearchCV, we can tune our model and can make it perform better. 