In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
df.shape

(918, 12)

### Treat outliers:
Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 or Z score < -3. In general, a Z-score of -3.0 to 3.0 suggests that a data point is within three standard deviations of its mean.

In this case, since all numeric values in dataset are positive, we only need to consider Z score > 3 as outliers.

In [6]:
numeric_col = ['RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
class_col = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# remove outliers
df_update = df 
for col in numeric_col:
    df_update = df_update[df_update[col] <= (df_update[col].mean() + 3*df_update[col].std())]
    print('After removing outliers of column {}, df shape is {}'.format(col, df_update.shape))

After removing outliers of column RestingBP, df shape is (911, 12)
After removing outliers of column Cholesterol, df shape is (908, 12)
After removing outliers of column FastingBS, df shape is (908, 12)
After removing outliers of column MaxHR, df shape is (908, 12)
After removing outliers of column Oldpeak, df shape is (902, 12)


In [8]:
df_final = df_update.copy()
df_final = pd.get_dummies(df_final, drop_first=True)
df_final.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [9]:
X = df_final.drop('HeartDisease', axis = 1)
y = df_final.HeartDisease

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 30)

### Train with SVC alone

In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(SVC(), X, y, cv=5)
scores.mean()

0.6895273173726213

### Train with bagging

In [17]:
from sklearn.ensemble import BaggingClassifier
bag_model = BaggingClassifier(base_estimator = SVC(), n_estimators = 100, oob_score = True, max_samples=0.8)
scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

# No significant improvement in model performance

0.6839656230816453

### Train with Decision Tree alone, then with bagging 

In [19]:
from sklearn.tree import DecisionTreeClassifier
scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
scores.mean()

0.732707182320442

In [20]:
bag_dt = BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 100, oob_score = True, max_samples=0.8)
scores = cross_val_score(bag_dt, X, y, cv=5)
scores.mean()

# Improvement was seen with Decision Tree as base model

0.7959177409453652

### Train with Random Forest, which uses bagging itself underneath 

In [21]:
from sklearn.ensemble import RandomForestClassifier
scores = cross_val_score(RandomForestClassifier(), X, y, cv=5)
scores.mean()

# Random forest gave even a better performance with 81.7% as score. 
# Underneath it used bagging where it sampled not only data rows but also the columns (or features)

0.8181276856967464