In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Convert strings to numbers

In [3]:
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [4]:
df = pd.concat([df, pd.get_dummies(df['ChestPainType'], prefix='ChestPainType').astype(int)], axis=1)
df.drop('ChestPainType', axis=1, inplace=True)

In [5]:
df['ExerciseAngina'] = df['ExerciseAngina'].map({'N': 0, 'Y': 1}) 

In [6]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [7]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])
df['RestingECG'] = le.fit_transform(df['RestingECG'])

In [9]:
df['Sex'] = (df['Sex']).map({'M': 0, 'F': 1})
df['Sex'].unique()

array([0, 1])

In [10]:
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,0,140,289,0,1,172,0,0.0,2,0,0,1,0,0
1,49,1,160,180,0,1,156,0,1.0,1,1,0,0,1,0
2,37,0,130,283,0,2,98,0,0.0,2,0,0,1,0,0
3,48,1,138,214,0,1,108,1,1.5,1,1,1,0,0,0
4,54,0,150,195,0,1,122,0,0.0,2,0,0,0,1,0


## Remove outliers

In [11]:
import numpy as np
from scipy.stats import zscore

In [12]:
scores = np.abs(df.apply(zscore))
df = df[(scores < 3).all(axis=1)]
df.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,0,140,289,0,1,172,0,0.0,2,0,0,1,0,0
1,49,1,160,180,0,1,156,0,1.0,1,1,0,0,1,0
2,37,0,130,283,0,2,98,0,0.0,2,0,0,1,0,0
3,48,1,138,214,0,1,108,1,1.5,1,1,1,0,0,0
4,54,0,150,195,0,1,122,0,0.0,2,0,0,0,1,0


## Split data

### features vs target

In [13]:
features = df.drop('HeartDisease', axis=1)
target = df['HeartDisease']

In [14]:
features.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,0,140,289,0,1,172,0,0.0,2,0,1,0,0
1,49,1,160,180,0,1,156,0,1.0,1,0,0,1,0
2,37,0,130,283,0,2,98,0,0.0,2,0,1,0,0
3,48,1,138,214,0,1,108,1,1.5,1,1,0,0,0
4,54,0,150,195,0,1,122,0,0.0,2,0,0,1,0


#### Normalize features

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features = scaler.fit_transform(features)

features

array([[-1.43962926, -0.51496134,  0.47844751, ...,  1.99853748,
        -0.55300126,  0.        ],
       [-0.47502369,  1.94189333,  1.65189432, ..., -0.5003659 ,
         1.80831413,  0.        ],
       [-1.76116445, -0.51496134, -0.10827589, ...,  1.99853748,
        -0.55300126,  0.        ],
       ...,
       [ 0.38240348, -0.51496134, -0.10827589, ..., -0.5003659 ,
        -0.55300126,  0.        ],
       [ 0.38240348,  1.94189333, -0.10827589, ...,  1.99853748,
        -0.55300126,  0.        ],
       [-1.65398605, -0.51496134,  0.36110283, ..., -0.5003659 ,
         1.80831413,  0.        ]])

### Training vs testing

In [16]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3)
len(X_train), len(X_test)

(597, 257)

## Train using Support Vector Machines

In [30]:
from sklearn.svm import SVC

In [31]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [32]:
svm_model.score(X_test, y_test)

0.8754863813229572

## Train using Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [35]:
rf_model.score(X_test, y_test)

0.8521400778210116

## Train using Bagging model with SVM estimator

In [36]:
from sklearn.ensemble import BaggingClassifier

In [37]:
bc_model = BaggingClassifier(
    estimator=SVC(),
    n_estimators=50,
    max_samples=0.8,
    oob_score=True,
    random_state=0
)
bc_model.fit(X_train, y_train)

In [38]:
bc_model.score(X_test, y_test)

0.8793774319066148

## Results Interpretation

Each model gave accuracy score as follows:

- Support Vector Classifier => 87.55%
- Random Forest Classifier => 85.21%
- Bagged Support Vector Classifier => 87.94%