In [45]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder

In [46]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [47]:
print(f'With outliers: {df.shape}')

age_outliers = df[(df['Age'] > df['Age'].mean() + 3 * df['Age'].std()) | (df['Age'] < df['Age'].mean() - 3 * df['Age'].std())]
df.drop(age_outliers.index, axis='index', inplace=True)

restingbp_outliers = df[(df['RestingBP'] > df['RestingBP'].mean() + 3 * df['RestingBP'].std()) | (df['RestingBP'] < df['RestingBP'].mean() - 3 * df['RestingBP'].std())]
df.drop(restingbp_outliers.index, axis='index', inplace=True)

cholesterol_outliers = df[(df['Cholesterol'] > df['Cholesterol'].mean() + 3 * df['Cholesterol'].std()) | (df['Cholesterol'] < df['Cholesterol'].mean() - 3 * df['Cholesterol'].std())]
df.drop(cholesterol_outliers.index, axis='index', inplace=True)

maxhr_outliers = df[(df['MaxHR'] > df['MaxHR'].mean() + 3 * df['MaxHR'].std()) | (df['MaxHR'] < df['MaxHR'].mean() - 3 * df['MaxHR'].std())]
df.drop(maxhr_outliers.index, axis='index', inplace=True)

oldpeak_outliers = df[(df['Oldpeak'] > df['Oldpeak'].mean() + 3 * df['Oldpeak'].std()) | (df['Oldpeak'] < df['Oldpeak'].mean() - 3 * df['Oldpeak'].std())]
df.drop(oldpeak_outliers.index, axis='index', inplace=True)

print(f'Without outliers: {df.shape}')

With outliers: (918, 12)
Without outliers: (899, 12)


In [48]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ChestPainType'] = le.fit_transform(df['ChestPainType'])
df['RestingECG'] = le.fit_transform(df['RestingECG'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [49]:
X = df.drop('HeartDisease', axis='columns')
y = df['HeartDisease']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [51]:
svm_params = {
    'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

gscv = GridSearchCV(SVC(), svm_params, cv=10, return_train_score=False)
gscv.fit(X_scaled, y)

print(f'Best score: {gscv.best_score_}\nBest params: {gscv.best_params_}')

Best score: 0.859812734082397
Best params: {'C': 2, 'gamma': 'scale', 'kernel': 'rbf'}


In [52]:
svm = SVC(C=2, gamma='scale', kernel='rbf')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.8777777777777778

In [53]:
bag_model = BaggingClassifier(
    estimator=SVC(C=2, gamma='scale', kernel='rbf'),
    n_estimators=100,
    oob_score=True
)

bag_model.fit(X_train, y_train)
bag_model.score(X_test, y_test)

0.8833333333333333

In [54]:
dt_params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}

gscv = GridSearchCV(DecisionTreeClassifier(), dt_params, cv=10, return_train_score=False)
gscv.fit(X_scaled, y)

print(f'Best score: {gscv.best_score_}\nBest params: {gscv.best_params_}')

Best score: 0.79083645443196
Best params: {'criterion': 'log_loss', 'splitter': 'random'}


In [70]:
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(criterion='log_loss', splitter='random'),
    n_estimators=100,
    oob_score=True
)

bag_model.fit(X_train, y_train)
bag_model.score(X_test, y_test)

0.8944444444444445