In [35]:
import numpy as np

### PREPROCESS ###
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
### PREPROCESS ###

### MODELING ###
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb
from sklearn.base import BaseEstimator, ClassifierMixin
### MODELING ###

# EXTRAS #
from sklearn.metrics import accuracy_score, classification_report


In [23]:
# Load the heart-disease dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/heart.csv')

In [24]:
# Preview the first 10 rows of the raw frame
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [36]:
# Preprocess function
def preprocess(data, categorical_columns=None, target='HeartDisease', test_size=0.2, random_state=42):
    """One-hot encode the categorical columns, split into train/test, and standardize.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding the categorical columns and the target column.
        The caller's frame is not modified; a copy is processed.
    categorical_columns : list of str, optional
        Columns to one-hot encode. Defaults to
        ``['Sex', 'ExerciseAngina', 'ChestPainType', 'RestingECG', 'ST_Slope']``.
    target : str, default 'HeartDisease'
        Name of the label column.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int, default 42
        Seed for the shuffled train/test split.

    Returns
    -------
    x_train, x_test
        Feature matrices as returned by ``StandardScaler`` (the scaler is
        fitted on the training split only and then applied to the test split).
    y_train, y_test
        Label columns of the corresponding splits.
    """
    if categorical_columns is None:
        categorical_columns = ['Sex', 'ExerciseAngina', 'ChestPainType', 'RestingECG', 'ST_Slope']
    else:
        categorical_columns = list(categorical_columns)

    data = data.copy()

    # one-hot encoding the categorical columns
    # drop='first' removes one level per feature, which avoids multicollinearity
    # NOTE(review): the encoder is fitted on the full frame before the split. It only
    # learns the category sets, but fit it on the training split instead if strict
    # train/test isolation is required.
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_columns = encoder.fit_transform(data[categorical_columns])

    # creating a new dataframe with the encoded columns
    # The encoded frame must reuse the input's index: pd.concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and introduce NaNs)
    # whenever `data` was filtered or sampled before being passed in.
    encoded_df = pd.DataFrame(
        encoded_columns,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=data.index,
    )

    # dropping the original columns
    data = data.drop(categorical_columns, axis=1)
    # concatenating the original and encoded dataframes
    data = pd.concat([data, encoded_df], axis=1)

    # splitting the data to train and test
    x = data.drop(target, axis=1)
    y = data[target]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    # scaling the data (statistics come from the training split only)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test


In [37]:
# Run the preprocessing once to obtain the train/test features and their labels
x_train, x_test, y_train, y_test = preprocess(df)

In [38]:
# Sanity-check the shapes of the four splits
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((734, 15), (184, 15), (734,), (184,))

In [28]:
# Show the first 5 rows of the training features (already standardized by preprocess)
x_train[:5]

array([[-1.24506731, -0.70898547,  0.372803  ,  1.84260945,  2.28435288,
        -0.09706109,  0.54060477, -0.83846064, -0.48333575,  1.87905939,
        -0.21350421,  0.80817891, -0.48762079, -1.03325003, -0.84792072],
       [-1.8862362 , -0.16628515,  0.08614581, -0.5427086 ,  1.65224147,
        -0.83628643,  0.54060477, -0.83846064, -0.48333575,  1.87905939,
        -0.21350421,  0.80817891, -0.48762079, -1.03325003,  1.17935554],
       [ 0.25099346,  0.91911549,  0.12313384,  1.84260945, -0.44162756,
         0.08774524,  0.54060477,  1.19266183, -0.48333575, -0.53218116,
        -0.21350421,  0.80817891, -0.48762079,  0.96781995, -0.84792072],
       [-1.77937472, -0.16628515,  0.10463982, -0.5427086 ,  0.22999081,
        -0.83628643, -1.84978019, -0.83846064, -0.48333575,  1.87905939,
        -0.21350421,  0.80817891, -0.48762079, -1.03325003,  1.17935554],
       [-0.28331396, -0.70898547, -1.84647842,  1.84260945, -1.27127378,
        -0.83628643,  0.54060477, -0.83846064, 

In [29]:
# Show the first 5 rows of the test features (scaled with the training-split statistics)
x_test[:5]

array([[ 0.99902384,  0.37641517, -0.04331227, -0.5427086 ,  1.69174843,
        -0.83628643, -1.84978019, -0.83846064,  2.06895518, -0.53218116,
        -0.21350421,  0.80817891, -0.48762079, -1.03325003,  1.17935554],
       [-0.06959099,  0.64776533,  2.94347064, -0.5427086 , -0.24409275,
        -0.83628643,  0.54060477, -0.83846064, -0.48333575,  1.87905939,
        -0.21350421,  0.80817891, -0.48762079,  0.96781995, -0.84792072],
       [ 1.2127468 ,  1.46181581, -1.84647842,  1.84260945, -0.56014845,
         0.27255158,  0.54060477, -0.83846064, -0.48333575, -0.53218116,
        -0.21350421, -1.23734978,  2.05077394,  0.96781995, -0.84792072],
       [ 0.25099346, -0.16628515, -1.84647842, -0.5427086 , -0.56014845,
         0.08774524,  0.54060477,  1.19266183, -0.48333575, -0.53218116,
        -0.21350421, -1.23734978, -0.48762079,  0.96781995, -0.84792072],
       [ 0.03727049, -1.36022586,  1.0108464 , -0.5427086 ,  0.78308829,
        -0.83628643,  0.54060477, -0.83846064, 

In [30]:
# Show the first 5 training labels (the index shows the original row positions after shuffling)
y_train[:5]

795    0
25     0
84     1
10     0
344    1
Name: HeartDisease, dtype: int64

In [31]:
# Show the first 5 test labels
y_test[:5]

668    0
30     1
377    1
535    1
807    0
Name: HeartDisease, dtype: int64

In [39]:
# Creating custom XGBoost model

class CustomXGBClassifier(BaseEstimator, ClassifierMixin):
    """Thin scikit-learn wrapper that delegates to an ``xgboost.XGBClassifier``.

    Any keyword arguments are forwarded unchanged to ``xgb.XGBClassifier``.

    Because ``__init__`` takes ``**kwargs``, the default ``BaseEstimator.get_params``
    cannot discover the hyperparameters from the signature. ``get_params`` and
    ``set_params`` are therefore implemented explicitly so that ``sklearn.base.clone``
    (used by cross-validation and grid search) rebuilds the wrapper with the same
    hyperparameters instead of silently falling back to xgboost defaults.
    """

    def __init__(self, **kwargs):
        # Keep the raw keyword arguments so they can be reported by get_params().
        self.kwargs = kwargs
        self.model = xgb.XGBClassifier(**kwargs)

    def get_params(self, deep=True):
        """Return the keyword arguments this wrapper was constructed with.

        ``deep`` is accepted for API compatibility; the returned dict is a
        shallow copy, so the values are the same objects that were passed in.
        """
        return dict(self.kwargs)

    def set_params(self, **params):
        """Update hyperparameters on both the wrapper and the wrapped model."""
        self.kwargs.update(params)
        self.model.set_params(**params)
        return self

    def fit(self, X, y):
        """Fit the wrapped model on ``X``/``y`` and return ``self``."""
        self.model.fit(X, y)
        # A trailing-underscore attribute marks the estimator as fitted for
        # scikit-learn's check_is_fitted (which Pipeline also relies on).
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Return the wrapped model's class predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def __sklearn_tags__(self):
        # Report the wrapped model's estimator tags as this wrapper's own.
        return self.model.__sklearn_tags__()

In [40]:
# all the models I will use
# random_state is pinned on the randomized tree-based estimators so that a
# Restart & Run All reproduces the same scores.
# NOTE: the 'Gaussion Naive Bayes' key spelling is kept as-is because other cells
# may look models up by these exact labels.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gaussion Naive Bayes': GaussianNB(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'XGBoost': CustomXGBClassifier(),
}

In [41]:
# Fit every candidate model and report its performance on the held-out split
for name, model in models.items():
    # NOTE(review): x_train/x_test were already standardized inside preprocess();
    # the StandardScaler step here standardizes them a second time.
    pipeline = make_pipeline(StandardScaler(), model)

    # train on the training split, then predict the test split in one chain
    y_pred = pipeline.fit(x_train, y_train).predict(x_test)

    # headline accuracy, then the per-class precision/recall/F1 breakdown
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name}: {accuracy}')
    print(classification_report(y_test, y_pred))
    print("-" * 50)

Logistic Regression: 0.8532608695652174
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184

--------------------------------------------------
Decision Tree: 0.8369565217391305
              precision    recall  f1-score   support

           0       0.80      0.82      0.81        77
           1       0.87      0.85      0.86       107

    accuracy                           0.84       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184

--------------------------------------------------
Random Forest: 0.8695652173913043
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        77
           1       0.89    

