In [1]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv("heart_attack_prediction_dataset.csv")
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [9]:
df = df.drop(columns = ["Patient ID", "Continent", "Hemisphere"])

In [10]:
df[['systolic', 'diastolic']] = df['Blood Pressure'].str.split('/', expand=True).astype(int)

In [11]:
df = df.drop(columns = ["Blood Pressure"])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Sex                              8763 non-null   object 
 2   Cholesterol                      8763 non-null   int64  
 3   Heart Rate                       8763 non-null   int64  
 4   Diabetes                         8763 non-null   int64  
 5   Family History                   8763 non-null   int64  
 6   Smoking                          8763 non-null   int64  
 7   Obesity                          8763 non-null   int64  
 8   Alcohol Consumption              8763 non-null   int64  
 9   Exercise Hours Per Week          8763 non-null   float64
 10  Diet                             8763 non-null   object 
 11  Previous Heart Problems          8763 non-null   int64  
 12  Medication Use      

In [13]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [14]:
columns_to_encode = ["Sex", "Diet", "Country"]
for col in columns_to_encode:
  df[col] = encoder.fit_transform(df[col])

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Sex                              8763 non-null   int64  
 2   Cholesterol                      8763 non-null   int64  
 3   Heart Rate                       8763 non-null   int64  
 4   Diabetes                         8763 non-null   int64  
 5   Family History                   8763 non-null   int64  
 6   Smoking                          8763 non-null   int64  
 7   Obesity                          8763 non-null   int64  
 8   Alcohol Consumption              8763 non-null   int64  
 9   Exercise Hours Per Week          8763 non-null   float64
 10  Diet                             8763 non-null   int64  
 11  Previous Heart Problems          8763 non-null   int64  
 12  Medication Use      

In [16]:
X = df.drop(columns = ["Heart Attack Risk"])
y = df["Heart Attack Risk"]

**TRAINING AND ADABOOST CLASSIFIER**

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint, uniform

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 43)

In [19]:
dt = DecisionTreeClassifier(random_state=42)
adaboost = AdaBoostClassifier(base_estimator=dt, random_state=43)

param_dist = {
    'base_estimator__max_depth': randint(1, 21),
    'n_estimators': randint(50, 501),
    'learning_rate': uniform(0.01, 1.0)
}

adaboost_rs = RandomizedSearchCV(estimator=adaboost, param_distributions=param_dist, n_iter=50, cv=5, random_state=42, verbose = 2, n_jobs=-1)

adaboost_rs.fit(X_train, y_train)

print("Best Parameters:", adaboost_rs.best_params_)
print("Best Cross-validation Accuracy:", adaboost_rs.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best Parameters: {'base_estimator__max_depth': 1, 'learning_rate': 0.31424224295953773, 'n_estimators': 71}
Best Cross-validation Accuracy: 0.6398761684821266


In [30]:
y_pred = adaboost_rs.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score
print(f"AdaBoost accuracy score {accuracy_score(y_pred, y_test)}")

AdaBoost accuracy score 0.6417569880205363


In [36]:
y_pred_proba = adaboost_rs.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

ROC AUC Score: 0.5647728237791932


**TRAINING A K NEIGHBORS CLASSIFIER**


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [37]:
from sklearn.metrics import roc_auc_score
neighbors = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
best_roc_auc_score = 0
best_n_neighbors = 0
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    knn.fit(X_train, y_train)

    y_pred_knn = knn.predict(X_test)

    current_roc_auc_score = roc_auc_score(y_test, y_pred_knn)

    if current_roc_auc_score > best_roc_auc_score:
        best_roc_auc_score = current_roc_auc_score
        best_n_neighbors = n

print(f"Neighbors: {best_n_neighbors}, roc_auc_score: {best_roc_auc_score:.4f}")

Neighbors: 8, roc_auc_score: 0.5035


In [38]:
print(f"KNN accuracy score {accuracy_score(y_pred_knn, y_test)}")

KNN accuracy score 0.618938961779806


*AdaBoost has a slightly higher accuracy compared to kNN. While AdaBoost performs better in terms of accuracy, both models performed poorly due to class imbalance.***