In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error


In [2]:
df = pd.read_csv('../data/heart_attack_prediction_dataset.csv')
df.head(3)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0


# Preparation (clean, encoding, dummies)

In [3]:
df=df.drop(columns=['Patient ID','Country'])

In [4]:
def clean_my_columns_titles(df):
    df.columns = df.columns.str.replace(" ","_").str.lower().str.strip() 
    return df   

In [5]:
clean_my_columns_titles(df)
df.head(3)

Unnamed: 0,age,sex,cholesterol,blood_pressure,heart_rate,diabetes,family_history,smoking,obesity,alcohol_consumption,...,stress_level,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,continent,hemisphere,heart_attack_risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,31.251233,286,0,6,South America,Southern Hemisphere,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,27.194973,235,1,7,North America,Northern Hemisphere,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,28.176571,587,4,4,Europe,Northern Hemisphere,0


In [6]:
df['systolic_pressure']=df['blood_pressure'].apply(lambda x: int(x.split("/")[0]))
df['diastolic_pressure']=df['blood_pressure'].apply(lambda x: int(x.split("/")[1]))

In [7]:
list_high = [0,120, 130,140,max(df.systolic_pressure)]
classif = [0, 1, 10, 100] # 0 is normal, 1 is elevated, 10 is hyper level 1, 100 is hyper level 2
list_low = [0,80, 90,100,max(df.diastolic_pressure)]

df['systolic_classes'] = pd.cut(df['systolic_pressure'], 4, labels=classif).astype(int)
df['diastolic_classes'] = pd.cut(df['diastolic_pressure'], 4, labels=classif).astype(int)

In [8]:
def check_pressure(x):
    if x >= 100:
        return 'D'
    elif x>= 10:
        return 'C'
    elif x>= 1:
        return 'B'
    elif x == 0:
        return 'A'

In [9]:
df['blood_pressure_class']= df['systolic_classes']+df['diastolic_classes']
df['blood_pressure_class']=df['blood_pressure_class'].apply(check_pressure)
df['blood_pressure_class'].value_counts()

blood_pressure_class
D    3925
C    2607
B    1645
A     586
Name: count, dtype: int64

In [None]:
# df['diet_label'] = df['diet'].replace(
#     {
#         'Healthy':0,
#         'Average':1,
#         'Unhealthy':2,
#         }
#     )

In [None]:
# df['sex'] = df['sex'].replace(
#     {'Female':0,
#     'Male':1}
# )

In [10]:
df = df.drop(columns=['systolic_pressure','diastolic_pressure','systolic_classes','diastolic_classes'])
df.head(3)

Unnamed: 0,age,sex,cholesterol,blood_pressure,heart_rate,diabetes,family_history,smoking,obesity,alcohol_consumption,...,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,continent,hemisphere,heart_attack_risk,blood_pressure_class
0,67,Male,208,158/88,72,0,0,1,0,0,...,6.615001,261404,31.251233,286,0,6,South America,Southern Hemisphere,0,D
1,21,Male,389,165/93,98,1,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,North America,Northern Hemisphere,0,D
2,21,Female,324,174/99,72,1,0,0,0,0,...,9.463426,235282,28.176571,587,4,4,Europe,Northern Hemisphere,0,D


In [11]:
df_enc = df.copy()
df_enc

Unnamed: 0,age,sex,cholesterol,blood_pressure,heart_rate,diabetes,family_history,smoking,obesity,alcohol_consumption,...,sedentary_hours_per_day,income,bmi,triglycerides,physical_activity_days_per_week,sleep_hours_per_day,continent,hemisphere,heart_attack_risk,blood_pressure_class
0,67,Male,208,158/88,72,0,0,1,0,0,...,6.615001,261404,31.251233,286,0,6,South America,Southern Hemisphere,0,D
1,21,Male,389,165/93,98,1,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,North America,Northern Hemisphere,0,D
2,21,Female,324,174/99,72,1,0,0,0,0,...,9.463426,235282,28.176571,587,4,4,Europe,Northern Hemisphere,0,D
3,84,Male,383,163/100,73,1,1,1,0,1,...,7.648981,125640,36.464704,378,3,4,North America,Northern Hemisphere,0,D
4,66,Male,318,91/88,93,1,1,1,1,0,...,1.514821,160555,21.809144,231,1,5,Asia,Northern Hemisphere,0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,10.806373,235420,19.655895,67,7,7,Asia,Northern Hemisphere,0,B
8759,28,Female,120,157/102,73,1,0,0,1,0,...,3.833038,217881,23.993866,617,4,9,North America,Northern Hemisphere,0,D
8760,47,Male,250,161/75,105,0,1,1,1,1,...,2.375214,36998,35.406146,527,4,4,South America,Southern Hemisphere,1,D
8761,36,Male,178,119/67,60,1,0,1,0,0,...,0.029104,209943,27.294020,114,2,8,South America,Southern Hemisphere,0,B


In [12]:
#dummies

df_transformed = pd.merge(left=df_enc,
                            right=pd.get_dummies(df_enc[['sex','continent','hemisphere','diet','blood_pressure_class']],prefix=['sex','continent','hemisphere','diet','blood_pressure_class'],drop_first=True),
                            left_index=True,
                            right_index=True)
df_transformed

Unnamed: 0,age,sex,cholesterol,blood_pressure,heart_rate,diabetes,family_history,smoking,obesity,alcohol_consumption,...,continent_Australia,continent_Europe,continent_North America,continent_South America,hemisphere_Southern Hemisphere,diet_Healthy,diet_Unhealthy,blood_pressure_class_B,blood_pressure_class_C,blood_pressure_class_D
0,67,Male,208,158/88,72,0,0,1,0,0,...,False,False,False,True,True,False,False,False,False,True
1,21,Male,389,165/93,98,1,1,1,1,1,...,False,False,True,False,False,False,True,False,False,True
2,21,Female,324,174/99,72,1,0,0,0,0,...,False,True,False,False,False,True,False,False,False,True
3,84,Male,383,163/100,73,1,1,1,0,1,...,False,False,True,False,False,False,False,False,False,True
4,66,Male,318,91/88,93,1,1,1,1,0,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,60,Male,121,94/76,61,1,1,1,0,1,...,False,False,False,False,False,True,False,True,False,False
8759,28,Female,120,157/102,73,1,0,0,1,0,...,False,False,True,False,False,True,False,False,False,True
8760,47,Male,250,161/75,105,0,1,1,1,1,...,False,False,False,True,True,False,False,False,False,True
8761,36,Male,178,119/67,60,1,0,1,0,0,...,False,False,False,True,True,False,True,True,False,False


In [None]:
df_transformed=df_transformed.drop(columns=['blood_pressure','blood_pressure_class','sex','continent','hemisphere','diet'])
df_transformed.head(3)


In [17]:
# export new clean df with dummies csv
df_transformed.to_csv('df_dummies.csv', index = True)

# Balance dataset (random)

In [None]:
df_risk = df_transformed[df_transformed['heart_attack_risk']==1]
df_no_risk = df_transformed[df_transformed['heart_attack_risk']==0]

# df_fraud = df[df['is_fraudulent'] == 1]
# df_non_fraud = df[df['is_fraudulent'] == 0]

risk_sample = df_risk #.sample(n=3000, random_state=0)

non_risk_sample = df_no_risk.sample(n=3139, random_state=0)

In [None]:
# df_no_risk = df_no_risk[:3139]

In [None]:
df_2=pd.concat([risk_sample,non_risk_sample])

In [None]:
features = df_2.drop(columns=['heart_attack_risk','income'])
target = df_2['heart_attack_risk']

# train_test_split & normalization

In [None]:

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [None]:
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [None]:
X_train_norm = pd.DataFrame(X_train_norm,columns=X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm,columns=X_test.columns)

# Model selection

## Decision Tree

In [None]:
tree = DecisionTreeClassifier(max_depth=10,random_state=10)

In [None]:
tree.fit(X_train_norm, y_train)

In [None]:
pred_tree = tree.predict(X_test_norm)

In [None]:
confusion_matrix(y_test, pred_tree)

In [None]:
print(classification_report(y_test, pred_tree))

In [None]:
sns.heatmap(confusion_matrix(y_test, pred_tree), annot=True)

In [None]:
print("MAE", mean_absolute_error(pred_tree, y_test))
print("RMSE", mean_squared_error(pred_tree, y_test, squared=False))
print("R2 score", tree.score(X_test_norm, y_test))

In [None]:
from sklearn.tree import export_text

tree_viz = export_text(tree, feature_names=list(X_train_norm.columns))
print(tree_viz)

In [None]:
tree_importance = {feature : importance for feature, importance in zip(X_train_norm.columns, tree.feature_importances_)}
tree_importance 

In [None]:
# age,cholesterol,heart_rate,bmi,sedentary_hours_per_day,triglycerides,exercise_hours_per_week

In [None]:
features.columns

- select less features

In [None]:
features2 = features[['age','cholesterol','heart_rate','sedentary_hours_per_day','bmi','triglycerides','exercise_hours_per_week']]

In [None]:
target

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features2, target, test_size=0.20, random_state=0)

In [None]:
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)
X_train_norm = pd.DataFrame(X_train_norm,columns=X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm,columns=X_test.columns)

In [None]:
tree = DecisionTreeClassifier(max_depth=10,random_state=10)

In [None]:
tree.fit(X_train_norm, y_train)

In [None]:
pred_tree = tree.predict(X_test_norm)

In [None]:
confusion_matrix(y_test, pred_tree)

In [None]:
print(classification_report(y_test, pred_tree))

In [None]:
print("MAE", mean_absolute_error(pred_tree, y_test))
print("RMSE", mean_squared_error(pred_tree, y_test, squared=False))
print("R2 score", tree.score(X_test_norm, y_test))

## AdaBoost

## Bagging and Pasting

## Random Forest

## Gradient Boosting

In [None]:
# Rename columns to predicted values - 0 = No diabetes, 1 = Diabetes
cm.rename({0: 'No - Pred', 1: 'Yes - Pred'}, axis=1, inplace=True)
# Rename rows to real values - 0 = No diabetes, 1 = Diabetes
cm.rename({0: 'No - True', 1: 'Yes - True'}, axis=0, inplace=True)
px.imshow(cm, text_auto=True, color_continuous_scale='RdBu', color_continuous_midpoint=0)