# Heart Attack Prediction in Indonesia

In [15]:
# Import necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from summarytools import dfSummary
from sklearn.impute import KNNImputer, SimpleImputer


# To suppress the warnings
import warnings
warnings.filterwarnings('ignore')

# model
from sklearn.linear_model import LogisticRegression


In [16]:
# Load the raw dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
DATA_PATH = r"D:\Users\Data_set\heart_attack_prediction_indonesia.csv"
df = pd.read_csv(DATA_PATH)


In [17]:
# Quick look at the data: dimensions, the first rows, and summary statistics.
print("Data Shape:", df.shape)
print('\nFirst 5 rows:','\n',df.head())
# `describe` must be *called*; printing the bare attribute only shows the
# bound-method repr (which dumps the frame) rather than the statistics.
print('\nDescriptive Statistics:','\n', df.describe())

Data Shape: (158355, 28)

First 5 rows: 
    age  gender region income_level  hypertension  diabetes  cholesterol_level  \
0   60    Male  Rural       Middle             0         1                211   
1   53  Female  Urban          Low             0         0                208   
2   62  Female  Urban          Low             0         0                231   
3   73    Male  Urban          Low             1         0                202   
4   52    Male  Urban       Middle             1         0                232   

   obesity  waist_circumference  family_history  ... blood_pressure_diastolic  \
0        0                   83               0  ...                       62   
1        0                  106               1  ...                       76   
2        1                  112               1  ...                       74   
3        0                   82               1  ...                       65   
4        0                   89               0  ...              

In [18]:
# Per-column overview rendered by summarytools' dfSummary: dtype, summary
# statistics or category values, frequencies, and missing-value counts.
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,age [int64],Mean (sd) : 54.5 (11.9) min < med < max: 25.0 < 55.0 < 90.0 IQR (CV) : 17.0 (4.6),66 distinct values,,0 (0.0%)
2,gender [object],1. Male 2. Female,"82,243 (51.9%) 76,112 (48.1%)",,0 (0.0%)
3,region [object],1. Urban 2. Rural,"103,038 (65.1%) 55,317 (34.9%)",,0 (0.0%)
4,income_level [object],1. Middle 2. Low 3. High,"71,230 (45.0%) 63,422 (40.1%) 23,703 (15.0%)",,0 (0.0%)
5,hypertension [int64],Mean (sd) : 0.3 (0.5) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 1.0 (0.7),2 distinct values,,0 (0.0%)
6,diabetes [int64],Mean (sd) : 0.2 (0.4) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 0.0 (0.5),2 distinct values,,0 (0.0%)
7,cholesterol_level [int64],Mean (sd) : 199.5 (39.7) min < med < max: 100.0 < 199.0 < 350.0 IQR (CV) : 54.0 (5.0),247 distinct values,,0 (0.0%)
8,obesity [int64],Mean (sd) : 0.2 (0.4) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 0.0 (0.6),2 distinct values,,0 (0.0%)
9,waist_circumference [int64],Mean (sd) : 93.3 (16.4) min < med < max: 20.0 < 93.0 < 173.0 IQR (CV) : 22.0 (5.7),136 distinct values,,0 (0.0%)
10,family_history [int64],Mean (sd) : 0.3 (0.5) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 1.0 (0.7),2 distinct values,,0 (0.0%)


In [19]:
# Count the missing values in every column.
null_counts = df.isna().sum()
print("Missing Value:", '\n', null_counts)

Missing Value: 
 age                                   0
gender                                0
region                                0
income_level                          0
hypertension                          0
diabetes                              0
cholesterol_level                     0
obesity                               0
waist_circumference                   0
family_history                        0
smoking_status                        0
alcohol_consumption               94848
physical_activity                     0
dietary_habits                        0
air_pollution_exposure                0
stress_level                          0
sleep_hours                           0
blood_pressure_systolic               0
blood_pressure_diastolic              0
fasting_blood_sugar                   0
cholesterol_hdl                       0
cholesterol_ldl                       0
triglycerides                         0
EKG_results                           0
previous_heart_disease 

In [20]:
# `alcohol_consumption` has only 63,507 non-missing values out of 158,355 rows.
# Dropping those rows, or filling them with a simple imputation strategy, could
# lead to biased or incorrect predictions, so a more advanced imputation
# technique is tried instead.
# NOTE(review): recent pandas versions parse the literal string "None" as NaN
# by default when reading a CSV. If the raw file uses "None" as a real category
# here, these values are not actually missing. Confirm against the raw file.
df['alcohol_consumption'].notna().sum()

63507

# EDA

In [21]:
# Preview the first five rows with the notebook's rich table display.
df.head(n=5)

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,60,Male,Rural,Middle,0,1,211,0,83,0,...,62,173,48,121,101,Normal,0,0,0,0
1,53,Female,Urban,Low,0,0,208,0,106,1,...,76,70,58,83,138,Normal,1,0,1,0
2,62,Female,Urban,Low,0,0,231,1,112,1,...,74,118,69,130,171,Abnormal,0,1,0,1
3,73,Male,Urban,Low,1,0,202,0,82,1,...,65,98,52,85,146,Normal,0,1,1,0
4,52,Male,Urban,Middle,1,0,232,0,89,0,...,75,104,59,127,139,Normal,1,0,1,1


In [22]:
# Target distribution: absolute class counts, a blank line, then class proportions.
target = df['heart_attack']
print(target.value_counts(), '', target.value_counts(normalize=True), sep='\n')

heart_attack
0    94854
1    63501
Name: count, dtype: int64

heart_attack
0    0.598996
1    0.401004
Name: proportion, dtype: float64


In [26]:
# Separate the predictors from the binary target column.
x = df.drop(columns=['heart_attack'])
y = df['heart_attack']

# Column groups routed to the two preprocessing branches below.
Catagorical_features = x.select_dtypes(include=['object']).columns
Numerical_features = x.select_dtypes(include=['int64','float64']).columns

# Fix the seed so the split (and therefore the reported accuracy) is
# reproducible after a kernel restart, and stratify on the target so the train
# and test sets keep the same class ratio (roughly 60/40 in this data).
RANDOM_STATE = 42
x_train,x_test,y_train,y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Numerical branch: fill gaps from the 3 nearest neighbours, then standardise.
# NOTE(review): this imputer only ever sees the numerical columns. If
# `alcohol_consumption` is object-typed, its missing values are filled by the
# most-frequent imputer in the categorical branch instead. Confirm its dtype.
numerical_transformer = Pipeline(steps=[
    ('KnnImpute', KNNImputer(n_neighbors=3)),
    ('Scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at predict time.
catagorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

# Apply each branch to its own column group.
preprocesser = ColumnTransformer(transformers=[
    ('num',numerical_transformer,Numerical_features ),
    ('cat',catagorical_transformer,Catagorical_features)
])


In [30]:
# Chain the preprocessing and the classifier into one estimator and fit it on
# the training split.
model = Pipeline(
    steps=[
        ('preprocesser', preprocesser),
        ('classifier', LogisticRegression()),
    ]
)
model.fit(x_train, y_train)

# Score the fitted pipeline on the test split.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy score: {accuracy:.2f}')

Accuracy score: 0.73


In [None]:
# Let's check the model on a single sampled patient

In [35]:
# Draw the demo patient from the test split rather than from the full frame.
# A row sampled from `df` is most likely one the model was trained on, which
# makes the check meaningless. The seed keeps the chosen row stable for a given
# split. `df.loc[...]` brings back the full row, including `heart_attack`.
sample_data = df.loc[x_test.sample(1, random_state=42).index]
sample_data
# The `heart_attack` column of this row is the true label to compare against.

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
131926,52,Male,Rural,Middle,0,0,165,1,118,0,...,73,125,53,167,123,Normal,1,1,1,1


In [36]:
# Remove the label column so only the predictor columns are passed to the model.
sample_data_without_target = sample_data.drop('heart_attack', axis=1)
sample_data_without_target

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_systolic,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening
131926,52,Male,Rural,Middle,0,0,165,1,118,0,...,105,73,125,53,167,123,Normal,1,1,1


In [37]:
# Predict the class for the sampled patient. Compare the message with
# `sample_data['heart_attack']` to judge whether the prediction is correct.
prediction = model.predict(sample_data_without_target)

at_risk = prediction[0] == 1
print(
    "Prediction: The person is at risk of a heart attack."
    if at_risk
    else "Prediction: The person is not at risk of a heart attack."
)
    


Prediction: The person is at risk of a heart attack.
