In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.utils import parallel_backend
import optuna
from imblearn.under_sampling import ClusterCentroids
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
ad = pd.read_csv("heart_attack_russia_youth_vs_adult.csv")
df = ad.copy()
df.head(5)

Unnamed: 0,ID,Age,Gender,Region,Blood_Pressure,Cholesterol,BMI,Heart_Rate,Exercise_Level,Smoking,...,Income_Level,Physical_Activity,Education_Level,Marital_Status,Urban_Rural,Medication,Health_Awareness,Daily_Water_Intake,Mental_Health,Obesity
0,1,50,Male,Rural,110.0,196.5,15.9,76,High,False,...,Low,Low,Primary,Married,Rural,False,5,2.3,5,False
1,2,40,Female,Urban,138.8,157.5,27.1,82,Moderate,False,...,Low,Moderate,Higher,Married,Urban,False,1,5.0,4,False
2,3,26,Male,Rural,116.0,210.1,27.2,71,Moderate,False,...,Middle,High,Primary,Married,Urban,False,4,2.4,8,False
3,4,54,Female,Rural,133.5,170.5,26.0,74,Moderate,True,...,Middle,Moderate,Higher,Married,Urban,False,2,2.7,6,True
4,5,19,Female,Urban,108.0,224.5,27.5,67,Low,False,...,Middle,Low,Higher,Widowed,Urban,False,4,3.5,4,True


In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,50000.0,25000.5,14433.901067,1.0,12500.75,25000.5,37500.25,50000.0
Age,50000.0,35.99182,14.110139,12.0,24.0,36.0,48.0,60.0
Blood_Pressure,50000.0,120.058636,14.975835,60.0,109.9,120.05,130.2,188.4
Cholesterol,50000.0,199.852762,49.998331,-18.7,166.1,199.9,233.5,398.8
BMI,50000.0,24.983912,5.003784,2.9,21.6,25.0,28.4,46.1
Heart_Rate,50000.0,79.98898,11.804567,60.0,70.0,80.0,90.0,100.0
Stress_Level,50000.0,5.5037,2.870741,1.0,3.0,6.0,8.0,10.0
Sleep_Hours,50000.0,7.011464,1.740651,4.0,5.5,7.0,8.5,10.0
Health_Awareness,50000.0,3.0071,1.410351,1.0,2.0,3.0,4.0,5.0
Daily_Water_Intake,50000.0,2.997782,1.149608,1.0,2.0,3.0,4.0,5.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     50000 non-null  int64  
 1   Age                    50000 non-null  int64  
 2   Gender                 50000 non-null  object 
 3   Region                 50000 non-null  object 
 4   Blood_Pressure         50000 non-null  float64
 5   Cholesterol            50000 non-null  float64
 6   BMI                    50000 non-null  float64
 7   Heart_Rate             50000 non-null  int64  
 8   Exercise_Level         50000 non-null  object 
 9   Smoking                50000 non-null  bool   
 10  Alcohol_Consumption    24976 non-null  object 
 11  Diabetes               50000 non-null  bool   
 12  Family_History         50000 non-null  bool   
 13  Stress_Level           50000 non-null  int64  
 14  Heart_Attack           50000 non-null  bool   
 15  An

In [5]:
for i in df.select_dtypes(include='object').columns:
    print(df[i].value_counts())
    print('****'*20)

Gender
Female    24155
Male      23944
Other      1901
Name: count, dtype: int64
********************************************************************************
Region
Urban       25034
Rural       14946
Suburban    10020
Name: count, dtype: int64
********************************************************************************
Exercise_Level
Moderate    24946
Low         15100
High         9954
Name: count, dtype: int64
********************************************************************************
Alcohol_Consumption
Moderate    20022
Heavy        4954
Name: count, dtype: int64
********************************************************************************
Diet
Healthy      19789
Mixed        15185
Unhealthy    15026
Name: count, dtype: int64
********************************************************************************
Occupation
Employed      25166
Student       10068
Unemployed     9843
Retired        4923
Name: count, dtype: int64
*********************************************

In [6]:
df = pd.get_dummies(df, columns=['Gender'])

In [7]:
df=df.drop(["Gender_Other"],axis=1)

In [8]:
df = pd.get_dummies(df, columns=['Region'])

In [9]:
df = pd.get_dummies(df, columns=['Exercise_Level'])

In [10]:
df=df.drop(["Alcohol_Consumption"],axis=1)

In [11]:
df = pd.get_dummies(df, columns=['Diet'])

In [12]:
df = pd.get_dummies(df, columns=['Occupation'])

In [13]:
df = pd.get_dummies(df, columns=['Income_Level'])

In [14]:
df = pd.get_dummies(df, columns=['Physical_Activity'])

In [15]:
df = pd.get_dummies(df, columns=['Education_Level'])

In [16]:
df = pd.get_dummies(df, columns=['Marital_Status'])

In [17]:
df = pd.get_dummies(df, columns=['Urban_Rural'])

In [18]:
df=df.drop(["Urban_Rural_Urban"],axis=1)

In [19]:
df=df.drop(["ID"],axis=1)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         50000 non-null  int64  
 1   Blood_Pressure              50000 non-null  float64
 2   Cholesterol                 50000 non-null  float64
 3   BMI                         50000 non-null  float64
 4   Heart_Rate                  50000 non-null  int64  
 5   Smoking                     50000 non-null  bool   
 6   Diabetes                    50000 non-null  bool   
 7   Family_History              50000 non-null  bool   
 8   Stress_Level                50000 non-null  int64  
 9   Heart_Attack                50000 non-null  bool   
 10  Angina                      50000 non-null  bool   
 11  Heart_Disease_History       50000 non-null  bool   
 12  Sleep_Hours                 50000 non-null  float64
 13  Medication                  500

In [21]:
corr_matrix=df.corr()
corr_matrix["Heart_Attack"].sort_values()

Blood_Pressure               -0.010560
Urban_Rural_Rural            -0.008209
Physical_Activity_Low        -0.006333
Medication                   -0.006285
Marital_Status_Married       -0.005854
Stress_Level                 -0.005845
Region_Urban                 -0.005028
Education_Level_Higher       -0.004850
Diabetes                     -0.004789
Income_Level_Low             -0.004730
Heart_Disease_History        -0.004472
Diet_Unhealthy               -0.004111
Health_Awareness             -0.003775
Occupation_Unemployed        -0.003706
Cholesterol                  -0.003397
Occupation_Retired           -0.002509
Heart_Rate                   -0.002241
Obesity                      -0.000750
Exercise_Level_Low           -0.000684
Mental_Health                -0.000619
BMI                          -0.000526
Education_Level_Secondary    -0.000497
Exercise_Level_Moderate       0.000230
Gender_Female                 0.000235
Family_History                0.000322
Diet_Healthy             

In [22]:
x = df.drop("Heart_Attack", axis=1)
y = df["Heart_Attack"]

In [23]:
cc = ClusterCentroids(random_state=42)
with parallel_backend('threading', n_jobs=-1):
    x_resampled, y_resampled = cc.fit_resample(x, y)

In [24]:
x_resampled.shape, y_resampled.shape

((11762, 46), (11762,))

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, random_state=42, test_size=0.2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((9409, 46), (2353, 46), (9409,), (2353,))

In [26]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [27]:
ann = tf.keras.models.Sequential()

In [28]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [29]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [30]:
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [31]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [32]:
ann.fit(x_train, y_train, batch_size = 32, epochs = 200)

Epoch 1/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8382 - loss: 0.4515
Epoch 2/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9692 - loss: 0.1074
Epoch 3/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9736 - loss: 0.0933
Epoch 4/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9788 - loss: 0.0822
Epoch 5/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9793 - loss: 0.0806
Epoch 6/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9799 - loss: 0.0797
Epoch 7/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9797 - loss: 0.0822
Epoch 8/200
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9824 - loss: 0.0709
Epoch 9/200
[1m295/295[0m [32

<keras.src.callbacks.history.History at 0x1e80c19d690>