In [20]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
import pandas as pd
import numpy as np

In [10]:
# Load the dataset into a DataFrame
data = pd.read_csv('Resources/heart_attack_predictions.csv')
data.head()

Unnamed: 0,Country,Age,Gender,Cholesterol_Level,Blood_Pressure,Smoking_History,Alcohol_Consumption,Physical_Activity,Obesity,Diabetes,...,LDL_Cholesterol,Triglycerides,Heart_Disease_Risk,Medication_Adherence,Urbanization_Level,Air_Pollution_Exposure,Access_To_Healthcare,Education_Level,Income_Level,Heart_Attack_Outcome
0,Germany,39,Male,210.091036,173.30165,Never,0.531933,Active,No,No,...,68.333573,165.060897,Medium,No,Urban,51.988416,Good,Tertiary,Low,Died
1,Egypt,88,Male,163.99873,137.381678,Former,5.626668,Sedentary,Yes,Yes,...,98.194015,452.124651,Medium,No,Suburban,97.739896,Good,,Low,Died
2,Spain,60,Female,263.50259,139.737677,Former,0.928015,Active,No,Yes,...,118.767677,171.43262,Low,No,Urban,60.352793,Poor,Tertiary,Low,Survived
3,Canada,25,Female,292.003927,85.992807,Former,7.374519,Active,No,Yes,...,190.190597,329.553258,Medium,No,Urban,14.397347,Good,Secondary,High,Survived
4,France,54,Female,267.736563,119.882856,Former,4.317845,Sedentary,Yes,No,...,188.209372,449.953074,High,Yes,Rural,69.448684,Good,,Low,Died


In [11]:
data.columns

Index(['Country', 'Age', 'Gender', 'Cholesterol_Level', 'Blood_Pressure',
       'Smoking_History', 'Alcohol_Consumption', 'Physical_Activity',
       'Obesity', 'Diabetes', 'Family_History', 'Stress_Levels',
       'Dietary_Habits', 'Heart_Attack_History', 'Chest_Pain',
       'Exercise_Induced_Angina', 'Resting_ECG', 'Max_Heart_Rate_Achieved',
       'Thalassemia', 'HDL_Cholesterol', 'LDL_Cholesterol', 'Triglycerides',
       'Heart_Disease_Risk', 'Medication_Adherence', 'Urbanization_Level',
       'Air_Pollution_Exposure', 'Access_To_Healthcare', 'Education_Level',
       'Income_Level', 'Heart_Attack_Outcome'],
      dtype='object')

In [12]:
data.dtypes

Country                     object
Age                          int64
Gender                      object
Cholesterol_Level          float64
Blood_Pressure             float64
Smoking_History             object
Alcohol_Consumption        float64
Physical_Activity           object
Obesity                     object
Diabetes                    object
Family_History              object
Stress_Levels              float64
Dietary_Habits              object
Heart_Attack_History        object
Chest_Pain                  object
Exercise_Induced_Angina     object
Resting_ECG                 object
Max_Heart_Rate_Achieved      int64
Thalassemia                 object
HDL_Cholesterol            float64
LDL_Cholesterol            float64
Triglycerides              float64
Heart_Disease_Risk          object
Medication_Adherence        object
Urbanization_Level          object
Air_Pollution_Exposure     float64
Access_To_Healthcare        object
Education_Level             object
Income_Level        

In [13]:
# Identify all object columns
object_columns = data.select_dtypes(include=['object']).columns.tolist()
print(f"Object columns: {object_columns}")

Object columns: ['Country', 'Gender', 'Smoking_History', 'Physical_Activity', 'Obesity', 'Diabetes', 'Family_History', 'Dietary_Habits', 'Heart_Attack_History', 'Chest_Pain', 'Exercise_Induced_Angina', 'Resting_ECG', 'Thalassemia', 'Heart_Disease_Risk', 'Medication_Adherence', 'Urbanization_Level', 'Access_To_Healthcare', 'Education_Level', 'Income_Level', 'Heart_Attack_Outcome']


In [14]:
Nominal_data = ['Country', 'Thalassemia']
Binary_Data = ['Gender', 'Obesity', 'Diabetes', 'Family_History', 'Heart_Attack_History', 'Chest_Pain','Exercise_Induced_Angina', 'Resting_ECG', 'Medication_Adherence']
# Resting_ECG abnormal and normal rather than yes and no
Ordinal_Data = ['Smoking_History', 'Physical_Activity', 'Dietary_Habits', 'Heart_Disease_Risk', 'Urbanization_Level','Access_To_Healthcare', 'Education_Level', 'Income_Level' ]

In [16]:
#Create Ordinal Encoders:
smoking_encoder = OrdinalEncoder(categories=[['Current', 'Former', 'Never']])
activity_encoder = OrdinalEncoder(categories=[['Sedentary', 'Moderate', 'Active']])
dietary_encoder = OrdinalEncoder(categories=[['Unhealthy', 'Moderate', 'Healthy']])
heart_disease_risk_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
urbanization_encoder = OrdinalEncoder(categories=[['Urban', 'Suburban', 'Rural']])
access_to_healthcare_encoder = OrdinalEncoder(categories=[['Poor', 'Average', 'Good']])
education_encoder = OrdinalEncoder(categories=[['NaN', 'Primary', 'Secondary', 'Tertiary']])
income_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])

In [17]:
# Apply Ordinal Encoding to Ordinal Data
data['Smoking_History'] = smoking_encoder.fit_transform(data[['Smoking_History']])
data['Physical_Activity'] = activity_encoder.fit_transform(data[['Physical_Activity']])
data['Dietary_Habits'] = dietary_encoder.fit_transform(data[['Dietary_Habits']])
data['Heart_Disease_Risk'] = heart_disease_risk_encoder.fit_transform(data[['Heart_Disease_Risk']])
data['Urbanization_Level'] = urbanization_encoder.fit_transform(data[['Urbanization_Level']])
data['Access_To_Healthcare'] = access_to_healthcare_encoder.fit_transform(data[['Access_To_Healthcare']])
data['Education_Level'] = education_encoder.fit_transform(data[['Education_Level']])
data['Income_Level'] = income_encoder.fit_transform(data[['Income_Level']])

ValueError: could not convert string to float: 'Current'

In [18]:
#One-hot encode the Nominal_data:
data = pd.get_dummies(data, columns=['Country', 'Thalassemia'], drop_first=True)

In [21]:
# Label encode Binary_Data:
label_encoder = LabelEncoder()
binary_columns = ['Gender', 'Obesity', 'Diabetes', 'Family_History', 'Heart_Attack_History', 'Chest_Pain', 
                  'Exercise_Induced_Angina', 'Resting_ECG', 'Medication_Adherence']

for col in binary_columns:
    data[col] = label_encoder.fit_transform(data[col])

In [22]:
# Scale numerical columns
# List of numerical columns 
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()

# Apply scaling
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [23]:
#Split the data into train & test sets:
y = data['Heart_Attack_Outcome']  

X = data.drop(columns='Heart_Attack_Outcome')

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)