In [15]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
import pandas as pd
import numpy as np

In [2]:
# Read the heart-attack predictions CSV into a DataFrame and preview the first 20 rows.
dataset_path = 'Resources/heart_attack_predictions.csv'
data = pd.read_csv(dataset_path)
data.head(20)

Unnamed: 0,Country,Age,Gender,Cholesterol_Level,Blood_Pressure,Smoking_History,Alcohol_Consumption,Physical_Activity,Obesity,Diabetes,...,LDL_Cholesterol,Triglycerides,Heart_Disease_Risk,Medication_Adherence,Urbanization_Level,Air_Pollution_Exposure,Access_To_Healthcare,Education_Level,Income_Level,Heart_Attack_Outcome
0,Germany,39,Male,210.091036,173.30165,Never,0.531933,Active,No,No,...,68.333573,165.060897,Medium,No,Urban,51.988416,Good,Tertiary,Low,Died
1,Egypt,88,Male,163.99873,137.381678,Former,5.626668,Sedentary,Yes,Yes,...,98.194015,452.124651,Medium,No,Suburban,97.739896,Good,,Low,Died
2,Spain,60,Female,263.50259,139.737677,Former,0.928015,Active,No,Yes,...,118.767677,171.43262,Low,No,Urban,60.352793,Poor,Tertiary,Low,Survived
3,Canada,25,Female,292.003927,85.992807,Former,7.374519,Active,No,Yes,...,190.190597,329.553258,Medium,No,Urban,14.397347,Good,Secondary,High,Survived
4,France,54,Female,267.736563,119.882856,Former,4.317845,Sedentary,Yes,No,...,188.209372,449.953074,High,Yes,Rural,69.448684,Good,,Low,Died
5,Pakistan,89,Male,192.824536,84.614988,Current,4.275495,Active,Yes,Yes,...,91.718371,476.605123,High,Yes,Suburban,23.806738,Good,Primary,Medium,Died
6,Germany,37,Female,248.19281,119.187008,Current,9.786531,Moderate,No,No,...,111.451727,471.97238,High,No,Urban,81.992726,Poor,Secondary,Medium,Died
7,Nigeria,67,Female,190.648952,88.151404,Current,0.282693,Active,Yes,Yes,...,134.535071,438.827926,Low,No,Urban,57.804529,Good,Primary,Medium,Died
8,Vietnam,36,Female,167.748324,174.399328,Never,3.382476,Active,Yes,No,...,136.219095,126.158745,Low,Yes,Suburban,20.029875,Poor,Secondary,Medium,Died
9,Canada,67,Female,170.507186,164.832277,Former,6.146588,Moderate,No,Yes,...,129.812473,376.639056,Low,Yes,Urban,95.596014,Poor,Tertiary,High,Survived


In [3]:
# Check for missing values: info() lists each column's non-null count and dtype,
# so any column whose count is below the total number of rows has gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623027 entries, 0 to 623026
Data columns (total 30 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Country                  623027 non-null  object 
 1   Age                      623027 non-null  int64  
 2   Gender                   623027 non-null  object 
 3   Cholesterol_Level        623027 non-null  float64
 4   Blood_Pressure           623027 non-null  float64
 5   Smoking_History          623027 non-null  object 
 6   Alcohol_Consumption      623027 non-null  float64
 7   Physical_Activity        623027 non-null  object 
 8   Obesity                  623027 non-null  object 
 9   Diabetes                 623027 non-null  object 
 10  Family_History           623027 non-null  object 
 11  Stress_Levels            623027 non-null  float64
 12  Dietary_Habits           623027 non-null  object 
 13  Heart_Attack_History     623027 non-null  object 
 14  Ches

In [4]:
# Show every column label of the loaded frame.
data.columns

Index(['Country', 'Age', 'Gender', 'Cholesterol_Level', 'Blood_Pressure',
       'Smoking_History', 'Alcohol_Consumption', 'Physical_Activity',
       'Obesity', 'Diabetes', 'Family_History', 'Stress_Levels',
       'Dietary_Habits', 'Heart_Attack_History', 'Chest_Pain',
       'Exercise_Induced_Angina', 'Resting_ECG', 'Max_Heart_Rate_Achieved',
       'Thalassemia', 'HDL_Cholesterol', 'LDL_Cholesterol', 'Triglycerides',
       'Heart_Disease_Risk', 'Medication_Adherence', 'Urbanization_Level',
       'Air_Pollution_Exposure', 'Access_To_Healthcare', 'Education_Level',
       'Income_Level', 'Heart_Attack_Outcome'],
      dtype='object')

In [5]:
# Show the dtype of each column; `object` columns will need encoding, numeric ones scaling.
data.dtypes

Country                     object
Age                          int64
Gender                      object
Cholesterol_Level          float64
Blood_Pressure             float64
Smoking_History             object
Alcohol_Consumption        float64
Physical_Activity           object
Obesity                     object
Diabetes                    object
Family_History              object
Stress_Levels              float64
Dietary_Habits              object
Heart_Attack_History        object
Chest_Pain                  object
Exercise_Induced_Angina     object
Resting_ECG                 object
Max_Heart_Rate_Achieved      int64
Thalassemia                 object
HDL_Cholesterol            float64
LDL_Cholesterol            float64
Triglycerides              float64
Heart_Disease_Risk          object
Medication_Adherence        object
Urbanization_Level          object
Air_Pollution_Exposure     float64
Access_To_Healthcare        object
Education_Level             object
Income_Level        

In [6]:
# Gather the names of all columns stored with the generic `object` dtype
# and print them for reference.
object_frame = data.select_dtypes(include=['object'])
object_columns = list(object_frame.columns)
print("Object columns:", object_columns)

Object columns: ['Country', 'Gender', 'Smoking_History', 'Physical_Activity', 'Obesity', 'Diabetes', 'Family_History', 'Dietary_Habits', 'Heart_Attack_History', 'Chest_Pain', 'Exercise_Induced_Angina', 'Resting_ECG', 'Thalassemia', 'Heart_Disease_Risk', 'Medication_Adherence', 'Urbanization_Level', 'Access_To_Healthcare', 'Education_Level', 'Income_Level', 'Heart_Attack_Outcome']


In [7]:
# Group the object-typed feature columns by the kind of encoding they need.

# Unordered categories with more than two levels.
Nominal_data = [
    'Country',
    'Thalassemia',
]

# Two-level categories.
# Resting_ECG abnormal and normal rather than yes and no
Binary_Data = [
    'Gender',
    'Obesity',
    'Diabetes',
    'Family_History',
    'Heart_Attack_History',
    'Chest_Pain',
    'Exercise_Induced_Angina',
    'Resting_ECG',
    'Medication_Adherence',
]

# Categories whose levels have a natural order.
Ordinal_Data = [
    'Smoking_History',
    'Physical_Activity',
    'Dietary_Habits',
    'Heart_Disease_Risk',
    'Urbanization_Level',
    'Access_To_Healthcare',
    'Education_Level',
    'Income_Level',
]

In [8]:
# One OrdinalEncoder per ordinal column. Each category list fixes the order of
# the integer codes: the first listed level becomes 0, the next 1, and so on.
# OrdinalEncoder is already imported in the notebook's import cell, so it is
# not re-imported here.
smoking_encoder = OrdinalEncoder(categories=[['Current', 'Former', 'Never']])
activity_encoder = OrdinalEncoder(categories=[['Sedentary', 'Moderate', 'Active']])
dietary_encoder = OrdinalEncoder(categories=[['Unhealthy', 'Moderate', 'Healthy']])
heart_disease_risk_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
urbanization_encoder = OrdinalEncoder(categories=[['Urban', 'Suburban', 'Rural']])
access_to_healthcare_encoder = OrdinalEncoder(categories=[['Poor', 'Average', 'Good']])

# Education_Level shows blank cells in the data preview, i.e. missing values
# (NOTE(review): assumes these load as NaN — confirm with
# data['Education_Level'].unique()). The string 'nan' never matches a real NaN,
# so fitting would reject the missing entries as unknown categories.
# scikit-learn requires NaN to be the last user-provided category;
# encoded_missing_value=-1 (scikit-learn >= 1.1) keeps "missing" ranked below
# 'Primary' (codes: missing=-1, Primary=0, Secondary=1, Tertiary=2).
education_encoder = OrdinalEncoder(
    categories=[['Primary', 'Secondary', 'Tertiary', np.nan]],
    encoded_missing_value=-1,
)
income_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])

In [29]:
# List the distinct Thalassemia values so the encoder's category list can match them.
data['Thalassemia'].unique()

array(['Reversible Defect', 'Normal', 'Fixed Defect'], dtype=object)

In [30]:
# One-hot encoders for the nominal (unordered) columns, each with an explicit
# list of the levels it should expect.
country_levels = [
    'Germany', 'Egypt', 'Spain', 'Canada', 'France',
    'Pakistan', 'Nigeria', 'Vietnam', 'Philippines', 'Brazil',
    'China', 'Bangladesh', 'India', 'South Africa', 'United Kingdom',
    'United States', 'South Korea', 'Turkey', 'Australia', 'Indonesia',
    'Russia', 'Japan', 'Saudi Arabia', 'Italy', 'Mexico',
]
thalassemia_levels = ['Reversible Defect', 'Normal', 'Fixed Defect']

country_ohe = OneHotEncoder(categories=[country_levels])
thalassemia_ohe = OneHotEncoder(categories=[thalassemia_levels])

In [27]:
# One-hot encoders for the two-level columns, each with its pair of levels
# spelled out explicitly.
def _binary_ohe(first_level, second_level):
    """Return a OneHotEncoder whose single feature has exactly these two levels, in this order."""
    return OneHotEncoder(categories=[[first_level, second_level]])


gender_ohe = _binary_ohe('Male', 'Female')
obesity_ohe = _binary_ohe('No', 'Yes')
diabetes_ohe = _binary_ohe('No', 'Yes')
family_history_ohe = _binary_ohe('No', 'Yes')
heart_attack_history_ohe = _binary_ohe('No', 'Yes')
chest_pain_ohe = _binary_ohe('No', 'Yes')
exercise_induced_angina_ohe = _binary_ohe('No', 'Yes')
# Resting_ECG is labelled Abnormal/Normal rather than No/Yes.
resting_ecg_ohe = _binary_ohe('Abnormal', 'Normal')
medication_adherence_ohe = _binary_ohe('No', 'Yes')

In [17]:
# (Disabled draft) Dict-based alternative: one OrdinalEncoder per ordinal column, keyed by column name
#ordinal_encoders = {
 #   'Smoking_History': OrdinalEncoder(categories=[['Current', 'Former', 'Never']]),
  #  'Physical_Activity': OrdinalEncoder(categories=[['Sedentary', 'Moderate', 'Active']]),
   # 'Dietary_Habits': OrdinalEncoder(categories=[['Unhealthy', 'Moderate', 'Healthy']]),
    #'Heart_Disease_Risk': OrdinalEncoder(categories=[['Low', 'Medium', 'High']]),
#    'Urbanization_Level': OrdinalEncoder(categories=[['Urban', 'Suburban', 'Rural']]),
 #   'Access_To_Healthcare': OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]),
  #  'Education_Level': OrdinalEncoder(categories=[['NaN', 'Primary', 'Secondary', 'Tertiary']]),
   # 'Income_Level': OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
#}

In [18]:
# (Disabled draft) Fit each encoder from the dict on its column and overwrite that column in place
#for column, encoder in ordinal_encoders.items():
 #   data[column] = encoder.fit_transform(data[[column]])

ValueError: could not convert string to float: 'Current'

In [9]:
# Standardise every numeric column with StandardScaler, overwriting the
# original values in `data`.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so held-out rows contribute to the fitted statistics —
# consider fitting on the training portion only.
scaler = StandardScaler()
numerical_columns = list(data.select_dtypes(include=[np.number]).columns)
scaled_values = scaler.fit_transform(data[numerical_columns])
data[numerical_columns] = scaled_values

In [14]:
# Separate the target column from the features, then hold out a test set.
target_column = 'Heart_Attack_Outcome'
X = data.drop(columns=target_column)
y = data[target_column]

# The split proportions are left at train_test_split's defaults; random_state
# pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Load a second CSV (Resources/heart.csv) into its own DataFrame, separate from `data`.
heart_data = pd.read_csv('Resources/heart.csv')

In [11]:
# Show the column labels of the second dataset.
heart_data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [13]:
# Show the dtype of each column in the second dataset.
heart_data.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object