In [9]:
# Import dependencies.
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the source CSV from its relative path into a dataframe.
df = pd.read_csv(filepath_or_buffer='2022/heart_2022_no_nans.csv')

# Report how many rows were loaded.
print(df.shape[0])

# Preview the first five rows.
df.head(5)

246022


Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [3]:
# Shrink the dataset by drawing a fixed number of rows at random, never
# picking the same row twice. The fixed seed makes the draw repeatable, and
# ignore_index renumbers the sampled rows from 0.
sample_size = 25000
sample_df = df.sample(
    n=sample_size,
    random_state=1,
    replace=False,
    ignore_index=True,
)

# Preview the first five sampled rows.
sample_df.head(5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Washington,Female,Very good,0.0,5.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.7,113.4,39.16,No,Yes,Yes,No,"Yes, received Tdap",Yes,No
1,Florida,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,None of them,No,...,1.88,99.79,28.25,No,Yes,No,No,"Yes, received tetanus shot but not sure what type",No,Yes
2,Utah,Female,Fair,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.65,86.18,31.62,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
3,South Carolina,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,None of them,No,...,1.8,81.65,25.1,Yes,No,No,Yes,"Yes, received tetanus shot but not sure what type",No,No
4,North Dakota,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,7.0,1 to 5,No,...,1.78,108.86,34.44,Yes,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No


In [6]:
# Columns kept for model training; every other column of the sample is dropped.
model_columns = [
    'Sex',
    'PhysicalHealthDays',
    'MentalHealthDays',
    'LastCheckupTime',
    'PhysicalActivities',
    'SleepHours',
    'HadHeartAttack',
    'RemovedTeeth',
    'HadAngina',
    'HadStroke',
    'SmokerStatus',
    'AgeCategory',
    'HeightInMeters',
    'WeightInKilograms',
    'AlcoholDrinkers',
]

# Narrow the sampled dataframe to those columns, in the order listed above.
reduced_df = sample_df[model_columns]

# Preview the first five rows of the narrowed dataframe.
reduced_df.head(5)

Unnamed: 0,Sex,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,RemovedTeeth,HadAngina,HadStroke,SmokerStatus,AgeCategory,HeightInMeters,WeightInKilograms,AlcoholDrinkers
0,Female,0.0,5.0,Within past year (anytime less than 12 months ...,Yes,7.0,No,1 to 5,No,No,Never smoked,Age 35 to 39,1.7,113.4,No
1,Male,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,No,None of them,No,No,Former smoker,Age 25 to 29,1.88,99.79,No
2,Female,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,No,None of them,No,No,Never smoked,Age 25 to 29,1.65,86.18,No
3,Male,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,No,None of them,No,No,Never smoked,Age 70 to 74,1.8,81.65,Yes
4,Male,0.0,0.0,Within past year (anytime less than 12 months ...,No,7.0,No,1 to 5,No,No,Current smoker - now smokes every day,Age 50 to 54,1.78,108.86,Yes


In [11]:
# Scale numerical data.
# The numeric columns are listed once so the scaler input and the labels of
# the rebuilt dataframe cannot drift apart.
numeric_columns = [
    "PhysicalHealthDays",
    "MentalHealthDays",
    "SleepHours",
    "HeightInMeters",
    "WeightInKilograms",
]

# Standardize the numeric columns with a freshly fitted StandardScaler.
scaled_data = StandardScaler().fit_transform(reduced_df[numeric_columns])

# Rebuild a labelled dataframe from the scaled values. Reusing reduced_df's
# index (instead of a fresh 0..n-1 index) keeps each scaled row tied to its
# source row, so the later axis=1 concat with the encoded columns aligns
# correctly even if the sample's index is not a plain 0..n-1 range.
scaled_df = pd.DataFrame(
    scaled_data,
    columns=numeric_columns,
    index=reduced_df.index,
)
scaled_df.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms
0,-0.48774,0.113974,-0.025174,-0.061718,1.404549
1,-0.48774,-0.508455,0.674879,1.619866,0.760563
2,-0.48774,-0.508455,-0.725227,-0.528825,0.116577
3,-0.48774,-0.508455,0.674879,0.872496,-0.097769
4,-0.48774,-0.508455,-0.025174,0.685653,1.189729


In [14]:
# Convert the categorical columns to 0/1 indicator columns using one-hot encoding.
categorical_columns = [
    'Sex',
    'LastCheckupTime',
    'PhysicalActivities',
    'HadHeartAttack',
    'RemovedTeeth',
    'HadAngina',
    'HadStroke',
    'SmokerStatus',
    'AgeCategory',
    'AlcoholDrinkers',
]

# drop_first=True omits the first level of each column, so a k-level column
# yields k-1 indicators.
# dtype is pinned to uint8 (the pandas 1.x default) because pandas 2.0 changed
# the default to bool; without the pin a re-run on newer pandas would produce
# True/False instead of the 0/1 values shown in this notebook's output.
encoded_df = pd.get_dummies(
    reduced_df[categorical_columns],
    drop_first=True,
    dtype="uint8",
)
encoded_df.head()

Unnamed: 0,Sex_Male,LastCheckupTime_Within past 2 years (1 year but less than 2 years ago),LastCheckupTime_Within past 5 years (2 years but less than 5 years ago),LastCheckupTime_Within past year (anytime less than 12 months ago),PhysicalActivities_Yes,HadHeartAttack_Yes,"RemovedTeeth_6 or more, but not all",RemovedTeeth_All,RemovedTeeth_None of them,HadAngina_Yes,...,AgeCategory_Age 40 to 44,AgeCategory_Age 45 to 49,AgeCategory_Age 50 to 54,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,AlcoholDrinkers_Yes
0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [15]:
# Place the scaled numeric columns and the one-hot encoded columns side by
# side. Column-wise concat matches rows by index label, so both inputs must
# share the same index for the rows to line up.
cleaned_df = pd.concat(objs=[scaled_df, encoded_df], axis="columns")

# Preview the first five rows of the combined dataframe.
cleaned_df.head(5)

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,Sex_Male,LastCheckupTime_Within past 2 years (1 year but less than 2 years ago),LastCheckupTime_Within past 5 years (2 years but less than 5 years ago),LastCheckupTime_Within past year (anytime less than 12 months ago),PhysicalActivities_Yes,...,AgeCategory_Age 40 to 44,AgeCategory_Age 45 to 49,AgeCategory_Age 50 to 54,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,AlcoholDrinkers_Yes
0,-0.48774,0.113974,-0.025174,-0.061718,1.404549,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,-0.48774,-0.508455,0.674879,1.619866,0.760563,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,-0.48774,-0.508455,-0.725227,-0.528825,0.116577,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,-0.48774,-0.508455,0.674879,0.872496,-0.097769,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,-0.48774,-0.508455,-0.025174,0.685653,1.189729,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [16]:
# Write the preprocessed dataframe to a CSV file in the working directory.
# The row index is written as well (to_csv's default), so the file's first
# column is the unnamed index.
cleaned_df.to_csv(path_or_buf='processed_data.csv')