In [1]:
# Import Dependencies
import pandas as pd
from path import Path

In [2]:
# Load the raw survey CSV from the Resources folder into a pandas DataFrame
# and preview its first five rows.
file_path = Path("Resources/heart_2020_cleaned.csv")
heart_df = pd.read_csv(filepath_or_buffer=file_path)
heart_df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [3]:
# Label the row index as Respondent_ID so it can later be turned into a column
# with that name.
heart_df.index = heart_df.index.rename('Respondent_ID')
heart_df.head(n=5)

Unnamed: 0_level_0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
Respondent_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [4]:
# Move the Respondent_ID index into a regular column.
# The guard keeps this cell safe to re-run: resetting the index a second time
# would either add a stray "index" column or fail because Respondent_ID
# already exists as a column. Rebinding the name (instead of inplace=True)
# leaves the result identical on the first run.
if 'Respondent_ID' not in heart_df.columns:
    heart_df = heart_df.reset_index()
heart_df.head()

Unnamed: 0,Respondent_ID,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,No,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No


In [5]:
# Build the Demographics table from an independent copy of the relevant
# heart_df columns.
demographic_columns = ['Respondent_ID', 'HeartDisease', 'Sex', 'AgeCategory', 'Race']
demographics_df = heart_df.loc[:, demographic_columns].copy()
demographics_df.head(n=5)

Unnamed: 0,Respondent_ID,HeartDisease,Sex,AgeCategory,Race
0,0,No,Female,55-59,White
1,1,No,Female,80 or older,White
2,2,No,Male,65-69,White
3,3,No,Female,75-79,White
4,4,No,Female,40-44,White


In [7]:
# Give the Demographics columns their final names:
#   Respondent_ID -> RespondentID, AgeCategory -> AgeRange
demographics_column_names = {'Respondent_ID': 'RespondentID', 'AgeCategory': 'AgeRange'}
demographics_df = demographics_df.rename(columns=demographics_column_names)
demographics_df.head(n=5)

Unnamed: 0,RespondentID,HeartDisease,Sex,AgeRange,Race
0,0,No,Female,55-59,White
1,1,No,Female,80 or older,White
2,2,No,Male,65-69,White
3,3,No,Female,75-79,White
4,4,No,Female,40-44,White


In [8]:
# Remove HeartDisease from the Demographics table (it is kept in the
# Morbidity table instead).
demographics_df = demographics_df.drop('HeartDisease', axis='columns')
demographics_df.head(n=5)

Unnamed: 0,RespondentID,Sex,AgeRange,Race
0,0,Female,55-59,White
1,1,Female,80 or older,White
2,2,Male,65-69,White
3,3,Female,75-79,White
4,4,Female,40-44,White


In [9]:
# Check the data types for the Demographics table.
# Displays the dtype pandas assigned to each remaining column so the column
# types can be reviewed before the table is exported.
demographics_df.dtypes

RespondentID     int64
Sex             object
AgeRange        object
Race            object
dtype: object

In [10]:
# Create the Health Metrics Table.
# .copy() gives the table its own data, independent of heart_df, matching how
# the Demographics table is built; later cells assign new values into its
# columns, which should never be ambiguous about touching heart_df.
health_metrics_df = heart_df[['Respondent_ID', 'HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'PhysicalHealth', 'MentalHealth', 'PhysicalActivity', 'GenHealth', 'SleepTime']].copy()
health_metrics_df.head()

Unnamed: 0,Respondent_ID,HeartDisease,BMI,Smoking,AlcoholDrinking,PhysicalHealth,MentalHealth,PhysicalActivity,GenHealth,SleepTime
0,0,No,16.6,Yes,No,3,30,Yes,Very good,5
1,1,No,20.34,No,No,0,0,Yes,Very good,7
2,2,No,26.58,Yes,No,20,30,Yes,Fair,8
3,3,No,24.21,No,No,0,0,No,Good,6
4,4,No,23.71,No,No,28,0,Yes,Very good,8


In [12]:
# Give the Health Metrics columns their final, more descriptive names.
health_metrics_column_names = {
    'Respondent_ID': 'RespondentID',
    'PhysicalHealth': 'HealthyPhysicalDays',
    'MentalHealth': 'HealthyMentalDays',
    'GenHealth': 'GeneralHealth',
    'SleepTime': 'SleepHours',
}
health_metrics_df = health_metrics_df.rename(columns=health_metrics_column_names)
health_metrics_df.head(n=5)

Unnamed: 0,RespondentID,HeartDisease,BMI,Smoking,AlcoholDrinking,HealthyPhysicalDays,HealthyMentalDays,PhysicalActivity,GeneralHealth,SleepHours
0,0,No,16.6,Yes,No,3,30,Yes,Very good,5
1,1,No,20.34,No,No,0,0,Yes,Very good,7
2,2,No,26.58,Yes,No,20,30,Yes,Fair,8
3,3,No,24.21,No,No,0,0,No,Good,6
4,4,No,23.71,No,No,28,0,Yes,Very good,8


In [13]:
# Remove HeartDisease from the Health Metrics table (it is kept in the
# Morbidity table instead).
health_metrics_df = health_metrics_df.drop('HeartDisease', axis='columns')
health_metrics_df.head(n=5)

Unnamed: 0,RespondentID,BMI,Smoking,AlcoholDrinking,HealthyPhysicalDays,HealthyMentalDays,PhysicalActivity,GeneralHealth,SleepHours
0,0,16.6,Yes,No,3,30,Yes,Very good,5
1,1,20.34,No,No,0,0,Yes,Very good,7
2,2,26.58,Yes,No,20,30,Yes,Fair,8
3,3,24.21,No,No,0,0,No,Good,6
4,4,23.71,No,No,28,0,Yes,Very good,8


In [14]:
# View the data types from the Health Metrics table.
# Shows which columns are still stored as generic `object` values and are
# therefore candidates for conversion to a more specific type.
health_metrics_df.dtypes

RespondentID             int64
BMI                    float64
Smoking                 object
AlcoholDrinking         object
HealthyPhysicalDays      int64
HealthyMentalDays        int64
PhysicalActivity        object
GeneralHealth           object
SleepHours               int64
dtype: object

In [15]:
# Change the data types to best fit the data in the Health metrics table:
# the Yes/No survey answers become real booleans.
#
# Series.map turns every value that is not a key of the dictionary into NaN,
# so the mapping is only applied to columns that are not boolean yet. Without
# this guard, re-running the cell would push True/False through the
# dictionary and wipe the columns out.
conversion_dict = {'Yes': True, 'No': False}

for column in ["Smoking", "AlcoholDrinking", "PhysicalActivity"]:
    if not pd.api.types.is_bool_dtype(health_metrics_df[column]):
        health_metrics_df[column] = health_metrics_df[column].map(conversion_dict)

health_metrics_df.head(25)

Unnamed: 0,RespondentID,BMI,Smoking,AlcoholDrinking,HealthyPhysicalDays,HealthyMentalDays,PhysicalActivity,GeneralHealth,SleepHours
0,0,16.6,True,False,3,30,True,Very good,5
1,1,20.34,False,False,0,0,True,Very good,7
2,2,26.58,True,False,20,30,True,Fair,8
3,3,24.21,False,False,0,0,False,Good,6
4,4,23.71,False,False,28,0,True,Very good,8
5,5,28.87,True,False,6,0,False,Fair,12
6,6,21.63,False,False,15,0,True,Fair,4
7,7,31.64,True,False,5,0,False,Good,9
8,8,26.45,False,False,0,0,False,Fair,5
9,9,40.69,False,False,0,0,True,Good,10


In [16]:
# Check the data types from the health metrics table.
# Run after the Yes/No conversion to confirm that Smoking, AlcoholDrinking and
# PhysicalActivity are no longer `object` columns.
health_metrics_df.dtypes

RespondentID             int64
BMI                    float64
Smoking                   bool
AlcoholDrinking           bool
HealthyPhysicalDays      int64
HealthyMentalDays        int64
PhysicalActivity          bool
GeneralHealth           object
SleepHours               int64
dtype: object

In [17]:
# Create the Morbidity Table.
# .copy() gives the table its own data, independent of heart_df, matching how
# the Demographics table is built; later cells assign new values into its
# columns, which should never be ambiguous about touching heart_df.
morbidity_df = heart_df[['Respondent_ID', 'HeartDisease', 'Stroke', 'DiffWalking', 'Diabetic', 'Asthma', 'KidneyDisease', 'SkinCancer']].copy()
morbidity_df.head()

Unnamed: 0,Respondent_ID,HeartDisease,Stroke,DiffWalking,Diabetic,Asthma,KidneyDisease,SkinCancer
0,0,No,No,No,Yes,Yes,No,Yes
1,1,No,Yes,No,No,No,No,No
2,2,No,No,No,Yes,Yes,No,No
3,3,No,No,No,No,No,No,Yes
4,4,No,No,Yes,No,No,No,No


In [19]:
# Give the Morbidity columns their final, more descriptive names.
morbidity_column_names = {
    'Respondent_ID': 'RespondentID',
    'DiffWalking': 'DifficultyWalking',
    'Diabetic': 'Diabetes',
}
morbidity_df = morbidity_df.rename(columns=morbidity_column_names)
morbidity_df.head(n=5)

Unnamed: 0,RespondentID,HeartDisease,Stroke,DifficultyWalking,Diabetes,Asthma,KidneyDisease,SkinCancer
0,0,No,No,No,Yes,Yes,No,Yes
1,1,No,Yes,No,No,No,No,No
2,2,No,No,No,Yes,Yes,No,No
3,3,No,No,No,No,No,No,Yes
4,4,No,No,Yes,No,No,No,No


In [20]:
# View the data types for the morbidity table.
# Shows which columns are still stored as generic `object` values and are
# therefore candidates for conversion to a more specific type.
morbidity_df.dtypes

RespondentID          int64
HeartDisease         object
Stroke               object
DifficultyWalking    object
Diabetes             object
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [21]:
# Change the data types to best fit the data in the morbidity table:
# the Yes/No answers become real booleans. Diabetes is not in the list below
# and is therefore left as it is.
#
# Series.map turns every value that is not a key of the dictionary into NaN,
# so the mapping is only applied to columns that are not boolean yet. Without
# this guard, re-running the cell would push True/False through the
# dictionary and wipe the columns out.
conversion_dict = {'Yes': True, 'No': False}

for column in ["HeartDisease", "Stroke", "DifficultyWalking",
               "Asthma", "KidneyDisease", "SkinCancer"]:
    if not pd.api.types.is_bool_dtype(morbidity_df[column]):
        morbidity_df[column] = morbidity_df[column].map(conversion_dict)

morbidity_df.head(25)

Unnamed: 0,RespondentID,HeartDisease,Stroke,DifficultyWalking,Diabetes,Asthma,KidneyDisease,SkinCancer
0,0,False,False,False,Yes,True,False,True
1,1,False,True,False,No,False,False,False
2,2,False,False,False,Yes,True,False,False
3,3,False,False,False,No,False,False,True
4,4,False,False,True,No,False,False,False
5,5,True,False,True,No,False,False,False
6,6,False,False,False,No,True,False,True
7,7,False,False,True,Yes,True,False,False
8,8,False,False,False,"No, borderline diabetes",False,True,False
9,9,False,False,True,No,False,False,False


In [22]:
# Check the data types for the morbidity table.
# Run after the Yes/No conversion to confirm which columns are no longer
# stored as `object`.
morbidity_df.dtypes


RespondentID          int64
HeartDisease           bool
Stroke                 bool
DifficultyWalking      bool
Diabetes             object
Asthma                 bool
KidneyDisease          bool
SkinCancer             bool
dtype: object

In [23]:
# Read the full, updated heart disease dataset from the Resources folder and
# preview its first five rows.
file_path = Path("Resources/Heart_clean.csv")
final_heart_df = pd.read_csv(filepath_or_buffer=file_path)
final_heart_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Sex_Female,Sex_Male,AgeCategory_18-24,...,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
0,59737,34.96,3,0,8,0,1,0,1,0,...,1,0,0,0,1,0,1,0,1,0
1,277485,25.1,0,0,9,0,1,0,1,0,...,0,1,0,0,1,0,1,0,0,1
2,58188,24.33,0,0,8,0,1,0,1,0,...,0,0,0,1,1,0,0,1,1,0
3,138740,25.4,0,0,10,0,1,0,1,0,...,0,0,0,1,1,0,1,0,1,0
4,299401,38.09,15,30,6,0,1,1,0,0,...,0,0,1,0,0,1,0,1,1,0


In [24]:
# View the data types from the final heart dataframe.
# Displays the dtype pandas assigned to each column of the loaded file.
final_heart_df.dtypes

Unnamed: 0                               int64
BMI                                    float64
PhysicalHealth                           int64
MentalHealth                             int64
SleepTime                                int64
HeartDisease_No                          int64
HeartDisease_Yes                         int64
Sex_Female                               int64
Sex_Male                                 int64
AgeCategory_18-24                        int64
AgeCategory_25-29                        int64
AgeCategory_30-34                        int64
AgeCategory_35-39                        int64
AgeCategory_40-44                        int64
AgeCategory_45-49                        int64
AgeCategory_50-54                        int64
AgeCategory_55-59                        int64
AgeCategory_60-64                        int64
AgeCategory_65-69                        int64
AgeCategory_70-74                        int64
AgeCategory_75-79                        int64
AgeCategory_8

In [25]:
# Lift pandas' limit on displayed columns (None = no limit) so that every
# column of the final heart dataframe is shown in the preview.
pd.options.display.max_columns = None
final_heart_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_No,HeartDisease_Yes,Sex_Female,Sex_Male,AgeCategory_18-24,AgeCategory_25-29,AgeCategory_30-34,AgeCategory_35-39,AgeCategory_40-44,AgeCategory_45-49,AgeCategory_50-54,AgeCategory_55-59,AgeCategory_60-64,AgeCategory_65-69,AgeCategory_70-74,AgeCategory_75-79,AgeCategory_80 or older,Smoking_No,Smoking_Yes,Stroke_No,Stroke_Yes,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_No,PhysicalActivity_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,DiffWalking_No,DiffWalking_Yes,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
0,59737,34.96,3,0,8,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0
1,277485,25.1,0,0,9,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1
2,58188,24.33,0,0,8,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0
3,138740,25.4,0,0,10,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,1,0
4,299401,38.09,15,30,6,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,1,0


In [26]:
# Preview the first five rows of the Demographics table.
demographics_df.head(n=5)

Unnamed: 0,RespondentID,Sex,AgeRange,Race
0,0,Female,55-59,White
1,1,Female,80 or older,White
2,2,Male,65-69,White
3,3,Female,75-79,White
4,4,Female,40-44,White


In [27]:
# Preview the first five rows of the Health Metrics table.
health_metrics_df.head(n=5)

Unnamed: 0,RespondentID,BMI,Smoking,AlcoholDrinking,HealthyPhysicalDays,HealthyMentalDays,PhysicalActivity,GeneralHealth,SleepHours
0,0,16.6,True,False,3,30,True,Very good,5
1,1,20.34,False,False,0,0,True,Very good,7
2,2,26.58,True,False,20,30,True,Fair,8
3,3,24.21,False,False,0,0,False,Good,6
4,4,23.71,False,False,28,0,True,Very good,8


In [28]:
# Preview the first five rows of the Morbidity table.
morbidity_df.head(n=5)

Unnamed: 0,RespondentID,HeartDisease,Stroke,DifficultyWalking,Diabetes,Asthma,KidneyDisease,SkinCancer
0,0,False,False,False,Yes,True,False,True
1,1,False,True,False,No,False,False,False
2,2,False,False,False,Yes,True,False,False
3,3,False,False,False,No,False,False,True
4,4,False,False,True,No,False,False,False


In [29]:
# Write the Demographics table to a CSV file, leaving out the DataFrame index.
export_path = "Resources/Demographics_df.csv"
demographics_df.to_csv(path_or_buf=export_path, index=False)

In [30]:
# Write the Health Metrics table to a CSV file, leaving out the DataFrame index.
export_path_2 = "Resources/HealthMetrics_df.csv"
health_metrics_df.to_csv(path_or_buf=export_path_2, index=False)

In [31]:
# Write the Morbidity table to a CSV file, leaving out the DataFrame index.
export_path_3 = "Resources/Morbidity_df.csv"
morbidity_df.to_csv(path_or_buf=export_path_3, index=False)