### Import Packages and Fetch Data

In [None]:
# Setup
import numpy as np
import pandas as pd
import os

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set the default font sizes for axis labels (14) and x/y tick labels (12)
# so they apply to the plots drawn later in the notebook.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: a directory below the project root.
# It is created up front so save_fig can write into it directly;
# exist_ok=True keeps re-running this cell from raising if it already exists.
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/after_preprocessing_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name of the image, without extension.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension; also passed to ``plt.savefig`` as
            the output format.
        resolution: Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Importing the dataset
pre_heart = pd.read_csv('dataset/BRFSS2020.csv')

# Rename the columns used in this analysis to readable names (the subset of
# columns is taken in a later cell). errors='raise' makes a missing or
# misspelled source column fail here, instead of being skipped silently and
# surfacing later as a KeyError when the renamed columns are selected.
pre_heart.rename(
    columns={
        '_michd': 'heartDisease',
        '_bmi5': 'BMI',
        'smoke100': 'smoking',
        '_rfdrhv7': 'alcoholDrinking',
        'cvdstrk3': 'stroke',
        'physhlth': 'physicalHealth',
        'menthlth': 'mentalHealth',
        'diffwalk': 'diffWalking',
        'sexvar': 'sex',
        '_ageg5yr': 'ageCategory',
        '_imprace': 'race',
        'diabete4': 'diabetic',
        '_totinda': 'physicalActivity',
        'genhlth': 'genHealth',
        'sleptim1': 'sleepTime',
        'asthma3': 'asthma',
        'chckdny2': 'kidneyDisease',
        'chcscncr': 'skinCancer',
    },
    errors='raise',
    inplace=True,
)

# Preview the first rows to check the new column names
pre_heart.head()

In [None]:
# Build the working dataset from the renamed columns only; .copy() makes it
# an independent frame, so later edits to all_heart do not touch pre_heart.
all_heart = pre_heart.loc[
    :,
    [
        'heartDisease',
        'BMI',
        'smoking',
        'alcoholDrinking',
        'stroke',
        'physicalHealth',
        'mentalHealth',
        'diffWalking',
        'sex',
        'ageCategory',
        'race',
        'diabetic',
        'physicalActivity',
        'genHealth',
        'sleepTime',
        'asthma',
        'kidneyDisease',
        'skinCancer',
    ],
].copy()
all_heart.info()

In [7]:
# Summary statistics of the selected columns.
# NOTE(review): several columns reach 9, 88 or 99 and BMI is in the thousands,
# which looks like coded "no answer" values / scaled numbers rather than real
# measurements; confirm against the BRFSS codebook before relying on mean/std.
all_heart.describe()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
count,398387.0,360601.0,384098.0,401958.0,401955.0,401953.0,401953.0,386678.0,401958.0,401958.0,401958.0,401952.0,401958.0,401950.0,401955.0,401955.0,401952.0,401955.0
mean,1.914247,2830.631271,1.631826,1.670433,1.977167,66.143629,61.45374,1.873326,1.542385,7.667363,1.715883,2.763459,1.250894,2.452947,7.944698,1.883748,1.98304,1.926703
std,0.28,638.164868,0.703787,2.12056,0.359852,34.827121,36.951143,0.507526,0.498201,3.660209,1.464755,0.735095,0.531942,1.085778,7.973949,0.473024,0.387724,0.426623
min,1.0,1202.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2399.0,1.0,1.0,2.0,30.0,15.0,2.0,1.0,5.0,1.0,3.0,1.0,2.0,6.0,2.0,2.0,2.0
50%,2.0,2732.0,2.0,1.0,2.0,88.0,88.0,2.0,2.0,8.0,1.0,3.0,1.0,2.0,7.0,2.0,2.0,2.0
75%,2.0,3138.0,2.0,1.0,2.0,88.0,88.0,2.0,2.0,11.0,1.0,3.0,1.0,3.0,8.0,2.0,2.0,2.0
max,2.0,9843.0,9.0,9.0,9.0,99.0,99.0,9.0,2.0,14.0,6.0,9.0,9.0,9.0,99.0,9.0,9.0,9.0


### Visualizing the Data

### Experimenting with Attribute Combinations