# Final Clean data
- Drop Index
- Convert Age to years
- Calculate BMI (used only for filtering; it is not kept as a feature because it is strongly correlated with height and weight)
- Keep only rows within humanly possible ranges:
	- ap_hi [40,200]
	- ap_lo [40,140]
	- BMI [10,80]

In [1]:
import pandas as pd

# Load the raw dataset and work on a copy so that `cardio_raw` stays untouched
cardio_raw = pd.read_csv('cardio_train.csv', sep=';')
cardio = cardio_raw.copy()

# Numerical and categorical attributes
num_attribs = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
cat_attribs = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Convert the data type of categorical attributes into "category"
for cat_attrib in cat_attribs:
    cardio[cat_attrib] = cardio[cat_attrib].astype('category')

# Remove the id column
cardio = cardio.drop(columns='id')

# Convert the age into whole years (floor division, so the column becomes float).
# NOTE(review): presumably the raw age is expressed in days -- confirm against
# the dataset description.
cardio['age'] = cardio['age'] // 365.25

# Convert gender to a binary categorical attribute by shifting its codes down by one
# Male = 1, Female = 0
# NOTE(review): this assumes the raw file encodes gender as 1/2 -- confirm.
cardio['gender'] = (cardio['gender'].astype('int') - 1).astype('category')

# Extract Systolic (ap_hi) and Diastolic (ap_lo) blood pressure features
ap_hi = cardio['ap_hi']
ap_lo = cardio['ap_lo']

# Flag rows with implausible blood pressure:
#   - ap_hi outside [40, 200]
#   - ap_lo outside [40, 140]
#   - ap_lo greater than ap_hi
# Every condition is evaluated against the same (not yet filtered) frame, so the
# mask always lines up with `cardio` and no implicit index realignment is needed.
invalid_bp = (
    (ap_hi < 40) | (ap_hi > 200)
    | (ap_lo < 40) | (ap_lo > 140)
    | (ap_lo > ap_hi)
)

# Keep the complement of the mask (rows with missing values compare False and
# are therefore kept); the explicit copy makes the result an independent frame
# before a new column is assigned to it below.
cardio = cardio.loc[~invalid_bp].copy()

# Calculate BMI for every patient: weight / (height / 100) ** 2
# NOTE(review): assumes weight is in kg and height in cm -- confirm.
cardio['bmi'] = cardio['weight'] / (cardio['height'] / 100) ** 2

# Extract BMI (bmi) feature
bmi = cardio['bmi']

# Discard rows whose BMI is lower than 10 or higher than 80
invalid_bmi = (bmi < 10) | (bmi > 80)
cardio = cardio.loc[~invalid_bmi]

# BMI was only needed for filtering; drop it from the final feature set
cardio = cardio.drop(columns=['bmi'])


cardio.to_csv("data.csv",index = False) ## Export for Application
