# Data Preprocessing

In [34]:
import pandas as pd
import numpy as np

## Read Dataset

In [35]:
df = pd.read_csv("ehresp_2014.csv")
df.head()

Unnamed: 0,tucaseid,tulineno,eeincome1,erbmi,erhhch,erincome,erspemch,ertpreat,ertseat,ethgt,...,eumeat,eumilk,euprpmel,eusoda,eustores,eustreason,eutherm,euwgt,euwic,exincome1
0,20140101140007,1,-2,33.200001,1,-1,-1,30,2,0,...,1,2,1,-1,2,1,2,170,1,2
1,20140101140011,1,1,22.700001,3,1,-1,45,14,0,...,1,2,1,-1,1,2,2,128,2,0
2,20140101140028,1,2,49.400002,3,5,-1,60,0,0,...,-1,-1,2,2,-1,-1,-1,270,2,12
3,20140101140063,1,-2,-1.0,3,-1,-1,0,0,0,...,2,2,1,1,2,6,-1,-2,2,2
4,20140101140168,1,2,31.0,3,5,-1,65,0,0,...,1,2,1,2,1,1,2,210,1,0


## Filter Columns

In [36]:
df_new = df[['erbmi','ertpreat','eudrink','eueat','euexercise','eufastfd','eufdsit','euprpmel','eusoda','eugenhth']]
df_new.to_csv('intermediate_data.csv')
df_new.head()

Unnamed: 0,erbmi,ertpreat,eudrink,eueat,euexercise,eufastfd,eufdsit,euprpmel,eusoda,eugenhth
0,33.200001,30,2,1,2,2,1,1,-1,1
1,22.700001,45,2,1,2,1,1,1,-1,2
2,49.400002,60,1,2,2,2,1,2,2,5
3,-1.0,0,1,2,2,2,1,1,1,2
4,31.0,65,1,2,1,2,1,1,2,4


## Drop NaN rows

In [37]:
df_new = pd.read_csv('intermediate_data.csv',na_values=['-1','-2','-3'])
df_new = df_new.dropna()
df_new.head()

Unnamed: 0.1,Unnamed: 0,erbmi,ertpreat,eudrink,eueat,euexercise,eufastfd,eufdsit,euprpmel,eusoda,eugenhth
2,2,49.400002,60,1.0,2.0,2.0,2.0,1.0,2.0,2.0,5.0
4,4,31.0,65,1.0,2.0,1.0,2.0,1.0,1.0,2.0,4.0
5,5,30.700001,20,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0
6,6,33.299999,30,1.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0
9,9,28.299999,80,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0


## Codify BMI

In [38]:
# 1 - Underweight 2 - Normal  3 - Overweight
def codify_bmi(x):
    if(x < 18.5):
        return 1
    elif(x < 25):
        return 2
    else:
        return 3

In [39]:
df_new['codified_bmi'] = df_new['erbmi'].apply(codify_bmi)
df_new.head()

Unnamed: 0.1,Unnamed: 0,erbmi,ertpreat,eudrink,eueat,euexercise,eufastfd,eufdsit,euprpmel,eusoda,eugenhth,codified_bmi
2,2,49.400002,60,1.0,2.0,2.0,2.0,1.0,2.0,2.0,5.0,3
4,4,31.0,65,1.0,2.0,1.0,2.0,1.0,1.0,2.0,4.0,3
5,5,30.700001,20,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3
6,6,33.299999,30,1.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,3
9,9,28.299999,80,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,3


## Change Health Codes


In [40]:
# Excellent,Very Good,Good health to 1
# Fair,Poor health to 0
def codify_gen_health(x):
    if(x==1):
        return 1
    elif(x==2):
        return 1
    elif(x==3):
        return 1
    elif(x==4):
        return 2
    elif(x==5):
        return 2
    else:
        return 2

In [43]:
df_new['codified_health'] = df_new['eugenhth'].apply(codify_gen_health)
df_new.head()

Unnamed: 0.1,Unnamed: 0,erbmi,ertpreat,eudrink,eueat,euexercise,eufastfd,eufdsit,euprpmel,eusoda,eugenhth,codified_bmi,codified_health
2,2,49.400002,60,1.0,2.0,2.0,2.0,1.0,2.0,2.0,5.0,3,2
4,4,31.0,65,1.0,2.0,1.0,2.0,1.0,1.0,2.0,4.0,3,2
5,5,30.700001,20,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3,1
6,6,33.299999,30,1.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,3,1
9,9,28.299999,80,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,3,1


## Export to new CSV

In [42]:
df_new.to_csv('dataset_preprocessed.csv',index=False)