# Purpose
The purpose of this notebook is to preprocess the two datasets located in the `rawdata` directory. This notebook will:
1. Find and handle missing/NaN/null values
2. Find and handle noisy and/or incorrect data
3. Convert all categorical features to numeric features (logistic regression, SVM, and neural network models require numeric features)
4. Normalize all features to range [0, 1] (required for SVMs since they use distance as a metric; recommended for neural networks)

# Authors
1. Mason Adsero
2. Riley Cullen


# Data Preprocessing

In [324]:
import pandas as pd

# Read the two raw CSV files into DataFrames (2020 survey data first, UCI second),
# then preview the first five rows of the 2020 frame (head() defaults to n=5).
df_2020, df_uci = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./rawdata/heart_2020_cleaned.csv",
        "./rawdata/heart_disease_uci.csv",
    )
)
df_2020.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [325]:
# Preview the last five rows of the 2020 dataset (tail() defaults to n=5).
df_2020.tail()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No
319794,No,46.56,No,No,No,0.0,0.0,No,Female,80 or older,Hispanic,No,Yes,Good,8.0,No,No,No


In [326]:
# Preview the first five rows of the UCI dataset (head() defaults to n=5).
df_uci.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [327]:
# Preview the last five rows of the UCI dataset (tail() defaults to n=5).
df_uci.tail()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


### Handling missing/NaN/null values

In [328]:
# Report the number of missing values per column for each dataset.
# The second title carries a leading newline so the two reports are separated by a blank line.
for title, frame in (("2020 Dataset", df_2020), ("\nUCI Dataset", df_uci)):
    print(title)
    print(frame.isnull().sum())

2020 Dataset
HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

UCI Dataset
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [329]:
from sklearn.impute import SimpleImputer

# Handle missing data in the UCI dataset.
# Numeric measurements are filled with the column average; categorical
# features get an explicit "Unknown" category instead of a guessed value.
numeric_cols = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_cols = ['fbs', 'restecg', 'exang', 'slope', 'thal']

# SimpleImputer() defaults to the mean strategy and computes its statistic
# per column, so a single fit over all numeric columns gives the same result
# as imputing each column separately.
# NOTE(review): `ca` is missing in most rows and mean imputation produces
# fractional values (e.g. 0.676375) for what looks like a small count —
# confirm that the mean is the intended strategy for this column.
imputer = SimpleImputer()
df_uci[numeric_cols] = imputer.fit_transform(df_uci[numeric_cols])

# fillna is the documented way to fill missing entries; it does not depend on
# how replace() matches pd.NA against NaN.
df_uci[categorical_cols] = df_uci[categorical_cols].fillna("Unknown")

# Sanity check: every column should now report zero missing values.
print("\nUCI Dataset")
print(df_uci.isnull().sum())


UCI Dataset
id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


### Handling noisy/incorrect data values

In [330]:
"""
./rawdata/heart_disease_uci.csv
- Convert the target column (`num`) to a binary label (0 stays 0; 1-4 become 1)
"""
print("\ndf_uci:")
print(df_uci.num.unique())

# Collapse the non-zero levels of `num` (2, 3, 4) into 1 so the column becomes
# binary. Level 1 already has the desired value, so it needs no replacement.
df_uci['num'] = df_uci['num'].replace([2, 3, 4], 1)

# Turn boolean entries into their string form so these columns hold only
# strings ("True" / "False" / "Unknown") before they are one-hot encoded.
for bool_col in ('fbs', 'exang'):
    df_uci[bool_col] = df_uci[bool_col].replace({True: "True", False: "False"})

print(df_uci.num.unique())


df_uci:
[0 2 1 3 4]
[0 1]


### Converting categorical features to numerical features

In [331]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelBinarizer
import numpy as np
one_hot_enc = OneHotEncoder()
ordinal_enc = OrdinalEncoder()
binary_enc = LabelBinarizer()

# 2020 dataset
# Two-valued columns are binarized to 0/1.
two_valued_cols = [
    'HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
    'Sex', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer',
]
for col in two_valued_cols:
    df_2020[col] = binary_enc.fit_transform(df_2020[col])

# Age brackets become ordinal codes. OrdinalEncoder orders the categories by
# their sorted string labels, which puts the brackets shown above in ascending age.
df_2020[['AgeCategory']] = ordinal_enc.fit_transform(df_2020[['AgeCategory']])

# Nominal columns are one-hot encoded: each category becomes its own column,
# named after the category label, and the source column is dropped.
for col in ('Race', 'Diabetic'):
    encoded = pd.DataFrame(one_hot_enc.fit_transform(df_2020[[col]]).toarray())
    encoded.columns = one_hot_enc.categories_[0].tolist()
    df_2020 = df_2020.join(encoded).drop(columns=col)

# General health is ordered, so it is mapped by hand from worst (0) to best (4).
gen_health_codes = {
    'Poor': 0,
    'Fair': 1,
    'Good': 2,
    'Very good': 3,
    'Excellent': 4,
}
df_2020[['GenHealth']] = df_2020[['GenHealth']].replace(gen_health_codes)
df_2020


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,American Indian/Alaskan Native,Asian,Black,Hispanic,Other,White,No,"No, borderline diabetes",Yes,Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0,20.34,0,0,1,0.0,0.0,0,0,12.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0,26.58,1,0,0,20.0,30.0,0,1,9.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,24.21,0,0,0,0.0,0.0,0,0,11.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0,23.71,0,0,0,28.0,0.0,1,0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,1,8.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
319791,0,29.84,1,0,0,0.0,0.0,0,1,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
319792,0,24.24,0,0,0,0.0,0.0,0,0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
319793,0,32.81,0,0,0,0.0,0.0,0,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [332]:
def add_prefix(prefix, l):
    """Return a new list with `prefix` prepended to every element of `l`.

    Each element is formatted to its string form first, so non-string
    category labels (e.g. booleans or numbers) are accepted instead of
    raising TypeError on concatenation. String elements give the same result
    as plain `prefix + element`.

    Args:
        prefix: Text to put in front of each element.
        l: Iterable of labels; it is not modified.

    Returns:
        A list of prefixed strings, in the same order as `l`.
    """
    return [f"{prefix}{e}" for e in l]
    

# Sex has two values, so it is binarized to 0/1.
df_uci['sex'] = binary_enc.fit_transform(df_uci['sex'])

# One-hot encode the nominal columns, in this order. Each entry is
# (source column, prefix for the generated column names); an empty prefix
# means the new columns are named by the bare category label.
one_hot_specs = [
    ('dataset', ''),
    ('cp', ''),
    ('restecg', ''),
    ('slope', 'slope_'),
    ('thal', 'thal_'),
    ('fbs', 'FBS_'),
    ('exang', 'exang_'),
]
for col, prefix in one_hot_specs:
    encoded = pd.DataFrame(one_hot_enc.fit_transform(df_uci[[col]]).toarray())
    encoded.columns = add_prefix(prefix, one_hot_enc.categories_[0].tolist())
    # Append the indicator columns and remove the column they were derived from.
    df_uci = df_uci.join(encoded).drop(columns=col)

# The row identifier is dropped so it is not treated as a feature.
df_uci = df_uci.drop(columns='id')

df_uci

Unnamed: 0,age,sex,trestbps,chol,thalch,oldpeak,ca,num,Cleveland,Hungary,...,thal_Unknown,thal_fixed defect,thal_normal,thal_reversable defect,FBS_False,FBS_True,FBS_Unknown,exang_False,exang_True,exang_Unknown
0,63,1,145.000000,233.0,150.000000,2.300000,0.000000,0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,67,1,160.000000,286.0,108.000000,1.500000,3.000000,1,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,67,1,120.000000,229.0,129.000000,2.600000,2.000000,1,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,37,1,130.000000,250.0,187.000000,3.500000,0.000000,0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,41,0,130.000000,204.0,172.000000,1.400000,0.000000,0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,0,127.000000,333.0,154.000000,0.000000,0.676375,1,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
916,62,1,132.132404,139.0,137.545665,0.878788,0.676375,0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
917,55,1,122.000000,223.0,100.000000,0.000000,0.676375,1,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
918,58,1,132.132404,385.0,137.545665,0.878788,0.676375,0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [333]:
# Write pre-normalized data to csv
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (a no-op when it already does).
Path("./input").mkdir(parents=True, exist_ok=True)

# NOTE(review): these two files are written with the row index as an extra
# leading column, unlike the normalized files, which pass index=False.
# Left unchanged so existing readers keep working — confirm which is intended.
df_2020.to_csv("./input/heart_2020.csv")
df_uci.to_csv("./input/heart_disease_uci.csv")

### Normalizing data

In [334]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Rescale every column of each dataset in place. The scaler is re-fit for each
# frame, so each dataset is scaled using its own column minima and maxima.
for frame in (df_2020, df_uci):
    all_cols = frame.columns
    frame[all_cols] = scaler.fit_transform(frame[all_cols])

# Persist the normalized frames without the row index.
normalized_outputs = (
    (df_2020, "./input/heart_2020_normalized.csv"),
    (df_uci, "./input/heart_disease_uci_normalized.csv"),
)
for frame, out_path in normalized_outputs:
    frame.to_csv(out_path, index=False)