# Data Preprocessing

In [89]:
from scipy.io import arff
from io import StringIO
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
import pandas as pd

In [63]:
data_file = '../data/ThoraricSurgery.arff'
data, meta = arff.loadarff(data_file)
meta

Dataset: 'Thoracic_Surgery_Data'
	DGN's type is nominal, range is ('DGN3', 'DGN2', 'DGN4', 'DGN6', 'DGN5', 'DGN8', 'DGN1')
	PRE4's type is numeric
	PRE5's type is numeric
	PRE6's type is nominal, range is ('PRZ2', 'PRZ1', 'PRZ0')
	PRE7's type is nominal, range is ('T', 'F')
	PRE8's type is nominal, range is ('T', 'F')
	PRE9's type is nominal, range is ('T', 'F')
	PRE10's type is nominal, range is ('T', 'F')
	PRE11's type is nominal, range is ('T', 'F')
	PRE14's type is nominal, range is ('OC11', 'OC14', 'OC12', 'OC13')
	PRE17's type is nominal, range is ('T', 'F')
	PRE19's type is nominal, range is ('T', 'F')
	PRE25's type is nominal, range is ('T', 'F')
	PRE30's type is nominal, range is ('T', 'F')
	PRE32's type is nominal, range is ('T', 'F')
	AGE's type is numeric
	Risk1Yr's type is nominal, range is ('T', 'F')

In [90]:
outcomes_df = pd.DataFrame.from_records(data)
nominal_features = ['DGN', 'PRE6',  'PRE14']
TF_features = ['PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19', 'PRE19',
              'PRE25', 'PRE30', 'PRE32', 'Risk1Yr']
numeric_features = ['PRE4', 'PRE5', 'AGE']
nominal_dict = {feature: [] for feature in nominal_features}
TF_dict = {feature: [] for feature in TF_features}

In [91]:
for feature in nominal_features:
    nominal_dict[feature][:] = (pd.Series(outcomes_df[feature].values).unique().tolist())
for feature in TF_features:
    TF_dict[feature][:] = (pd.Series(outcomes_df[feature].values).unique().tolist())

## One Hot Encoding

In [66]:
outcomes_df.head(5)

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,b'DGN2',2.88,2.16,b'PRZ1',b'F',b'F',b'F',b'T',b'T',b'OC14',b'F',b'F',b'F',b'T',b'F',60.0,b'F'
1,b'DGN3',3.4,1.88,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC12',b'F',b'F',b'F',b'T',b'F',51.0,b'F'
2,b'DGN3',2.76,2.08,b'PRZ1',b'F',b'F',b'F',b'T',b'F',b'OC11',b'F',b'F',b'F',b'T',b'F',59.0,b'F'
3,b'DGN3',3.68,3.04,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC11',b'F',b'F',b'F',b'F',b'F',54.0,b'F'
4,b'DGN3',2.44,0.96,b'PRZ2',b'F',b'T',b'F',b'T',b'T',b'OC11',b'F',b'F',b'F',b'T',b'F',73.0,b'T'


In [67]:
outcomes_df
for feature in outcomes_df.columns.values:
    new_column = pd.Series()
    for i, row in outcomes_df.iterrows():
        if feature in nominal_features:
            encoding = np.zeros(len(nominal_dict[feature]))
            index = nominal_dict[feature].index(row[feature])
            encoding[index] = 1
            new_column.append(encoding)
        if feature in TF_features:
            outcomes_df.loc[i,feature] = TF_dict[feature].index(row[feature])
    outcomes_df.drop([feature], axis=1)
    outcomes_df[feature] = new_column

In [68]:
outcomes_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,0,2.88,2.16,0,0,0,0,0,0,0,0,0,0,0,0,60.0,0
1,1,3.40,1.88,1,0,0,0,1,1,1,0,0,0,0,0,51.0,0
2,1,2.76,2.08,0,0,0,0,0,1,2,0,0,0,0,0,59.0,0
3,1,3.68,3.04,1,0,0,0,1,1,2,0,0,0,1,0,54.0,0
4,1,2.44,0.96,2,0,1,0,0,0,2,0,0,0,0,0,73.0,1
5,1,2.48,1.88,0,0,0,0,0,1,2,0,0,0,1,0,51.0,0
6,1,4.36,3.28,0,0,0,0,0,1,1,1,0,0,0,0,59.0,1
7,0,3.19,2.50,0,0,0,0,0,1,2,0,0,1,0,0,66.0,1
8,1,3.16,2.64,2,0,0,0,0,0,2,0,0,0,0,0,68.0,0
9,1,2.32,2.16,0,0,0,0,0,1,2,0,0,0,0,0,54.0,0


In [86]:
df = outcomes_df.groupby(['Risk1Yr'], sort=False).size().reset_index(name='Count')

In [87]:
df

Unnamed: 0,Risk1Yr,Count
0,0,400
1,1,350


In [83]:
Risk1Yr_1_df = outcomes_df.loc[outcomes_df['Risk1Yr'] == 1]
Risk1Yr_1_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
4,1,2.44,0.96,2,0,1,0,0,0,2,0,0,0,0,0,73.0,1
6,1,4.36,3.28,0,0,0,0,0,1,1,1,0,0,0,0,59.0,1
7,0,3.19,2.50,0,0,0,0,0,1,2,0,0,1,0,0,66.0,1
13,0,3.98,3.06,2,0,0,0,0,0,0,0,0,0,0,0,80.0,1
24,3,4.32,3.20,1,0,0,0,1,1,2,0,0,0,1,0,58.0,1
27,1,3.40,3.06,0,0,0,0,0,0,2,0,0,0,0,0,68.0,1
30,1,3.24,2.40,0,1,1,0,1,1,0,0,0,0,0,0,55.0,1
40,4,3.80,2.98,0,0,0,0,0,1,2,0,0,0,0,0,60.0,1
41,0,3.24,2.52,0,0,0,0,0,1,1,0,0,0,0,0,63.0,1
43,4,2.68,2.12,1,0,0,0,0,1,1,0,0,0,0,0,51.0,1


In [85]:
for i in range(4):
    outcomes_df = outcomes_df.append(Risk1Yr_1_df)
outcomes_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,0,2.88,2.16,0,0,0,0,0,0,0,0,0,0,0,0,60.0,0
1,1,3.40,1.88,1,0,0,0,1,1,1,0,0,0,0,0,51.0,0
2,1,2.76,2.08,0,0,0,0,0,1,2,0,0,0,0,0,59.0,0
3,1,3.68,3.04,1,0,0,0,1,1,2,0,0,0,1,0,54.0,0
4,1,2.44,0.96,2,0,1,0,0,0,2,0,0,0,0,0,73.0,1
5,1,2.48,1.88,0,0,0,0,0,1,2,0,0,0,1,0,51.0,0
6,1,4.36,3.28,0,0,0,0,0,1,1,1,0,0,0,0,59.0,1
7,0,3.19,2.50,0,0,0,0,0,1,2,0,0,1,0,0,66.0,1
8,1,3.16,2.64,2,0,0,0,0,0,2,0,0,0,0,0,68.0,0
9,1,2.32,2.16,0,0,0,0,0,1,2,0,0,0,0,0,54.0,0


In [None]:
def one_hot(keys, outcomes_df):
    for key in keys:
        encoder = pd.get_dummies(glass_df['Glass_Type'], "Glass_Type")
        glass_df = glass_df.drop('Glass_Type', axis = 1)
        glass_df = glass_df.join(encoder)
        glass_df['Glass_Type'] = glass_df[['Glass_Type_1','Glass_Type_2', 'Glass_Type_3','Glass_Type_5','Glass_Type_6','Glass_Type_7']].values.tolist()
        glass_df = glass_df.drop(['Glass_Type_1','Glass_Type_2', 'Glass_Type_3','Glass_Type_5','Glass_Type_6','Glass_Type_7'], axis = 1)
    return glass_df

## Train Test Split

In [92]:
outcomes_train, outcomes_test = train_test_split(outcomes_df, test_size=0.2, stratify=outcomes_df['Risk1Yr'].values.tolist())
outcomes_train_df = pd.DataFrame(outcomes_train)
outcomes_test_df = pd.DataFrame(outcomes_test)