# Data Preprocessing

In [1]:
from scipy.io import arff
from io import StringIO
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
import pandas as pd

In [2]:
# Location of the Thoracic Surgery ARFF file, given relative to the notebook's working directory.
data_file = '../data/ThoraricSurgery.arff'
# loadarff returns the records together with a metadata object describing each attribute.
data, meta = arff.loadarff(data_file)
# Display the attribute names, types and nominal ranges.
meta

Dataset: 'Thoracic_Surgery_Data'
	DGN's type is nominal, range is ('DGN3', 'DGN2', 'DGN4', 'DGN6', 'DGN5', 'DGN8', 'DGN1')
	PRE4's type is numeric
	PRE5's type is numeric
	PRE6's type is nominal, range is ('PRZ2', 'PRZ1', 'PRZ0')
	PRE7's type is nominal, range is ('T', 'F')
	PRE8's type is nominal, range is ('T', 'F')
	PRE9's type is nominal, range is ('T', 'F')
	PRE10's type is nominal, range is ('T', 'F')
	PRE11's type is nominal, range is ('T', 'F')
	PRE14's type is nominal, range is ('OC11', 'OC14', 'OC12', 'OC13')
	PRE17's type is nominal, range is ('T', 'F')
	PRE19's type is nominal, range is ('T', 'F')
	PRE25's type is nominal, range is ('T', 'F')
	PRE30's type is nominal, range is ('T', 'F')
	PRE32's type is nominal, range is ('T', 'F')
	AGE's type is numeric
	Risk1Yr's type is nominal, range is ('T', 'F')

In [3]:
# Wrap the ARFF records in a DataFrame, one column per attribute.
outcomes_df = pd.DataFrame.from_records(data)

# Column groups, following the attribute types reported by the ARFF metadata.
nominal_features = ['DGN', 'PRE6', 'PRE14']
# Each T/F attribute is listed exactly once ('PRE19' used to appear twice).
TF_features = ['PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19',
               'PRE25', 'PRE30', 'PRE32', 'Risk1Yr']
numeric_features = ['PRE4', 'PRE5', 'AGE']

# Per-feature category lists: created empty here and filled in by a later cell.
nominal_dict = {feature: [] for feature in nominal_features}
TF_dict = {feature: [] for feature in TF_features}

In [4]:
# Record, for every categorical column, its distinct values in order of first appearance.
# The lists are filled in place so the dict entries created earlier keep their identity.
for lookup in (nominal_dict, TF_dict):
    for column, categories in lookup.items():
        categories[:] = outcomes_df[column].unique().tolist()

## One-Hot Encoding

In [5]:
# Preview the first five rows before encoding; categorical values are still raw byte strings here.
outcomes_df.head(5)

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,b'DGN2',2.88,2.16,b'PRZ1',b'F',b'F',b'F',b'T',b'T',b'OC14',b'F',b'F',b'F',b'T',b'F',60.0,b'F'
1,b'DGN3',3.4,1.88,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC12',b'F',b'F',b'F',b'T',b'F',51.0,b'F'
2,b'DGN3',2.76,2.08,b'PRZ1',b'F',b'F',b'F',b'T',b'F',b'OC11',b'F',b'F',b'F',b'T',b'F',59.0,b'F'
3,b'DGN3',3.68,3.04,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC11',b'F',b'F',b'F',b'F',b'F',54.0,b'F'
4,b'DGN3',2.44,0.96,b'PRZ2',b'F',b'T',b'F',b'T',b'T',b'OC11',b'F',b'F',b'F',b'T',b'F',73.0,b'T'


In [6]:
# Replace every categorical column of outcomes_df with its encoded form, one vectorised
# pass per column (instead of a row-by-row loop):
#   * nominal features -> string form of a one-hot list, e.g. '[1, 0.0, 0.0]'
#   * T/F features     -> integer position of the value in TF_dict[feature]
# NOTE(review): positions come from the category lists, which are built in order of first
# appearance in the data, so the integer given to b'T' / b'F' can differ between columns
# -- confirm this is intended before relying on 1 meaning "true".
for feature in outcomes_df.columns:
    if feature in nominal_features:
        categories = nominal_dict[feature]
        lookup = {}
        for position, category in enumerate(categories):
            one_hot = np.zeros(len(categories)).tolist()
            one_hot[position] = 1
            lookup[category] = str(one_hot)
    elif feature in TF_features:
        lookup = {category: position
                  for position, category in enumerate(TF_dict[feature])}
    else:
        continue  # numeric columns are left untouched

    encoded = outcomes_df[feature].map(lookup)
    # map() yields NaN for values missing from the lookup, which is what happens when this
    # cell is run a second time on already-encoded data; fail loudly instead.
    if encoded.isna().any():
        raise ValueError(
            "Column %r holds values that are not in its category list; "
            "has this cell already been run?" % feature
        )
    outcomes_df[feature] = encoded

In [7]:
# Inspect the frame after the encoding cell above has rewritten the categorical columns.
outcomes_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2.88,2.16,"[1, 0.0, 0.0]",0,0,0,0,0,"[1, 0.0, 0.0, 0.0]",0,0,0,0,0,60.0,0
1,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.40,1.88,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,51.0,0
2,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.76,2.08,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,59.0,0
3,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.68,3.04,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,54.0,0
4,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.44,0.96,"[0.0, 0.0, 1]",0,1,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,73.0,1
5,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.48,1.88,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,51.0,0
6,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",4.36,3.28,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",1,0,0,0,0,59.0,1
7,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.19,2.50,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,1,0,0,66.0,1
8,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.16,2.64,"[0.0, 0.0, 1]",0,0,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,68.0,0
9,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.32,2.16,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,54.0,0


In [8]:
# Count the rows per Risk1Yr class, keeping the classes in order of first appearance.
rows_per_class = outcomes_df.groupby(['Risk1Yr'], sort=False).size()
df = rows_per_class.reset_index(name='Count')

In [9]:
# Show the per-class row counts computed in the previous cell.
df

Unnamed: 0,Risk1Yr,Count
0,0,400
1,1,70


In [10]:
# Select the rows whose encoded Risk1Yr label is 1 (the less frequent class in the counts above).
is_risk_1 = outcomes_df['Risk1Yr'] == 1
Risk1Yr_1_df = outcomes_df.loc[is_risk_1]
Risk1Yr_1_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
4,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.44,0.96,"[0.0, 0.0, 1]",0,1,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,73.0,1
6,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",4.36,3.28,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",1,0,0,0,0,59.0,1
7,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.19,2.50,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,1,0,0,66.0,1
13,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.98,3.06,"[0.0, 0.0, 1]",0,0,0,0,0,"[1, 0.0, 0.0, 0.0]",0,0,0,0,0,80.0,1
24,"[0.0, 0.0, 0.0, 1, 0.0, 0.0, 0.0]",4.32,3.20,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,58.0,1
27,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.40,3.06,"[1, 0.0, 0.0]",0,0,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,68.0,1
30,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.24,2.40,"[1, 0.0, 0.0]",1,1,0,1,1,"[1, 0.0, 0.0, 0.0]",0,0,0,0,0,55.0,1
40,"[0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0]",3.80,2.98,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,60.0,1
41,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.24,2.52,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,63.0,1
43,"[0.0, 0.0, 0.0, 0.0, 1, 0.0, 0.0]",2.68,2.12,"[0.0, 1, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,51.0,1


In [201]:
# Oversample the Risk1Yr == 1 rows by appending four extra copies of them to the dataset.
# One pd.concat call replaces the repeated DataFrame.append calls (append was deprecated in
# pandas 1.4 and removed in 2.0) and avoids re-copying the growing frame on every iteration.
# Original index labels are kept, so the duplicated rows share the label of their source row.
# NOTE: this cell reassigns outcomes_df, so every re-run adds another four copies; restart
# the kernel and run all cells to get a clean result.
# NOTE(review): if this runs before the train/test split, copies of the same row can land
# in both splits -- consider oversampling only the training split.
extra_copies = 4
outcomes_df = pd.concat([outcomes_df] + [Risk1Yr_1_df] * extra_copies)
outcomes_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2.88,2.16,"[1, 0.0, 0.0]",0,0,0,0,0,"[1, 0.0, 0.0, 0.0]",0,0,0,0,0,60.0,0
1,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.40,1.88,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,51.0,0
2,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.76,2.08,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,59.0,0
3,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.68,3.04,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,54.0,0
4,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.44,0.96,"[0.0, 0.0, 1]",0,1,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,73.0,1
5,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.48,1.88,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,51.0,0
6,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",4.36,3.28,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",1,0,0,0,0,59.0,1
7,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.19,2.50,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,1,0,0,66.0,1
8,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",3.16,2.64,"[0.0, 0.0, 1]",0,0,0,0,0,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,68.0,0
9,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",2.32,2.16,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,54.0,0


## Train Test Split

In [11]:
# Hold out 20% of the rows for testing, stratified on Risk1Yr so both splits keep the same
# class ratio. The fixed random_state makes the split -- and therefore the CSV files written
# at the end of the notebook -- reproducible across kernel restarts.
split_seed = 42
outcomes_train, outcomes_test = train_test_split(
    outcomes_df,
    test_size=0.2,
    stratify=outcomes_df['Risk1Yr'].values.tolist(),
    random_state=split_seed,
)
# train_test_split already returns DataFrames; take explicit copies so the in-place column
# rewrites done later operate on independent frames.
outcomes_train_df = outcomes_train.copy()
outcomes_test_df = outcomes_test.copy()

In [12]:
def standardization(x, mean, std):
    """Return the z-score of ``x``: its offset from ``mean`` expressed in units of ``std``.

    Used to standardize the numeric features, with the aim of reducing their variance and
    improving the neural network's performance.
    """
    centered = x - mean
    return centered / std

def _standardize_columns(frame, keys, stats=None):
    """Standardize the columns ``keys`` of ``frame`` in place and return the statistics used.

    ``stats`` maps a column name to a ``(mean, std)`` pair. When it is None, each column is
    scaled with its own mean and population standard deviation (NumPy's default, ddof=0).
    """
    used = {}
    for key in keys:
        if stats is None:
            values = frame[key].values
            mean, std = values.mean(), values.std()
        else:
            mean, std = stats[key]
        # Vectorised over the whole column; no per-element apply is needed.
        frame[key] = standardization(frame[key], mean, std)
        used[key] = (mean, std)
    return used


def train_standard_loop(keys):
    """Standardize the given columns of the training set in place.

    Returns a ``{column: (mean, std)}`` dict of the training statistics so that exactly the
    same transformation can be applied to the test set.
    """
    return _standardize_columns(outcomes_train_df, keys)


def test_standard_loop(keys, stats=None):
    """Standardize the given columns of the test set in place.

    Every key is processed; nothing is sliced off the front of ``keys``. Pass the dict
    returned by ``train_standard_loop`` as ``stats`` so the test set is scaled with the
    training mean/std and both splits share one scale. If ``stats`` is omitted, the test
    set's own statistics are used.
    """
    _standardize_columns(outcomes_test_df, keys, stats)


# Compute the statistics on the training split, then reuse them for the test split.
train_stats = train_standard_loop(list(numeric_features))
test_standard_loop(list(numeric_features), train_stats)

In [13]:
# Inspect the training split after the numeric columns have been standardized.
outcomes_train_df

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
257,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.423484,-0.203220,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 0.0, 1, 0.0]",0,0,0,0,0,-0.924523,0
287,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.377444,-0.212553,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,0.075345,0
82,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.055163,-0.256104,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,0.075345,0
300,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",1.095842,-0.147226,"[1, 0.0, 0.0]",0,1,0,0,0,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,0.186441,0
108,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.147243,-0.153448,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 0.0, 1, 0.0]",0,0,0,1,0,0.075345,0
172,"[1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.147243,-0.184556,"[1, 0.0, 0.0]",0,1,1,0,1,"[0.0, 1, 0.0, 0.0]",0,0,1,0,0,-0.035752,0
273,"[0.0, 0.0, 1, 0.0, 0.0, 0.0, 0.0]",0.082958,-0.172113,"[0.0, 0.0, 1]",0,0,0,0,0,"[0.0, 0.0, 1, 0.0]",1,0,0,0,0,1.075212,0
449,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.515564,-0.237439,"[1, 0.0, 0.0]",0,0,0,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,-1.035619,1
109,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.055163,-0.200110,"[1, 0.0, 0.0]",0,0,1,0,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,1.297405,0
265,"[0.0, 1, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.423484,-0.187666,"[0.0, 1, 0.0]",0,0,0,1,1,"[0.0, 1, 0.0, 0.0]",0,0,0,0,0,-0.146848,0


In [14]:
# Write both splits to CSV files in the working directory, leaving out the index column.
exports = (
    (outcomes_train_df, 'outcomes_data_train.csv'),
    (outcomes_test_df, 'outcomes_data_test.csv'),
)
for split_frame, csv_name in exports:
    split_frame.to_csv(path_or_buf=csv_name, index=False)