In [1]:
# Import appropriate libraries and packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff

In [2]:
# Read in the data
# loadarff hands back a pair; only its first element (the records) is used here.

ARFF_PATH = '../Springboard/CAPSTONE/Data/ThoracicSurgery.arff'
data = arff.loadarff(ARFF_PATH)

# Build the working frame from the record array
records = data[0]
df = pd.DataFrame(records)

df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,b'DGN2',2.88,2.16,b'PRZ1',b'F',b'F',b'F',b'T',b'T',b'OC14',b'F',b'F',b'F',b'T',b'F',60.0,b'F'
1,b'DGN3',3.4,1.88,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC12',b'F',b'F',b'F',b'T',b'F',51.0,b'F'
2,b'DGN3',2.76,2.08,b'PRZ1',b'F',b'F',b'F',b'T',b'F',b'OC11',b'F',b'F',b'F',b'T',b'F',59.0,b'F'
3,b'DGN3',3.68,3.04,b'PRZ0',b'F',b'F',b'F',b'F',b'F',b'OC11',b'F',b'F',b'F',b'F',b'F',54.0,b'F'
4,b'DGN3',2.44,0.96,b'PRZ2',b'F',b'T',b'F',b'T',b'T',b'OC11',b'F',b'F',b'F',b'T',b'F',73.0,b'T'


In [3]:
# Inspect the column labels of the freshly loaded frame
df.columns

Index(['DGN', 'PRE4', 'PRE5', 'PRE6', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11',
       'PRE14', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32', 'AGE', 'Risk1Yr'],
      dtype='object')

In [4]:
# (rows, columns) of the loaded frame
df.shape

(470, 17)

In [5]:
# Function to decode the data (remove the byte info 'b')

def apply_decode(df_name):
    """Decode byte-string cells of a DataFrame to ``str``, in place.

    Only object-dtype columns are inspected, and within them only values
    that are actual ``bytes`` instances are decoded (as UTF-8). Every other
    value and every non-object column (floats, ints, ...) is left untouched,
    so the function is safe to call again on an already-decoded frame.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame whose columns are rewritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_name`` object, returned for convenience/display.

    Raises
    ------
    UnicodeDecodeError
        If a ``bytes`` value is not valid UTF-8.
    """
    for col_name in df_name.columns:
        column = df_name[col_name]
        # bytes can only live in object columns; skipping everything else
        # avoids calling .decode on ints/floats.
        if column.dtype != object:
            continue
        df_name[col_name] = column.map(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )
    return df_name

In [6]:
# Apply decoding to data.
# apply_decode rewrites the columns of `df` in place and also returns the frame,
# so this bare call both updates `df` and displays the decoded result.

apply_decode(df)

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60.0,F
1,DGN3,3.40,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51.0,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59.0,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54.0,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73.0,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,DGN2,3.88,2.12,PRZ1,F,F,F,T,F,OC13,F,F,F,T,F,63.0,F
466,DGN3,3.76,3.12,PRZ0,F,F,F,F,F,OC11,F,F,F,T,F,61.0,F
467,DGN3,3.04,2.08,PRZ1,F,F,F,T,F,OC13,F,F,F,F,F,52.0,F
468,DGN3,1.96,1.68,PRZ1,F,F,F,T,T,OC12,F,F,F,T,F,79.0,F


In [7]:
# Replace the Boolean values with T=1 , F=0
#
# One mapping-based replace covers both values in a single pass and returns a
# new frame instead of mutating in place. infer_objects() then gives the
# columns that now hold only integers a numeric dtype rather than leaving
# them as generic object columns.

df = df.replace({'F': 0, 'T': 1}).infer_objects()
df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,PRZ1,0,0,0,1,1,OC14,0,0,0,1,0,60.0,0
1,DGN3,3.4,1.88,PRZ0,0,0,0,0,0,OC12,0,0,0,1,0,51.0,0
2,DGN3,2.76,2.08,PRZ1,0,0,0,1,0,OC11,0,0,0,1,0,59.0,0
3,DGN3,3.68,3.04,PRZ0,0,0,0,0,0,OC11,0,0,0,0,0,54.0,0
4,DGN3,2.44,0.96,PRZ2,0,1,0,1,1,OC11,0,0,0,1,0,73.0,1


In [8]:
# Importing OrdinalEncoder

from sklearn.preprocessing import OrdinalEncoder

# Ordinal variable columns (PRE6 & PRE14) and, for each, its categories
# listed from lowest to highest rank

ordinal_cols = ['PRE6', 'PRE14']
PRE6_Categories = ['PRZ0','PRZ1','PRZ2']
PRE14_Categories = ['OC11','OC12','OC13','OC14']

# Instantiate the OrdinalEncoder() with one category list per column,
# given in the same order as ordinal_cols

encoder = OrdinalEncoder(categories=[PRE6_Categories, PRE14_Categories])

# Fit the encoder and write the encoded ranks back over the original columns

df[ordinal_cols] = encoder.fit_transform(df[ordinal_cols])

In [9]:
# Verify the ordinal rankings were mapped correctly to the data:
# each value should be the position of its label in the category list
# (PRE6: PRZ0 -> 0.0, PRZ1 -> 1.0, PRZ2 -> 2.0; PRE14: OC11 -> 0.0 ... OC14 -> 3.0)

df.head()

Unnamed: 0,DGN,PRE4,PRE5,PRE6,PRE7,PRE8,PRE9,PRE10,PRE11,PRE14,PRE17,PRE19,PRE25,PRE30,PRE32,AGE,Risk1Yr
0,DGN2,2.88,2.16,1.0,0,0,0,1,1,3.0,0,0,0,1,0,60.0,0
1,DGN3,3.4,1.88,0.0,0,0,0,0,0,1.0,0,0,0,1,0,51.0,0
2,DGN3,2.76,2.08,1.0,0,0,0,1,0,0.0,0,0,0,1,0,59.0,0
3,DGN3,3.68,3.04,0.0,0,0,0,0,0,0.0,0,0,0,0,0,54.0,0
4,DGN3,2.44,0.96,2.0,0,1,0,1,1,0.0,0,0,0,1,0,73.0,1


In [10]:
# Creating dummy variables for the categorical columns:
# DGN, PRE7, PRE8, PRE9, PRE10, PRE11, PRE17, PRE19, PRE25, PRE30, PRE32

# Columns that must NOT be one-hot encoded (numeric, ordinal and target)
non_dummy_cols = ['PRE4','PRE5','PRE6','PRE14','AGE','Risk1Yr']

# Columns to expand into indicator columns
dummy_cols = ['DGN', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11',
              'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32']

categorical_part = df.drop(columns=non_dummy_cols)
dummy = pd.get_dummies(categorical_part, columns=dummy_cols)
dummy.head()

Unnamed: 0,DGN_DGN1,DGN_DGN2,DGN_DGN3,DGN_DGN4,DGN_DGN5,DGN_DGN6,DGN_DGN8,PRE7_0,PRE7_1,PRE8_0,...,PRE17_0,PRE17_1,PRE19_0,PRE19_1,PRE25_0,PRE25_1,PRE30_0,PRE30_1,PRE32_0,PRE32_1
0,0,1,0,0,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,1,0
1,0,0,1,0,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,1,0
2,0,0,1,0,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,1,0
3,0,0,1,0,0,0,0,1,0,1,...,1,0,1,0,1,0,1,0,1,0
4,0,0,1,0,0,0,0,1,0,0,...,1,0,1,0,1,0,0,1,1,0


In [11]:
# List the indicator columns produced by get_dummies
dummy.columns

Index(['DGN_DGN1', 'DGN_DGN2', 'DGN_DGN3', 'DGN_DGN4', 'DGN_DGN5', 'DGN_DGN6',
       'DGN_DGN8', 'PRE7_0', 'PRE7_1', 'PRE8_0', 'PRE8_1', 'PRE9_0', 'PRE9_1',
       'PRE10_0', 'PRE10_1', 'PRE11_0', 'PRE11_1', 'PRE17_0', 'PRE17_1',
       'PRE19_0', 'PRE19_1', 'PRE25_0', 'PRE25_1', 'PRE30_0', 'PRE30_1',
       'PRE32_0', 'PRE32_1'],
      dtype='object')

In [12]:
# Merge the dummy cols with the data, dropping the previously encoded columns

encoded_cols = ['DGN', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11',
                'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32']

# Remove the raw categorical columns before concatenating so the result is
# built in one expression: indicator columns first, then the remaining
# original columns in their existing order.
df = pd.concat([dummy, df.drop(columns=encoded_cols)], axis=1)

In [13]:
# Confirm the merged frame holds the indicator columns plus the untouched originals
df.columns

Index(['DGN_DGN1', 'DGN_DGN2', 'DGN_DGN3', 'DGN_DGN4', 'DGN_DGN5', 'DGN_DGN6',
       'DGN_DGN8', 'PRE7_0', 'PRE7_1', 'PRE8_0', 'PRE8_1', 'PRE9_0', 'PRE9_1',
       'PRE10_0', 'PRE10_1', 'PRE11_0', 'PRE11_1', 'PRE17_0', 'PRE17_1',
       'PRE19_0', 'PRE19_1', 'PRE25_0', 'PRE25_1', 'PRE30_0', 'PRE30_1',
       'PRE32_0', 'PRE32_1', 'PRE4', 'PRE5', 'PRE6', 'PRE14', 'AGE',
       'Risk1Yr'],
      dtype='object')

In [14]:
# Check for null values
#
# Count the missing entries in each column. isnull() yields a True/False frame
# and sum() adds up the Trues per column.
# (Indexing with df[df.isnull()] blanks every non-null cell to NaN, so a
# following .count() is 0 for every column whether or not data is missing.)

df.isnull().sum()

DGN_DGN1    0
DGN_DGN2    0
DGN_DGN3    0
DGN_DGN4    0
DGN_DGN5    0
DGN_DGN6    0
DGN_DGN8    0
PRE7_0      0
PRE7_1      0
PRE8_0      0
PRE8_1      0
PRE9_0      0
PRE9_1      0
PRE10_0     0
PRE10_1     0
PRE11_0     0
PRE11_1     0
PRE17_0     0
PRE17_1     0
PRE19_0     0
PRE19_1     0
PRE25_0     0
PRE25_1     0
PRE30_0     0
PRE30_1     0
PRE32_0     0
PRE32_1     0
PRE4        0
PRE5        0
PRE6        0
PRE14       0
AGE         0
Risk1Yr     0
dtype: int64

In [15]:
# Checking for NA values
#
# Per-column count of NA entries: isna() marks each missing cell True and
# sum() totals them. Counting df[df.isna()] instead would always give 0,
# because that mask keeps only the missing cells and count() skips NaN.

df.isna().sum()

DGN_DGN1    0
DGN_DGN2    0
DGN_DGN3    0
DGN_DGN4    0
DGN_DGN5    0
DGN_DGN6    0
DGN_DGN8    0
PRE7_0      0
PRE7_1      0
PRE8_0      0
PRE8_1      0
PRE9_0      0
PRE9_1      0
PRE10_0     0
PRE10_1     0
PRE11_0     0
PRE11_1     0
PRE17_0     0
PRE17_1     0
PRE19_0     0
PRE19_1     0
PRE25_0     0
PRE25_1     0
PRE30_0     0
PRE30_1     0
PRE32_0     0
PRE32_1     0
PRE4        0
PRE5        0
PRE6        0
PRE14       0
AGE         0
Risk1Yr     0
dtype: int64

This dataset appears to be complete with no null / NA values!