In [1]:
import pandas as pd
import numpy as np

### Read Data from Titanic Dataset and Clean

In [2]:
# Locations of the raw Titanic CSV files, relative to this notebook.
train_path = '../raw_titanic/train.csv'
test_path = '../raw_titanic/test.csv'
test_surv_path = '../raw_titanic/gender_submission.csv'

# Load the training split and remember its column order in `cols`, so the
# test split can be arranged to match it.
train = pd.read_csv(train_path, engine='python')
cols = train.columns

In [3]:
# Load the test features and the companion file that is joined to them on
# PassengerId.
# NOTE(review): gender_submission.csv is presumably a sample submission rather
# than ground-truth labels - confirm before treating the Survived values of
# the test rows as truth.
test = pd.read_csv(test_path, engine='python')
test_surv = pd.read_csv(test_surv_path, engine='python')

# Outer-join the two on PassengerId, then reorder columns to match `train`.
test = test.merge(test_surv, on='PassengerId', how='outer')[cols]

In [4]:
# Stack the train and test rows into a single frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# equivalent. Row labels of each input are kept as-is, exactly as append did.
t = pd.concat([train, test])

### Drop No Value-Add Columns

**Cabin** is only useful to obtain the Deck.

In [5]:
# The pattern locates the first digit in Cabin together with the capital
# letter directly in front of it, if there is one; capture group 0 is that
# letter. Rows whose Cabin is missing or contains no digit get NaN.
t = t.assign(Deck=t['Cabin'].str.extract(r'([A-Z])?(\d)')[0])

**Ticket**, **Name**, and **PassengerId** are not useful as features, so they are dropped.

In [6]:
# reset_index returns a new frame instead of modifying `t`, so the result has
# to be assigned back. Without the assignment the stacked frame keeps each
# split's own row labels and `idx` ends up with duplicate values.
t = t.reset_index(drop=True)
t['idx'] = t.index

# Keep only the columns used downstream (Deck stands in for Cabin), then
# switch to short lowercase names.
t = t[['idx','Pclass','Sex','Age','SibSp','Parch','Fare','Deck','Embarked','Survived']].copy()
t.columns = ['idx','class','sex','age','sibs','par/ch','fare','deck','embarked','survived']

### Deal with nan values

Fill NaN values in **age** and **fare** with the column mean (**deck** and **embarked** are handled separately below)

In [7]:
# Mean-impute the two numeric columns; each mean is computed from the
# non-missing values of that column in the combined frame.
t = t.fillna({'age': t['age'].mean(),
              'fare': t['fare'].mean()})

Fill nan values of **embarked** with the most common value, S

In [8]:
# Keep existing embarkation ports; substitute 'S' wherever the value is missing.
t['embarked'] = t['embarked'].where(t['embarked'].notna(), 'S')

Fill missing **deck** values with 0, which serves as a placeholder for an unknown deck

In [9]:
# Missing deck letters become the integer 0 (placeholder for "unknown").
t = t.fillna({'deck': 0})

### Convert data to numerics

In [10]:
# Snapshot the pre-encoding values under '<name> raw' so that each encoded
# column can be compared with its original labels in the cells that follow.
t = t.assign(**{name + ' raw': t[name]
                for name in ['class', 'sex', 'par/ch', 'fare',
                             'deck', 'embarked', 'survived']})

In [11]:
# Replace passenger class with its 0-based integer category code.
t['class'] = t['class'].astype('category').cat.codes
# Cross-tabulate code against raw value (with row counts) to show the mapping.
t.groupby(['class', 'class raw'])[['idx']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,idx
class,class raw,Unnamed: 2_level_1
0,1,323
1,2,277
2,3,709


In [12]:
# Replace sex with its 0-based integer category code.
t['sex'] = t['sex'].astype('category').cat.codes
# Cross-tabulate code against raw value (with row counts) to show the mapping.
t.groupby(['sex', 'sex raw'])[['idx']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,idx
sex,sex raw,Unnamed: 2_level_1
0,female,466
1,male,843


In [13]:
# Replace deck with its integer category code; the 0 placeholder for unknown
# decks is one of the categories alongside the deck letters.
t['deck'] = t['deck'].astype('category').cat.codes
# Cross-tabulate code against raw value (with row counts) to show the mapping.
t.groupby(['deck', 'deck raw'])[['idx']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,idx
deck,deck raw,Unnamed: 2_level_1
0,0,1020
1,A,22
2,B,65
3,C,94
4,D,42
5,E,44
6,F,13
7,G,9


In [14]:
# Replace embarkation port with its 0-based integer category code.
t['embarked'] = t['embarked'].astype('category').cat.codes
# Cross-tabulate code against raw value (with row counts) to show the mapping.
t.groupby(['embarked', 'embarked raw'])[['idx']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,idx
embarked,embarked raw,Unnamed: 2_level_1
0,C,270
1,Q,123
2,S,916


In [15]:
# Replace the survival flag with its 0-based integer category code.
t['survived'] = t['survived'].astype('category').cat.codes
# Cross-tabulate code against raw value (with row counts) to show the mapping.
t.groupby(['survived', 'survived raw'])[['idx']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,idx
survived,survived raw,Unnamed: 2_level_1
0,0,815
1,1,494


In [16]:
# Discard the '... raw' helper columns by keeping only the encoded columns,
# in a fixed order, as an independent copy.
t = t.loc[:, ['idx', 'class', 'sex', 'age', 'sibs', 'par/ch',
              'fare', 'deck', 'embarked', 'survived']].copy()
print(t.shape)
t.head()

(1309, 10)


Unnamed: 0,idx,class,sex,age,sibs,par/ch,fare,deck,embarked,survived
0,0,2,1,22.0,1,0,7.25,0,2,0
1,1,0,0,38.0,1,0,71.2833,3,0,1
2,2,2,0,26.0,0,0,7.925,0,2,1
3,3,0,0,35.0,1,0,53.1,3,2,1
4,4,2,1,35.0,0,0,8.05,0,2,0


In [17]:
# Optional checkpoint, currently disabled: save the table at this stage
# (categoricals integer-coded, age and fare not yet normalized).
#t.to_csv('../clean/titanic_clean.csv',
#         index=False)

### Normalize Age and Fare

In [18]:
# Report the statistics, then z-score the column: subtract the mean and
# divide by the standard deviation (pandas' default .std()).
print('   Mean Age = ' + str(t['age'].mean()))
print('Std Dev Age = ' + str(t['age'].std()))
t['age'] = t['age'].sub(t['age'].mean()).div(t['age'].std())

   Mean Age = 29.881137667304014
Std Dev Age = 12.883193243701994


In [19]:
# Report the statistics, then z-score the column: subtract the mean and
# divide by the standard deviation (pandas' default .std()).
print('   Mean fare = ' + str(t['fare'].mean()))
print('Std Dev fare = ' + str(t['fare'].std()))
t['fare'] = t['fare'].sub(t['fare'].mean()).div(t['fare'].std())

   Mean fare = 33.295479281345564
Std Dev fare = 51.73887903247135


In [20]:
# Preview the first five rows after normalization.
t.head(5)

Unnamed: 0,idx,class,sex,age,sibs,par/ch,fare,deck,embarked,survived
0,0,2,1,-0.611738,1,0,-0.503402,0,2,0
1,1,0,0,0.63019,1,0,0.734222,3,0,1
2,2,2,0,-0.301256,0,0,-0.490356,0,2,1
3,3,0,0,0.397329,1,0,0.382778,3,2,1
4,4,2,1,0.397329,0,0,-0.48794,0,2,0


In [21]:
# Row count per encoded passenger class, most frequent first.
t.loc[:, 'class'].value_counts()

2    709
0    323
1    277
Name: class, dtype: int64

In [22]:
# Optional checkpoint, currently disabled: save the table at this stage
# (age and fare normalized, deck and embarked not yet one-hot encoded).
#t.to_csv('../clean/titanic_clean_norm.csv',
#         index=False)

### One-Hot (Dummy) Encode Categoricals

In [23]:
print(t.shape)
# Expand deck and embarked into indicator columns, dropping the first level of
# each. dtype is pinned to uint8 so the indicators are written as 0/1 on every
# pandas version: from pandas 2.0 the default became bool, which would put
# True/False in the exported CSV.
t = pd.get_dummies(t,
                   columns=['deck',
                            'embarked'],
                   drop_first=True,
                   dtype=np.uint8)
# Fix the column order with the target last. The explicit list assumes the
# deck codes run 0-7 and the embarked codes 0-2, as produced earlier in the
# notebook.
t = t[['idx', 'class', 'sex', 'age', 'sibs', 'par/ch', 'fare',
       'deck_1', 'deck_2', 'deck_3', 'deck_4', 'deck_5', 'deck_6', 'deck_7',
       'embarked_1', 'embarked_2', 'survived']].copy()
print(t.shape)

(1309, 10)
(1309, 17)


In [24]:
# Persist the final normalized, one-hot-encoded table without the row index.
t.to_csv('../clean/titanic_clean_norm_onehot.csv', index=False)