# Read data

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [25]:
# Read the raw Titanic train/test CSV files; the *_orig frames are kept
# unmodified so later cells can derive working copies from them.
train_data_orig, test_data_orig = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/titanic/train.csv', '../data/titanic/test.csv')
)

In [26]:
# Peek at five random training rows. The seed is pinned so the preview shows
# the same rows on every Restart & Run All.
train_data_orig.sample(5, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
131,132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20.0,0,0,SOTON/O.Q. 3101307,7.05,,S
637,638,0,2,"Collyer, Mr. Harvey",male,31.0,1,1,C.A. 31921,26.25,,S
168,169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
306,307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C
206,207,0,3,"Backstrom, Mr. Karl Alfred",male,32.0,1,0,3101278,15.85,,S


In [27]:
# Peek at five random test rows. The seed is pinned so the preview shows
# the same rows on every Restart & Run All.
test_data_orig.sample(5, random_state=0)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
123,1015,3,"Carver, Mr. Alfred John",male,28.0,0,0,392095,7.25,,S
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C
58,950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S
98,990,3,"Braf, Miss. Elin Ester Maria",female,20.0,0,0,347471,7.8542,,S
385,1277,2,"Herman, Miss. Kate",female,24.0,1,2,220845,65.0,,S


In [28]:
# Discard the Cabin, Fare and Name columns from both splits.
# drop() returns new frames, so the *_orig frames are left intact.
unused_columns = ['Cabin', 'Fare', 'Name']
train_data = train_data_orig.drop(columns=unused_columns)
test_data = test_data_orig.drop(columns=unused_columns)

In [29]:
def label_age(row, *, young_below=20, old_from=40):
    """Bucket a passenger's age into a coarse categorical label.

    Parameters
    ----------
    row : object exposing an ``Age`` attribute
        In this notebook, a DataFrame row as passed by
        ``DataFrame.apply(label_age, axis=1)``.
    young_below : number, keyword-only, default 20
        Ages strictly below this threshold are labelled ``'young'``.
    old_from : number, keyword-only, default 40
        Ages at or above this threshold are labelled ``'old'``.

    Returns
    -------
    str
        ``'missing'`` when the age is absent (NaN, ``None`` or ``pd.NA``),
        otherwise ``'young'``, ``'normal'`` or ``'old'``.
    """
    age = row.Age
    # pd.isna also recognises None and pd.NA, for which np.isnan raises TypeError.
    if pd.isna(age):
        return 'missing'
    if age < young_below:
        return 'young'
    if age < old_from:
        return 'normal'
    return 'old'

In [30]:
# Add an `age_class` column to both splits by running label_age on every row.
for split in (train_data, test_data):
    split['age_class'] = split.apply(label_age, axis='columns')

In [31]:
# Number of rows in the training frame (reference point for the null counts that follow).
len(train_data)

891

In [32]:
# Per-column count of missing values in the training set
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Embarked         2
age_class        0
dtype: int64

In [33]:
# Number of rows in the test frame (reference point for the null counts that follow).
len(test_data)

418

In [34]:
# Per-column count of missing values in the test set
test_data.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Embarked        0
age_class       0
dtype: int64

# Pclass

In [35]:
# One-hot encode Pclass and replace the original column with the indicators.
# The prefix 'pclass_' combined with get_dummies' default '_' separator gives
# column names such as pclass__1 (double underscore).
train_pclass_dummies = pd.get_dummies(train_data['Pclass'], prefix='pclass_')
train_data = (
    pd.concat([train_data, train_pclass_dummies], axis='columns')
    .drop(columns=['Pclass'])
)

In [36]:
# Preview: Pclass should be gone, replaced by the pclass__* indicator columns.
train_data.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Ticket,Embarked,age_class,pclass__1,pclass__2,pclass__3
0,1,0,male,22.0,1,0,A/5 21171,S,normal,0,0,1
1,2,1,female,38.0,1,0,PC 17599,C,normal,1,0,0
2,3,1,female,26.0,0,0,STON/O2. 3101282,S,normal,0,0,1
3,4,1,female,35.0,1,0,113803,S,normal,1,0,0
4,5,0,male,35.0,0,0,373450,S,normal,0,0,1


In [37]:
# One-hot encode Pclass on the test split and replace the original column.
# The prefix 'pclass_' combined with get_dummies' default '_' separator gives
# column names such as pclass__1 (double underscore).
test_pclass_dummies = pd.get_dummies(test_data['Pclass'], prefix='pclass_')
test_data = (
    pd.concat([test_data, test_pclass_dummies], axis='columns')
    .drop(columns=['Pclass'])
)

In [38]:
# Preview: Pclass should be gone, replaced by the pclass__* indicator columns.
test_data.head()

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Embarked,age_class,pclass__1,pclass__2,pclass__3
0,892,male,34.5,0,0,330911,Q,normal,0,0,1
1,893,female,47.0,1,0,363272,S,old,0,0,1
2,894,male,62.0,0,0,240276,Q,old,0,1,0
3,895,male,27.0,0,0,315154,S,normal,0,0,1
4,896,female,22.0,1,1,3101298,S,normal,0,0,1


# Sex

In [39]:
# One-hot encode Sex into sex__* indicator columns and drop the original column.
train_sex_dummies = pd.get_dummies(train_data['Sex'], prefix='sex_')
train_data = (
    pd.concat([train_data, train_sex_dummies], axis='columns')
    .drop(columns=['Sex'])
)

In [40]:
# Preview: Sex should be gone, replaced by the sex__* indicator columns.
train_data.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Ticket,Embarked,age_class,pclass__1,pclass__2,pclass__3,sex__female,sex__male
0,1,0,22.0,1,0,A/5 21171,S,normal,0,0,1,0,1
1,2,1,38.0,1,0,PC 17599,C,normal,1,0,0,1,0
2,3,1,26.0,0,0,STON/O2. 3101282,S,normal,0,0,1,1,0
3,4,1,35.0,1,0,113803,S,normal,1,0,0,1,0
4,5,0,35.0,0,0,373450,S,normal,0,0,1,0,1


In [41]:
# One-hot encode Sex on the test split and drop the original column.
test_sex_dummies = pd.get_dummies(test_data['Sex'], prefix='sex_')
test_data = (
    pd.concat([test_data, test_sex_dummies], axis='columns')
    .drop(columns=['Sex'])
)

In [42]:
# Preview: Sex should be gone, replaced by the sex__* indicator columns.
test_data.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Ticket,Embarked,age_class,pclass__1,pclass__2,pclass__3,sex__female,sex__male
0,892,34.5,0,0,330911,Q,normal,0,0,1,0,1
1,893,47.0,1,0,363272,S,old,0,0,1,1,0
2,894,62.0,0,0,240276,Q,old,0,1,0,0,1
3,895,27.0,0,0,315154,S,normal,0,0,1,0,1
4,896,22.0,1,1,3101298,S,normal,0,0,1,1,0


# Embarked

In [43]:
# Embarked is missing for only two training passengers, so it is not imputed.
# get_dummies creates no NaN indicator by default, which leaves those rows
# with all from__* columns at zero.
train_embarked_dummies = pd.get_dummies(train_data['Embarked'], prefix='from_')
train_data = (
    pd.concat([train_data, train_embarked_dummies], axis='columns')
    .drop(columns=['Embarked'])
)

In [44]:
# Preview: Embarked should be gone, replaced by the from__* indicator columns.
train_data.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Ticket,age_class,pclass__1,pclass__2,pclass__3,sex__female,sex__male,from__C,from__Q,from__S
0,1,0,22.0,1,0,A/5 21171,normal,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,PC 17599,normal,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,STON/O2. 3101282,normal,0,0,1,1,0,0,0,1
3,4,1,35.0,1,0,113803,normal,1,0,0,1,0,0,0,1
4,5,0,35.0,0,0,373450,normal,0,0,1,0,1,0,0,1


In [45]:
# One-hot encode Embarked on the test split into from__* indicator columns
# and drop the original column.
test_embarked_dummies = pd.get_dummies(test_data['Embarked'], prefix='from_')
test_data = (
    pd.concat([test_data, test_embarked_dummies], axis='columns')
    .drop(columns=['Embarked'])
)

In [46]:
# Preview: Embarked should be gone, replaced by the from__* indicator columns.
test_data.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Ticket,age_class,pclass__1,pclass__2,pclass__3,sex__female,sex__male,from__C,from__Q,from__S
0,892,34.5,0,0,330911,normal,0,0,1,0,1,0,1,0
1,893,47.0,1,0,363272,old,0,0,1,1,0,0,0,1
2,894,62.0,0,0,240276,old,0,1,0,0,1,0,1,0
3,895,27.0,0,0,315154,normal,0,0,1,0,1,0,0,1
4,896,22.0,1,1,3101298,normal,0,0,1,1,0,0,0,1


# Age

In [47]:
# One-hot encode the age bucket into age__* indicator columns, then drop both
# the raw Age values and the intermediate age_class label.
train_age_dummies = pd.get_dummies(train_data['age_class'], prefix='age_')
train_data = (
    pd.concat([train_data, train_age_dummies], axis='columns')
    .drop(columns=['Age', 'age_class'])
)

In [48]:
# Preview: Age and age_class should be gone, replaced by the age__* indicator columns.
train_data.head()

Unnamed: 0,PassengerId,Survived,SibSp,Parch,Ticket,pclass__1,pclass__2,pclass__3,sex__female,sex__male,from__C,from__Q,from__S,age__missing,age__normal,age__old,age__young
0,1,0,1,0,A/5 21171,0,0,1,0,1,0,0,1,0,1,0,0
1,2,1,1,0,PC 17599,1,0,0,1,0,1,0,0,0,1,0,0
2,3,1,0,0,STON/O2. 3101282,0,0,1,1,0,0,0,1,0,1,0,0
3,4,1,1,0,113803,1,0,0,1,0,0,0,1,0,1,0,0
4,5,0,0,0,373450,0,0,1,0,1,0,0,1,0,1,0,0


In [49]:
# One-hot encode the age bucket on the test split, then drop both the raw Age
# values and the intermediate age_class label.
test_age_dummies = pd.get_dummies(test_data['age_class'], prefix='age_')
test_data = (
    pd.concat([test_data, test_age_dummies], axis='columns')
    .drop(columns=['Age', 'age_class'])
)

In [50]:
# Preview: Age and age_class should be gone, replaced by the age__* indicator columns.
test_data.head()

Unnamed: 0,PassengerId,SibSp,Parch,Ticket,pclass__1,pclass__2,pclass__3,sex__female,sex__male,from__C,from__Q,from__S,age__missing,age__normal,age__old,age__young
0,892,0,0,330911,0,0,1,0,1,0,1,0,0,1,0,0
1,893,1,0,363272,0,0,1,1,0,0,0,1,0,0,1,0
2,894,0,0,240276,0,1,0,0,1,0,1,0,0,0,1,0
3,895,0,0,315154,0,0,1,0,1,0,0,1,0,1,0,0
4,896,1,1,3101298,0,0,1,1,0,0,0,1,0,1,0,0


# Ticket

In [51]:
# Exploratory only: the result is displayed, not assigned, so train_data is unchanged.
# One-hot encoding the raw Ticket strings yields one indicator column per distinct
# ticket value, i.e. a very wide and sparse frame.
# NOTE(review): the test split is encoded nowhere in this cell; if this encoding is
# ever kept, the train/test column sets would need to be aligned - confirm downstream.
pd.get_dummies(train_data.Ticket)

Unnamed: 0,110152,110413,110465,110564,110813,111240,111320,111361,111369,111426,...,STON/O2. 3101290,SW/PP 751,W./C. 14258,W./C. 14263,W./C. 6607,W./C. 6608,W./C. 6609,W.E.P. 5734,W/C 14208,WE/P 5735
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
889,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
