### Setup 

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import shutil
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
import pandas as pd
import numpy as np
from sklearn import metrics
from fastai.structured import *
from fastai.column_data import *
import warnings

  from numpy.core.umath_tests import inner1d


In [3]:
warnings.filterwarnings('ignore')
torch.cuda.is_available()

True

In [4]:
PATH = '../data/titanic'; 

### Data inspection 

In [5]:
training_df = pd.read_csv(f'{PATH}/train.csv')
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
training_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
test_df = pd.read_csv(f'{PATH}/test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


### Feature engineering 

#### Add title 

In [9]:
title_series = training_df.apply(lambda x: x.Name.split(', ')[1].split(' ')[0] , axis=1)
training_df.insert(len(training_df.columns), 'Title', title_series)
test_title_series = test_df.apply(lambda x: x.Name.split(', ')[1].split(' ')[0] , axis=1)
test_df.insert(len(test_df.columns), 'Title', test_title_series)

In [10]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.


In [11]:
training_df.Title.value_counts()

Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Major.         2
Col.           2
Mlle.          2
Mme.           1
the            1
Jonkheer.      1
Sir.           1
Don.           1
Lady.          1
Capt.          1
Ms.            1
Name: Title, dtype: int64

In [12]:
training_df[training_df.Title =='the']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S,the


In [13]:
def collapse_title(row):
    title = row.Title
    if title in ['Dr.', 'Don.', 'Sir.']:
        return 'Mr.'
    elif title in ['Mlle.', 'the', 'Ms.', 'Lady.']:
        return 'Miss.'
    elif title in ['Mme.']:
        return 'Mrs.'
    elif title in ['Rev.', 'Major.', 'Col.','Jonkheer.', 'Capt.']:
        return 'Mr.' if row.Sex == 'male' else 'Mrs.'
    return title

In [14]:
training_df.Title = training_df.apply(collapse_title, axis=1)

In [15]:
training_df.Title.value_counts()

Mr.        538
Miss.      187
Mrs.       126
Master.     40
Name: Title, dtype: int64

#### Add Family

In [16]:
family_size_series = training_df.apply(lambda x: x.SibSp + x.Parch + 1, axis=1)
training_df.insert(len(training_df.columns), 'Family_count', family_size_series)
test_family_size_series = test_df.apply(lambda x: x.SibSp + x.Parch + 1, axis=1)
test_df.insert(len(test_df.columns), 'Family_count', test_family_size_series)

In [17]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1


In [18]:
training_df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             177
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
Title             0
Family_count      0
dtype: int64

In [19]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1


In [20]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs.,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs.,3


In [21]:
training_df.to_feather(f'{PATH}/proc_train')
test_df.to_feather(f'{PATH}/proc_test')

In [22]:
training_df = pd.read_feather(f'{PATH}/proc_train');
test_df = pd.read_feather(f'{PATH}/proc_test');

#### Fill age

In [23]:
training_df.insert(len(training_df.columns), 'Is_age_null', training_df.Age.isnull())
test_df.insert(len(test_df.columns), 'Is_age_null', test_df.Age.isnull())

In [24]:
training_df['Age'] = training_df.Age.fillna(training_df.Age.median())
test_df['Age'] = training_df.Age.fillna(training_df.Age.median())

In [25]:
training_df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          2
Title             0
Family_count      0
Is_age_null       0
dtype: int64

In [26]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1,False


#### Cabin

In [27]:
training_df.Cabin.unique()

array([None, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27', 'B78', 'D33', 'B30', 'C52',
       'B28', 'C83', 'F33', 'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69',
       'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4', 'B80', 'A31', 'D36', 'D15',
       'C93', 'C78', 'D35', 'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91',
       'E40', 'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111', 'C92',
       'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19', 'D50', 'D9',
       'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101'

In [28]:
cabin_header = training_df.apply(lambda x: x.Cabin[0] if x.Cabin else 'Unknown', axis=1)
training_df.insert(len(training_df.columns), 'Cabin_header', cabin_header)
cabin_header = test_df.apply(lambda x: x.Cabin[0] if x.Cabin else 'Unknown', axis=1)
test_df.insert(len(test_df.columns), 'Cabin_header', cabin_header)

In [29]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,False,Unknown
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,False,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,False,Unknown
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,False,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1,False,Unknown


#### Embark

In [30]:
training_df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [31]:
training_df['Embarked'] = training_df.Embarked.fillna(training_df.Embarked.mode()[0])
test_df['Embarked'] = test_df.Embarked.fillna(test_df.Embarked.mode()[0])

In [32]:
training_df.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin           687
Embarked          0
Title             0
Family_count      0
Is_age_null       0
Cabin_header      0
dtype: int64

In [33]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,False,Unknown
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,False,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,False,Unknown
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,False,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1,False,Unknown


In [34]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,False,Unknown
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,False,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,False,Unknown
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,False,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,1,False,Unknown


In [35]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header
0,892,3,"Kelly, Mr. James",male,22.0,0,0,330911,7.8292,,Q,Mr.,1,False,Unknown
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,38.0,1,0,363272,7.0,,S,Mrs.,2,False,Unknown
2,894,2,"Myles, Mr. Thomas Francis",male,26.0,0,0,240276,9.6875,,Q,Mr.,1,False,Unknown
3,895,3,"Wirz, Mr. Albert",male,35.0,0,0,315154,8.6625,,S,Mr.,1,False,Unknown
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,35.0,1,1,3101298,12.2875,,S,Mrs.,3,False,Unknown


In [36]:
training_df.to_feather(f'{PATH}/proc_train')
test_df.to_feather(f'{PATH}/proc_test')

In [37]:
training_df = pd.read_feather(f'{PATH}/proc_train');
test_df = pd.read_feather(f'{PATH}/proc_test');

#### Sex

In [38]:
training_df['Sex'] = training_df['Sex'] == 'male'
test_df['Sex'] = test_df['Sex'] == 'male'

In [39]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header
0,1,0,3,"Braund, Mr. Owen Harris",True,22.0,1,0,A/5 21171,7.25,,S,Mr.,2,False,Unknown
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",False,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,False,C
2,3,1,3,"Heikkinen, Miss. Laina",False,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,1,False,Unknown
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",False,35.0,1,0,113803,53.1,C123,S,Mrs.,2,False,C
4,5,0,3,"Allen, Mr. William Henry",True,35.0,0,0,373450,8.05,,S,Mr.,1,False,Unknown


#### Ticket

In [40]:
training_df.Ticket.unique()

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909',
       '347742', '237736', 'PP 9549', '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788', '347077', '2631', '19950',
       '330959', '349216', 'PC 17601', 'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
       'A./5. 2152', '345764', '2651', '7546', '11668', '349253', 'SC/Paris 2123', '330958',
       'S.C./A.4. 23567', '370371', '14311', '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
       '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144', '2669', '113572', '36973', '347088',
       'PC 17605', '2661', 'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111', 'S.O.C. 14879',
       '2680', '1601', '348123', '349208', '374746', '248738', '364516', '345767', '345779', '330932',
       '113059', 'SO/C 14885', '3101278', 'W./C. 6608', 'SOTON/OQ 392086

In [41]:
new_ticket = training_df.apply(lambda x: 'Number' if len(x.Ticket.split(' ')) == 1 else x.Ticket.split(' ')[0], axis=1)
training_df.insert(len(training_df.columns), 'Ticket_header', new_ticket)
test_new_ticket = test_df.apply(lambda x: 'Number' if len(x.Ticket.split(' ')) == 1 else x.Ticket.split(' ')[0], axis=1)
test_df.insert(len(test_df.columns), 'Ticket_header', test_new_ticket)

In [42]:
training_df.Ticket_header.unique()

array(['A/5', 'PC', 'STON/O2.', 'Number', 'PP', 'A/5.', 'C.A.', 'A./5.', 'SC/Paris', 'S.C./A.4.', 'A/4.',
       'CA', 'S.P.', 'S.O.C.', 'SO/C', 'W./C.', 'SOTON/OQ', 'W.E.P.', 'STON/O', 'A4.', 'C', 'SOTON/O.Q.',
       'SC/PARIS', 'S.O.P.', 'A.5.', 'Fa', 'CA.', 'F.C.C.', 'W/C', 'SW/PP', 'SCO/W', 'P/PP', 'SC', 'SC/AH',
       'A/S', 'A/4', 'WE/P', 'S.W./PP', 'S.O./P.P.', 'F.C.', 'SOTON/O2', 'S.C./PARIS', 'C.A./SOTON'],
      dtype=object)

In [43]:
def collapse_ticket_header(row):
    ticket_header = row.Ticket_header
    if ticket_header in ['A/5', 'A/5.', 'A./5.', 'A.5.', 'A/S']:
        return 'A/5'
    elif ticket_header in ['A/4.', 'A4.', 'A/4.']:
        return 'A/4'
    elif ticket_header in ['S.O.C.', 'SO/C']:
        return 'SO/C'
    return ticket_header

In [44]:
training_df['Ticket_header'] = training_df.apply(collapse_ticket_header, axis=1)
test_df['Ticket_header'] = test_df.apply(collapse_ticket_header, axis=1)

In [45]:
training_df = training_df.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

In [46]:
training_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header,Ticket_header
0,0,3,True,22.0,1,0,7.25,,S,Mr.,2,False,Unknown,A/5
1,1,1,False,38.0,1,0,71.2833,C85,C,Mrs.,2,False,C,PC
2,1,3,False,26.0,0,0,7.925,,S,Miss.,1,False,Unknown,STON/O2.
3,1,1,False,35.0,1,0,53.1,C123,S,Mrs.,2,False,C,Number
4,0,3,True,35.0,0,0,8.05,,S,Mr.,1,False,Unknown,Number


In [47]:
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header,Ticket_header
0,3,True,22.0,0,0,7.8292,,Q,Mr.,1,False,Unknown,Number
1,3,False,38.0,1,0,7.0,,S,Mrs.,2,False,Unknown,Number
2,2,True,26.0,0,0,9.6875,,Q,Mr.,1,False,Unknown,Number
3,3,True,35.0,0,0,8.6625,,S,Mr.,1,False,Unknown,Number
4,3,False,35.0,1,1,12.2875,,S,Mrs.,3,False,Unknown,Number


In [48]:
training_df.to_feather(f'{PATH}/proc_train')
test_df.to_feather(f'{PATH}/proc_test')

In [49]:
training_df = pd.read_feather(f'{PATH}/proc_train');
test_df = pd.read_feather(f'{PATH}/proc_test');

### Deep Learning

In [50]:
cat_vars = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked', 'Title', 'Family_count', 'Is_age_null', 'Cabin_header', 'Ticket_header']
cont_vars = ['Fare', 'Age']

In [51]:
cv_idxs = get_cv_idxs(len(training_df), val_pct=0.2)

In [52]:
for v in cat_vars:
    training_df[v] = training_df[v].astype('category').cat.as_ordered()
apply_cats(test_df, training_df)

In [53]:
for v in cont_vars:
    training_df[v] = training_df[v].astype('float32')
    test_df[v] = test_df[v].astype('float32')
test_df['Survived'] = 0

In [54]:
df, y, nas, mapper = proc_df(training_df, 'Survived', do_scale=True)

In [55]:
test_df, _, test_nas, test_mapper = proc_df(test_df, 'Survived', do_scale=True, mapper=mapper, na_dict=nas)

In [56]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header,Ticket_header
0,3,2,-0.565736,2,1,-0.502445,0,3,3,2,1,9,2
1,1,1,0.663861,2,1,0.786845,82,1,4,2,1,3,13
2,3,1,-0.258337,1,1,-0.488854,0,3,2,1,1,9,31
3,1,1,0.433312,2,1,0.42073,56,3,4,2,1,3,11
4,3,2,0.433312,1,1,-0.486337,0,3,3,1,1,9,11


In [57]:
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Family_count,Is_age_null,Cabin_header,Ticket_header,Fare_na
0,3,2,-0.565736,1,1,-0.490783,0,2,3,1,1,9,11,False
1,3,1,0.663861,2,1,-0.507479,0,3,4,2,1,9,11,False
2,2,2,-0.258337,1,1,-0.453367,0,2,3,1,1,9,11,False
3,3,2,0.433312,1,1,-0.474005,0,3,3,1,1,9,11,False
4,3,1,0.433312,2,2,-0.401017,0,3,4,3,1,9,11,False


In [58]:
test_df = test_df.drop('Fare_na', axis=1)

In [59]:
md = ColumnarModelData.from_data_frame(PATH, val_idxs=cv_idxs, df=df, y=y.astype('float32'), cat_flds=cat_vars, test_df=test_df, bs=128)

In [60]:
col_name_freq = [(c, len(training_df[c].cat.categories)) for c in cat_vars]
col_name_freq

[('Pclass', 3),
 ('Sex', 2),
 ('SibSp', 7),
 ('Parch', 7),
 ('Cabin', 147),
 ('Embarked', 3),
 ('Title', 4),
 ('Family_count', 9),
 ('Is_age_null', 2),
 ('Cabin_header', 9),
 ('Ticket_header', 36)]

In [61]:
emb_szs = [(c, min(50, (c+1)//2)) for _, c in col_name_freq]
emb_szs

[(3, 2),
 (2, 1),
 (7, 4),
 (7, 4),
 (147, 50),
 (3, 2),
 (4, 2),
 (9, 5),
 (2, 1),
 (9, 5),
 (36, 18)]

In [62]:
learner = md.get_learner(emb_szs, len(df.columns)-len(cat_vars), 0.04, 1, [1000], [0.01], y_range=[0, 1])

In [63]:
learning_rate = 5e-4

In [64]:
learner.fit(learning_rate, 2)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                                                                                                                                                                    
    0      0.242753   0.229328  
    1      0.230014   0.215239                                                                                                                                                                                    



[0.2152388095855713]

In [65]:
learner.fit(learning_rate, 5, cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                                                                                                                                                                    
    0      0.209158   0.209326  
    1      0.205466   0.199785                                                                                                                                                                                    
    2      0.20103    0.191548                                                                                                                                                                                    
    3      0.196718   0.184634                                                                                                                                                                                    
    4      0.192262   0.17925                                                                                              

[0.17924964427947998]

In [66]:
learner.save('titanic')

In [67]:
learner.load('titanic')

In [68]:
learner.fit(learning_rate, 4, cycle_len=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=8), HTML(value='')))

epoch      trn_loss   val_loss                                                                                                                                                                                    
    0      0.176596   0.174662  
    1      0.173454   0.172353                                                                                                                                                                                    
    2      0.17056    0.165453                                                                                                                                                                                    
    3      0.168884   0.163957                                                                                                                                                                                    
    4      0.166384   0.159096                                                                                             

[0.15275637805461884]

In [69]:
learner.save('titanic')

In [70]:
learner.load('titanic')

In [71]:
learner.fit(learning_rate, 3, cycle_len=1, cycle_mult=2)

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss                                                                                                                                                                                    
    0      0.146812   0.150091  
    1      0.143854   0.146794                                                                                                                                                                                    
    2      0.142621   0.146009                                                                                                                                                                                    
    3      0.140516   0.143701                                                                                                                                                                                    
    4      0.138802   0.141718                                                                                             

[0.14052888751029968]

In [72]:
learner.fit(learning_rate, 5, cycle_len=1, cycle_mult=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                                                                                                                                                                                    
    0      0.132976   0.139462  
    1      0.130952   0.138434                                                                                                                                                                                    
    2      0.128296   0.137945                                                                                                                                                                                    
    3      0.126939   0.13715                                                                                                                                                                                     
    4      0.124465   0.136395                                                                                             

[0.1363946795463562]

In [73]:
learner.save('titanic')

In [74]:
learner.load('titanic')

In [75]:
pred = learner.predict(is_test=True)
pred_sig = np.where(pred > 0.5, 1, 0).flatten()
pred_sig

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 

In [76]:
submit_df = pd.read_csv(f'{PATH}/test.csv')
submit_df.insert(len(submit_df.columns), 'Survived', pred_sig)

In [77]:
submit_df[['PassengerId', 'Survived']].to_csv(f'{PATH}/submit.gz', compression="gzip", index=False)

In [78]:
FileLink(f'{PATH}/submit.gz')