# kaggle Titanic Dataset

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [3]:
PATH = '../data/titanic'

In [4]:
!ls {PATH}

test.csv  train.csv


In [13]:
# Load the dataset
train = pd.read_csv(f'{PATH}/train.csv', index_col='PassengerId')
test = pd.read_csv(f'{PATH}/test.csv', index_col='PassengerId')
print(train.shape)
print(test.shape)
train.head().T

(891, 11)
(418, 10)


PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin,,C85,,C123,


In [14]:
# save dependent label and drop from training set
y = train['Survived']
train.drop('Survived', axis=1, inplace=True)

print(train.shape)
print(y.shape)

(891, 10)
(891,)


In [15]:
# combine train and test sets for processing
full = pd.concat([train, test])
print(full.shape)
full.tail().T

(1309, 10)


PassengerId,1305,1306,1307,1308,1309
Pclass,3,1,3,3,3
Name,"Spector, Mr. Woolf","Oliva y Ocana, Dona. Fermina","Saether, Mr. Simon Sivertsen","Ware, Mr. Frederick","Peter, Master. Michael J"
Sex,male,female,male,male,male
Age,,39,38.5,,
SibSp,0,0,0,0,1
Parch,0,0,0,0,1
Ticket,A.5. 3236,PC 17758,SOTON/O.Q. 3101262,359309,2668
Fare,8.05,108.9,7.25,8.05,22.3583
Cabin,,C105,,,
Embarked,S,C,S,S,C


In [16]:
full.dtypes

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [17]:
# drop 'Name' and 'Ticket' columns
data = full.copy()
data.drop(['Name', 'Ticket'], axis=1, inplace=True)

data.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [18]:
# convert string columns into categories using fastai 'train_cats' method
train_cats(data)

In [19]:
data.dtypes

Pclass         int64
Sex         category
Age          float64
SibSp          int64
Parch          int64
Fare         float64
Cabin       category
Embarked    category
dtype: object

In [20]:
data.head().T

PassengerId,1,2,3,4,5
Pclass,3,1,3,1,3
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin,,C85,,C123,
Embarked,S,C,S,S,S


In [21]:
# determine the percentage of missing values
data.isnull().sum().sort_index()/len(train)

Age         0.295174
Cabin       1.138047
Embarked    0.002245
Fare        0.001122
Parch       0.000000
Pclass      0.000000
Sex         0.000000
SibSp       0.000000
dtype: float64

In [23]:
# impute the missing values using fastai 'proc_df' method
df, _, _ = proc_df(data)

print(df.shape)
df.head().T

(1309, 10)


PassengerId,1,2,3,4,5
Pclass,3,1,3,1,3
Sex,2,1,1,1,2
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin,0,107,0,71,0
Embarked,3,1,3,3,3
Age_na,False,False,False,False,False
Fare_na,False,False,False,False,False


In [24]:
# separate train and test sets
train_subset = df[:891]
test_subset = df[891:]

print(train_subset.shape)
print(test_subset.shape)

(891, 10)
(418, 10)


## Train model and make predictions

In [25]:
from sklearn.metrics import mean_squared_error

m = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=42)
m.fit(train_subset, y)
pred = m.predict(train_subset)

mse = mean_squared_error(y, pred)
print('RMSE {:.2f}'.format(np.sqrt(mse)))

RMSE 0.12


In [30]:
test.index

Int64Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
            ...
            1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
           dtype='int64', name='PassengerId', length=418)

In [31]:
# test predictions
test_pred = m.predict(test_subset)

# create submission file
df = pd.DataFrame()
df['PassengerId'] = test.index
df['Survived'] = test_pred
df.to_csv('./predictions/test_results.csv', index=False)

Submission scored 0.74162