# Titanic-DS
## Train Model

In [None]:
# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Import 3rd party libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Local imports
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from titanic.data.type_conversions import sex_type_conversion, embarked_type_conversion

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import Dataset

In [5]:
# Root project path
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Load csv file
data = pd.read_csv(os.path.join(root_path, 'data', 'raw', 'titanic_data.csv'))

# Make column names lower case
data.columns = [col.lower() for col in data.columns]

# View DataFrame
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# View data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
passengerid    891 non-null int64
survived       891 non-null int64
pclass         891 non-null int64
name           891 non-null object
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
ticket         891 non-null object
fare           891 non-null float64
cabin          204 non-null object
embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
# Print missing data
data.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [8]:
# List columns to drop
drop_cols = ['passengerid', 'name', 'ticket' 'cabin']

# Drop columns
data = data[[col for col in data.columns if not any(col in drop_col for drop_col in drop_cols)]]

# View DataFrame
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Type Conversions

In [9]:
# Sex
data['sex'] = data['sex'].map(lambda sex: sex_type_conversion(sex=sex))

# Embarked
data['embarked'] = data['embarked'].map(lambda embarked: embarked_type_conversion(embarked=embarked))

# View DataFrame
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,0,22.0,1,0,7.25,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,0.0
3,1,1,1,35.0,1,0,53.1,0.0
4,0,3,0,35.0,0,0,8.05,0.0


# Collect Features for Training

In [10]:
# Feature set
X = data.loc[:, [col for col in data.columns if col != 'survived']]

# View DataFrame
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,0,22.0,1,0,7.25,0.0
1,1,1,38.0,1,0,71.2833,1.0
2,3,1,26.0,0,0,7.925,0.0
3,1,1,35.0,1,0,53.1,0.0
4,3,0,35.0,0,0,8.05,0.0


In [11]:
# Training target
y = data.loc[:, ['survived']]

# View DataFrame
y.head()

Unnamed: 0,survived
0,0
1,1
2,1
3,1
4,0


In [12]:
# Split for testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Train Model

In [13]:
# Fit imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X)

# Get classifier
model = RandomForestClassifier(n_estimators=100)

# Train classifier
model.fit(imputer.transform(X_train), y_train)

# Get prediction on test dataset
y_pred = model.predict(imputer.transform(X_test))

# Print model test accuracy
print('Model accuracy is {} %'.format(np.round(accuracy_score(y_test, y_pred) * 100), 2))

Model accuracy is 83.0 %


# Save Model

In [15]:
# Save path
save_path = os.path.join(os.getcwd(), 'models', 'production')

# Pickle model
pickle.dump(model, open(os.path.join(save_path, 'model.pickle'), 'wb'))