# Classification of the titanic dataset 

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, make_scorer, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import preprocessing
from scipy.stats import randint as sp_randint
import seaborn as sns
import matplotlib.pyplot as plt

from functools import partial

# Read in the data

In [2]:
raw_data = pd.read_csv('titanic_data.csv')

# Select the features to use

We know from the EDA work (notebook 01), that the following features are likely to be predictive. 

In [3]:
data = raw_data[['Survived', 'Pclass','Sex',
                 'Age','SibSp', 'Parch', 'Cabin',
                 'Fare', 'Name', 'Embarked', 'Ticket']]
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Fare,Name,Embarked,Ticket
0,0,3,male,22.0,1,0,,7.25,"Braund, Mr. Owen Harris",S,A/5 21171
1,1,1,female,38.0,1,0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C,PC 17599
2,1,3,female,26.0,0,0,,7.925,"Heikkinen, Miss. Laina",S,STON/O2. 3101282
3,1,1,female,35.0,1,0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",S,113803
4,0,3,male,35.0,0,0,,8.05,"Allen, Mr. William Henry",S,373450


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Cabin       204 non-null object
Fare        891 non-null float64
Name        891 non-null object
Embarked    889 non-null object
Ticket      891 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


#### cleaning issues
- the sex column needs to be changed into a numerical form.
- Age, cabin, and embarked have some missing values that need to be dealt with

#### feature engineering
- extract title from Name
- get deck from cabin
- family size using SibSp and Parch
- ticket frequency to capture information about groups

In [5]:
# Change Sex to a binary
data['Sex'] = data['Sex'].replace(
    {'male': 1, 'female':0});

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# fill missing values for embarked by the most frequent value
data['Embarked'] = data['Embarked'].fillna('S');

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
# feature engineering: we will add a variable called family
data['family'] = data['Parch'] + data['SibSp']

# generate bins for different sized families
def family_bin(family):
    if family > 5:
        return 'large'
    elif family > 3:
        return 'medium'
    elif family > 0:
        return 'small'
    else:
        return 'alone'
        

data['family'] = data['family'].apply(family_bin)

# extract deck from cabin, where there is a null value replace is with u for unknown
data['Cabin'].fillna('U', inplace=True)
data['Deck'] = data['Cabin'].apply(lambda x: x[0])

# we will also add a variable called ticket frequency, to capture information about groups of tickets
data['Ticket_Frequency'] = data.groupby('Ticket')['Ticket'].transform('count')


data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Fare,Name,Embarked,Ticket,family,Deck,Ticket_Frequency
0,0,3,1,22.0,1,0,U,7.25,"Braund, Mr. Owen Harris",S,A/5 21171,small,U,1
1,1,1,0,38.0,1,0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C,PC 17599,small,C,1
2,1,3,0,26.0,0,0,U,7.925,"Heikkinen, Miss. Laina",S,STON/O2. 3101282,alone,U,1
3,1,1,0,35.0,1,0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",S,113803,small,C,2
4,0,3,1,35.0,0,0,U,8.05,"Allen, Mr. William Henry",S,373450,alone,U,1


In [8]:
# we can use the name feature to extract each persons title, this will not only give us gender information, but also information about marrage, status, and age.
data['Title'] = data.Name.str.extract(
    ' ([A-Za-z]+)\.', expand=False)

titles = {
    'Lady': 'Rare',
    'Countess': 'Rare',
    'Capt': 'Rare',
    'Col': 'Rare',
    'Don': 'Rare',
    'Dr': 'Rare',
    'Major': 'Rare',
    'Rev': 'Rare',
    'Sir': 'Rare',
    'Jonkheer': 'Rare',
    'Dona': 'Rare',
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

# extract titles
data['Title'].replace(titles, inplace=True)

In [9]:
# we will not just replace age with the median value
# we will group by other properties to find a set of more meaningful median values
data_grouped = data.groupby(['Pclass', 'Sex', 'Title'])
data_grouped = data_grouped.median(skipna=None)
data_grouped = data_grouped.reset_index()[['Pclass', 'Sex', 'Title', 'Age']]
data_grouped

Unnamed: 0,Pclass,Sex,Title,Age
0,1,0,Miss,30.0
1,1,0,Mrs,40.0
2,1,0,Rare,48.0
3,1,1,Master,4.0
4,1,1,Mr,40.0
5,1,1,Rare,49.0
6,2,0,Miss,24.0
7,2,0,Mrs,32.0
8,2,1,Master,1.0
9,2,1,Mr,31.0


In [10]:
# we can see the median varies a lot when assosiated when we could by other features

In [11]:
def fill_age(grouped, row):
    c = row['Pclass']
    s = row['Sex']
    t = row['Title']
    age = grouped.loc[(grouped['Pclass'] == c) &
                       (grouped['Sex'] == s) & 
                       (grouped['Title'] == t), 'Age'].values[0]
    
    return age

fill_age_partial = partial(fill_age, data_grouped)

data['Age']  = data.apply(lambda x: fill_age_partial(x) if np.isnan(x['Age']) else x['Age'], axis=1)

In [12]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Fare,Name,Embarked,Ticket,family,Deck,Ticket_Frequency,Title
0,0,3,1,22.0,1,0,U,7.25,"Braund, Mr. Owen Harris",S,A/5 21171,small,U,1,Mr
1,1,1,0,38.0,1,0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",C,PC 17599,small,C,1,Mrs
2,1,3,0,26.0,0,0,U,7.925,"Heikkinen, Miss. Laina",S,STON/O2. 3101282,alone,U,1,Miss
3,1,1,0,35.0,1,0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",S,113803,small,C,2,Mrs
4,0,3,1,35.0,0,0,U,8.05,"Allen, Mr. William Henry",S,373450,alone,U,1,Mr


In [13]:
# drop columns we no longer need
data.drop(['Parch', 'Sex', 'SibSp',
           'Name', 'Cabin',
           'Ticket'],
          axis=1, inplace=True)

In [14]:
data.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,family,Deck,Ticket_Frequency,Title
0,0,3,22.0,7.25,S,small,U,1,Mr
1,1,1,38.0,71.2833,C,small,C,1,Mrs
2,1,3,26.0,7.925,S,alone,U,1,Miss
3,1,1,35.0,53.1,S,small,C,2,Mrs
4,0,3,35.0,8.05,S,alone,U,1,Mr


In [15]:
# one hot encode our catagorical variables
data = pd.get_dummies(data)
data.drop([ 'Deck_U'], axis=1, inplace=True)

In [16]:
data.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Ticket_Frequency,Embarked_C,Embarked_Q,Embarked_S,family_alone,family_large,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,22.0,7.25,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,38.0,71.2833,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,3,26.0,7.925,1,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,1,35.0,53.1,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,3,35.0,8.05,1,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
# split our data into a training and testing set
y = data['Survived']
X = data.drop(['Survived'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# model hypertunning

In [18]:
# Number of trees in random forest
n_estimators = sp_randint(1, 1000)

#Number of features to consider at every split
max_features = sp_randint(1, 10)

# Minimum number of samples required to split a node

min_samples_split = sp_randint(2,100)

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
search_params = {'n_estimators': n_estimators,
                 'max_features': max_features,
                 'min_samples_split': min_samples_split,
                 'bootstrap': bootstrap}

In [None]:
grid_search=RandomizedSearchCV(RandomForestClassifier(),
                               n_iter=100,
                               param_distributions=search_params,
                               cv = 3,
                               n_jobs=-1)

In [None]:
grid_search.fit(X_train,y_train);

In [None]:
print('The best Params to use are \n', grid_search.best_params_ )

print('Which gives a best cross validation score of',grid_search.best_score_)

In [None]:
best_params = grid_search.best_params_

random_forest = RandomForestClassifier(**best_params)


best_model = grid_search.best_estimator_
                        
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

### understanding your model

In [None]:
# we can look at the feature importance to get some insight about how our model is using features
fig, ax = plt.subplots(figsize=(20,10))
ax.bar

importances = best_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_model.estimators_],
             axis=0)

# indices = np.argsort(importances)[::-1]
indices = np.argsort(importances)

# Plot the feature importances of the forest

ax.set_title("Feature importances", fontsize=30)
ax.barh(X_train.columns[indices], importances[indices],
        color="r", yerr=std[indices], align="center")
ax.set_ylabel('Feature',fontsize=30)
ax.tick_params(labelsize=15)