In [18]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC, LinearSVC
warnings.filterwarnings('ignore')
%matplotlib inline

In [27]:
# Input and output CSV paths used by the cells below
infile = './data/titanic_train_clean.csv'
outfile = './data/titanic_engineered.csv'

# Honorifics grouped by the numeric category they are encoded as
_titles_by_code = {
    1: ['Mr'],                                  # General adult male
    2: ['Mrs', 'Mme', 'Ms'],                    # General adult female
    3: ['Miss', 'Mlle'],                        # General young female
    4: ['Master'],                              # General young male
    5: ['Don', 'Sir', 'Jonkheer'],              # Noble male
    6: ['Rev', 'Dr', 'Major', 'Col', 'Capt'],   # Professional
    7: ['Lady', 'Countess'],                    # Noble female
}

# Flat lookup: title string -> category code
title_codes = {
    honorific: code
    for code, honorifics in _titles_by_code.items()
    for honorific in honorifics
}

In [17]:
# Simple Model

def test_models(features, label, cv=50, random_state=0):
    '''
    Cross-validates a fixed set of baseline classifiers and prints the
    mean accuracy of each as a percentage.

    The classifiers are Logistic Regression, SVC, Linear SVC, Gaussian
    Naive Bayes and Random Forest, evaluated in that order.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature matrix passed to every estimator.
    label : array-like
        Target values. A single-column DataFrame is accepted and is
        flattened to 1-D before fitting.
    cv : int, default 50
        Number of cross-validation folds passed to cross_val_score.
    random_state : int, default 0
        Seed given to the estimators that accept one, so repeated runs
        print the same scores.

    Returns
    -------
    None
        Results are printed, one line per model.
    '''
    # scikit-learn estimators expect a 1-D target; a (n, 1) column vector
    # triggers a conversion warning on every fit.
    target = np.ravel(label)

    # (display name, estimator) pairs, evaluated in this order
    candidates = [
        ('Logistic Regression', LogisticRegression(random_state=random_state)),
        ('SVC', SVC()),
        ('Linear SVC', LinearSVC(random_state=random_state)),
        ('Naive Bayes', GaussianNB()),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
    ]

    for name, model in candidates:
        scores = cross_val_score(model, features, target, cv=cv)
        print('{} Performance: {}%'.format(name, round(scores.mean()*100, 2)))
    

In [4]:
# Read the input CSV and discard the index column that was saved with it
df = pd.read_csv(infile).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,3.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1.0


In [5]:
# Basic variable creation
# family_size = sibsp + parch + 1; the cabin column is discarded
df = (
    df
    .assign(family_size=lambda frame: frame['sibsp'] + frame['parch'] + 1)
    .drop(columns=['cabin'])
)

# Creation of title variable
# A title is a run of letters preceded by a space and followed by a period.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\.')

def title(row):
    '''
    Extracts the title (e.g. "Mr", "Miss") from a row's 'name' field.

    Parameters
    ----------
    row : mapping or pandas Series
        Must provide a 'name' entry.

    Returns
    -------
    str
        The first title found in the name, or "" when the name is not a
        string (e.g. missing) or contains no title.
    '''
    name = row['name']
    # Missing names come through as NaN (a float); treat them as "no title"
    if not isinstance(name, str):
        return ""
    title_search = _TITLE_PATTERN.search(name)
    # If a title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

# Extract each passenger's title and encode it with title_codes.
# The result is assigned back explicitly: calling replace(..., inplace=True)
# on df['title'] acts on a column selection and does not update df under
# pandas Copy-on-Write. Titles missing from title_codes are left unchanged.
df['title'] = df.apply(title, axis=1).replace(title_codes)

# Inferring missing age values from title
# Age substituted for a missing value, keyed by title code
_DEFAULT_AGE_BY_TITLE = {
    1: 30,  # Mr
    2: 35,  # Mrs
    3: 21,  # Miss
    4: 4,   # Master
    5: 40,  # Noble male
    6: 50,  # Professional
    7: 40,  # Noble female
}

def infer_age(row):
    '''
    Infers the age for nan values

    Parameters
    ----------
    row : mapping or pandas Series
        Must provide 'age' and 'title' entries, where 'title' is the
        numeric title code.

    Returns
    -------
    The recorded age when it is present; otherwise the default age for
    the row's title code. If the age is missing and the title code is not
    one of 1-7, the missing value is returned unchanged (rather than
    None) so the column stays numeric.
    '''
    age = row['age']
    if not pd.isnull(age):
        return age
    # Unknown title codes fall back to the original missing value
    return _DEFAULT_AGE_BY_TITLE.get(row['title'], age)

# Replace the age column with the row-wise result of infer_age
df = df.assign(age=df.apply(infer_age, axis=1))

# One hot encoding sex and title

# Indicator columns are named sex_<value> / title_<value>
df_sex = pd.get_dummies(df['sex']).add_prefix('sex_')
df_title = pd.get_dummies(df['title']).add_prefix('title_')

# Append the indicator columns, then drop the columns they were built from
df = pd.concat([df, df_sex, df_title], axis=1).drop(columns=['sex', 'title'])
df.head()

Unnamed: 0,passengerid,survived,pclass,name,age,sibsp,parch,ticket,fare,embarked,family_size,sex_0,sex_1,title_1,title_2,title_3,title_4,title_5,title_6,title_7
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1.0,2,1,0,1,0,0,0,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,3.0,2,0,1,0,1,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1.0,1,0,1,0,0,1,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1.0,2,0,1,0,1,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1.0,1,1,0,1,0,0,0,0,0,0


In [6]:
# Tickets

# Only 27% of fares are unique.
# Some ticket values are also grouped.
# A fare value tends to identify a homogeneous group, some of which may share
# an ethnicity judging by their surnames.
# Language may have been a barrier to lower-class passengers being directed to
# and getting aboard lifeboats.
# There could be potential in treating fare as a categorical variable and
# clustering it to reduce the dimensions.

# Getting the true cost of individual tickets

# The ticket number is the last whitespace-separated token of the ticket
# string. The .str accessor propagates missing tickets as NaN instead of
# raising on them.
df['ticket_number'] = df['ticket'].str.split().str[-1]

# Number of passengers (non-null passengerid values) per ticket number
passengers_per_ticket = df.groupby('ticket_number')['passengerid'].count()
tb_ticket_counts = (
    passengers_per_ticket
    .rename('passengers_per_ticket')
    .reset_index()
)

# Attach the count to every passenger row by looking up its ticket number.
# Assigning a column (rather than rebinding df through a merge with implicit
# join keys) keeps the row order and index, and makes this cell safe to re-run.
df['passengers_per_ticket'] = df['ticket_number'].map(passengers_per_ticket)

In [7]:
# Fares

# true_fare: the fare divided by the number of passengers sharing the
# ticket number, rounded to 2 decimal places
df['true_fare'] = (df['fare'] / df['passengers_per_ticket']).round(2)

# Scaling true_fare
scaler = MinMaxScaler()
# Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
fare_array = df['true_fare'].to_numpy()
# The scaler expects a 2-D input, hence the reshape to a single column
fare_scaled = scaler.fit_transform(fare_array.reshape(-1, 1))
# Flatten the (n, 1) result so it is stored as an ordinary 1-D column
df['true_fare_scaled'] = fare_scaled.ravel()

In [None]:
# Names Analysis

# Evidence suggests that non-English speakers may have had a lower chance of survival:
    # Many were in 3rd class
    # Unable to understand instructions to muster & leave
    # One of the crew removed a group of foreigners from a lifeboat at gunpoint
    # May have been unable to convince crew to let them in a lifeboat
    
    


In [26]:
# Columns fed to the models: class, age, family size, and the one-hot
# encoded sex and title indicators (title codes 1 through 7)
feature_columns = ['pclass', 'age', 'family_size', 'sex_0', 'sex_1']
feature_columns += ['title_{}'.format(code) for code in range(1, 8)]

features = df[feature_columns]
label = df[['survived']]

test_models(features, label)

Logistic Regression Performance: 83.32%
SVC Performance: 82.76%
Linear SVC Performance: 80.14%
Naive Bayes Performance: 80.84%
Random Forest Performance: 80.26%


In [28]:
# Write the engineered frame to disk; the row index is included as the
# first (unnamed) column, which is the to_csv default
df.to_csv(outfile, index=True)