In [18]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore')

# step1: Read data

In [19]:
train_src = pd.read_csv('data/titanic_data/train.csv')
train_src['Data_type'] = 'train'
pred_src = pd.read_csv('data/titanic_data/test.csv')
pred_src['Data_type'] = 'pred'
combine = pd.concat([train_src, pred_src])

# step2: Data preparation

## step2.1：Handling of missing values

In [20]:
## Fill in the missing age values with the mean value
age_mean = round(train_src['Age'].mean())
combine['Age'].fillna(age_mean, inplace= True)

# Mean replacement of 'Fare'
combine['Fare'].fillna(round(train_src['Fare'].mean()), inplace = True)

## step2.2: Data Construction

In [21]:
# Extract titles from names
# Use a function
def get_title(name):
    title_search = re.search(', ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return np.nan

# Create a new variable named Title
combine['Title'] = combine['Name'].apply(get_title)
# Check the value of title
combine['Title'].value_counts(dropna = False)

# Similar titles are grouped together, and those that appear less frequently are classified as 'Rare'
combine['Title'] = combine['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
combine['Title'] = combine['Title'].replace('Mlle', 'Miss')
combine['Title'] = combine['Title'].replace('Ms', 'Miss')
combine['Title'] = combine['Title'].replace('Mme', 'Mrs')

# Check
combine['Title'].value_counts(dropna = False)

Title
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Major         2
Mlle          2
Ms            2
Mme           1
Don           1
Sir           1
Lady          1
Capt          1
NaN           1
Jonkheer      1
Dona          1
Name: count, dtype: int64

Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       28
NaN         1
Name: count, dtype: int64

In [22]:
# 'SibSp' and 'Parch' are added together as 'family size'
combine['family_size'] = combine['SibSp'] + combine['Parch'] + 1
combine['family_size'].value_counts()

family_size
1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: count, dtype: int64

In [23]:
# Determine whether a passenger is a 'mother' based on gender, age, and the number of children.
combine['is_mother'] = 0
combine.loc[(combine['Sex'] == 'female') & (combine['Parch'] > 0) & (combine['Age'] > 20), 'is_mother'] = 1
combine['is_mother'].value_counts()

is_mother
0    1198
1     111
Name: count, dtype: int64

## step2.3: Data conversion

In [24]:
# One-hot encode the 'Sex' variable
sex_onehot = pd.get_dummies(combine['Sex'], drop_first = False, prefix = 'onehot', dtype = int)
sex_onehot

Unnamed: 0,onehot_female,onehot_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
413,0,1
414,1,0
415,0,1
416,0,1


In [25]:
# Perform the same operation on 'Age', but first discretize 'Age'
combine['Age_group'] = np.nan
combine.loc[combine['Age'] <= 16, 'Age_group'] = 'Age_0_16'
combine.loc[(combine['Age'] > 16) & (combine['Age'] <= 32), 'Age_group'] = 'Age_16_32'
combine.loc[(combine['Age'] > 32) & (combine['Age'] <= 48), 'Age_group'] = 'Age_32_48'
combine.loc[(combine['Age'] > 48) & (combine['Age'] <= 64), 'Age_group'] = 'Age_48_64'
combine.loc[combine['Age'] > 64, 'Age_group'] = 'Age_64_'
age_group_onehot = pd.get_dummies(combine['Age_group'], drop_first = False, prefix = 'onehot', dtype = int)
age_group_onehot.head(10)

Unnamed: 0,onehot_Age_0_16,onehot_Age_16_32,onehot_Age_32_48,onehot_Age_48_64,onehot_Age_64_
0,0,1,0,0,0
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,1,0,0
5,0,1,0,0,0
6,0,0,0,1,0
7,1,0,0,0,0
8,0,1,0,0,0
9,1,0,0,0,0


In [26]:
# title
title_onehot = pd.get_dummies(combine['Title'], drop_first = False, prefix = 'onehot', dtype = int)
title_onehot.head()

Unnamed: 0,onehot_Master,onehot_Miss,onehot_Mr,onehot_Mrs,onehot_Rare
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,1,0,0


In [27]:
# Pclass
Pclass_onehot = pd.get_dummies(combine['Pclass'], drop_first = False, prefix = 'onehot_pclass', dtype = int)
Pclass_onehot.head()

Unnamed: 0,onehot_pclass_1,onehot_pclass_2,onehot_pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [28]:
# All variables merged
combine = pd.concat([combine, sex_onehot, age_group_onehot, title_onehot, Pclass_onehot], axis = 1)
combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,onehot_Age_48_64,onehot_Age_64_,onehot_Master,onehot_Miss,onehot_Mr,onehot_Mrs,onehot_Rare,onehot_pclass_1,onehot_pclass_2,onehot_pclass_3
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,1,0,0,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,0,0,1,0,1,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,0,1,0,0,0,0,0,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,0,0,0,0,1,0,1,0,0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,0,0,1,0,0,0,0,1


# step3: Training models

In [29]:
# Prepare datasets X and y
train_X = combine.loc[combine['Data_type'] == 'train', ['SibSp', 'Parch', 'Fare',
                        'family_size', 'is_mother', 'onehot_male', 'onehot_female', 
                        'onehot_Age_0_16', 'onehot_Age_16_32', 'onehot_Age_32_48', 'onehot_Age_48_64', 'onehot_Age_64_',
                        'onehot_Master', 'onehot_Miss', 'onehot_Mr', 'onehot_Mrs', 'onehot_Rare',
                        'onehot_pclass_1', 'onehot_pclass_2', 'onehot_pclass_3']]

train_y = combine.loc[combine['Data_type'] == 'train', 'Survived']

train_X.head(10)

Unnamed: 0,SibSp,Parch,Fare,family_size,is_mother,onehot_male,onehot_female,onehot_Age_0_16,onehot_Age_16_32,onehot_Age_32_48,onehot_Age_48_64,onehot_Age_64_,onehot_Master,onehot_Miss,onehot_Mr,onehot_Mrs,onehot_Rare,onehot_pclass_1,onehot_pclass_2,onehot_pclass_3
0,1,0,7.25,2,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
1,1,0,71.2833,2,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0
2,0,0,7.925,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1
3,1,0,53.1,2,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0
4,0,0,8.05,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1
5,0,0,8.4583,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
6,0,0,51.8625,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
7,3,1,21.075,5,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1
8,0,2,11.1333,3,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1
9,1,0,30.0708,2,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0


In [30]:
# split
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_X, train_y, test_size = 0.3, random_state = 42)

In [31]:
# LR
lr = LogisticRegression()
lr.fit(X_train, y_train)

print("train accuracy: %f" %(lr.score(X_train, y_train)))
print("test accuracy: %f" %(lr.score(X_test, y_test)))

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


train accuracy: 0.834671
test accuracy: 0.820896


In [32]:
# Check variable weights
pd.DataFrame(list(zip(np.transpose(lr.coef_), train_X.columns)), columns = ['coef', 'columns'])

Unnamed: 0,coef,columns
0,[-0.28345630129197547],SibSp
1,[-0.21672929733538743],Parch
2,[0.0072454932826704985],Fare
3,[-0.20688492279829426],family_size
4,[0.1770583402043277],is_mother
5,[-0.6480136337662938],onehot_male
6,[0.9413143095954052],onehot_female
7,[0.8485194475716158],onehot_Age_0_16
8,[0.04253885537747692],onehot_Age_16_32
9,[-0.13996233891593932],onehot_Age_32_48


In [33]:
# SVM
svc = SVC()
svc.fit(X_train, y_train)
svc.score(X_train, y_train)
svc.score(X_test, y_test)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


0.6821829855537721

0.6567164179104478

In [34]:
# KNeighbors
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
knn.score(X_test, y_test)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


0.8667736757624398

0.7835820895522388

In [35]:
# DecisionTree
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtree.score(X_train, y_train)
dtree.score(X_test, y_test)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


0.9502407704654896

0.8171641791044776

In [36]:
# random forest
random_forest = RandomForestClassifier(n_estimators = 10)
random_forest.fit(X_train, y_train)
random_forest.score(X_train, y_train)
random_forest.score(X_test, y_test)

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


0.9373996789727127

0.8059701492537313

# step4：Prediction on new data

In [37]:
# It has been concatenated before, so just filter out the required variables directly
pred_X = combine.loc[combine['Data_type'] == 'pred', ['SibSp', 'Parch', 'Fare',
                        'family_size', 'is_mother', 'onehot_male', 'onehot_female', 
                        'onehot_Age_0_16', 'onehot_Age_16_32', 'onehot_Age_32_48', 'onehot_Age_48_64', 'onehot_Age_64_',
                        'onehot_Master', 'onehot_Miss', 'onehot_Mr', 'onehot_Mrs', 'onehot_Rare',
                        'onehot_pclass_1', 'onehot_pclass_2', 'onehot_pclass_3']]

pred_id = combine.loc[combine['Data_type'] == 'pred', 'PassengerId']

In [39]:
# Test data prediction
pred_y_pre = lr.predict(pred_X)
test_pred = pd.DataFrame({'PassengerId' : pred_id,
                          'Survived' : pred_y_pre.astype(int)})
test_pred.to_csv('data/titanic_data/test_pred_2.csv', index = False)