Import packages


In [None]:
import pandas as pd
import numpy as np
import math
import re

import seaborn as sns
import matplotlib.pyplot as plt


import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Import data.

In [None]:
# Directory holding the Kaggle Titanic CSV files. Edit this single constant
# when running the notebook on another machine.
DATA_DIR = "C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/data"

# `train` is used to fit the model; `predict_set` is the set scored for the submission.
train = pd.read_csv(DATA_DIR + "/train.csv")
predict_set = pd.read_csv(DATA_DIR + "/test.csv")

# Keep both frames in one list so shared cleaning steps can loop over them.
# NOTE: the list references the frames as loaded here; rebinding `train` or
# `predict_set` later does not update the list.
dataset_cleaning = [train, predict_set]

Exploratory data analysis

Data cleaning and preprocessing:

1. Check for NaN values and decide the treatment e.g. removing the entries or imputing with meaningful values
2. Convert categorical data into numerical e.g. onehotencoder
3. Look for any abnormal data via a scatterplot matrix, which also provides a first impression of what the data looks like

In [None]:
# Print column names, dtypes and non-null counts to spot columns with missing values.
train.info()

In [None]:
#sns.pairplot(train.drop(['PassengerId', 'Name','Ticket'], axis=1), hue="Survived")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

In [None]:
# Sex and Embarked mapping

# Define recode_sex()
def recode_Sex(value):
    """Encode a sex label as a number.

    Returns 0 for 'female', 1 for 'male', and np.nan for any other value.
    """
    for label, code in (('female', 0), ('male', 1)):
        if value == label:
            return code
    # Anything that is not one of the two known labels is treated as missing.
    return np.nan
    
# Define recode_Embarked()
def recode_Embarked(value):
    """Expand a one-letter embarkation code into the port name.

    'C' -> 'Cherbourg', 'Q' -> 'Queenstown', 'S' -> 'Southampton'.
    Any other value yields np.nan.
    """
    port_names = (
        ('C', 'Cherbourg'),
        ('Q', 'Queenstown'),
        ('S', 'Southampton'),
    )
    for code, port in port_names:
        if value == code:
            return port
    # Unrecognised codes (including missing values) are treated as missing.
    return np.nan

# Recode 'Sex' to 0/1 and expand the 'Embarked' codes to port names in both frames.
for frame in dataset_cleaning:
    for column, recoder in (('Sex', recode_Sex), ('Embarked', recode_Embarked)):
        frame[column] = frame[column].apply(recoder)



In [None]:
# Show the training rows whose 'Embarked' value is missing after recoding.
train.loc[train['Embarked'].isnull()]

Features that contain NaN values include: Age, Cabin, Embarked

For Cabin, a binary indicator `With_Cabin` is created: 0 when Cabin is NaN (no cabin recorded for the passenger) and 1 otherwise

For Age, NaN value can be replaced by the mean value of the sub group based on sex and title (To be completed after further investigation in the dataset)

For Embarked, the two observations with NaN values share most of their feature values and differ only in Age and Name, which suggests an error when the data was entered. Removing these two entries should therefore be acceptable


In [None]:
# Flag whether a cabin is recorded: 1 when 'Cabin' has a value, 0 when it is missing.
# notna() is used instead of a `type(x) == float` check so that every
# missing-value marker pandas recognises is handled.
for dataset in dataset_cleaning:
    dataset['With_Cabin'] = dataset['Cabin'].notna().astype(int)

# Drop rows without an embarkation port, then rebuild a gap-free 0..n-1 index.
# Without the reset, later cells that mix positional (.iloc/.iat) and label
# (.loc) access with the same row number would address different rows.
train = train.dropna(subset=['Embarked']).reset_index(drop=True)
# NOTE(review): any row dropped here receives no prediction in the submission file.
predict_set = predict_set.dropna(subset=['Embarked']).reset_index(drop=True)

In [None]:
# One-hot encode the embarkation port; drop_first removes the first level so it
# acts as the baseline category.
embarked_train = pd.get_dummies(train['Embarked'], drop_first=True, prefix='Embarked')
train = pd.concat([train, embarked_train], axis=1)

embarked_predict = pd.get_dummies(predict_set['Embarked'], drop_first=True, prefix='Embarked')
predict_set = pd.concat([predict_set, embarked_predict], axis=1)

Extract and clean the titles from the passenger's name

In [None]:
# A title is the first run of letters immediately followed by a period (e.g. "Mr.").
# The raw string avoids the invalid "\." escape sequence of a normal string literal.
TITLE_PATTERN = r'([A-Za-z]+)\.'

# str.extract returns the first match per name and yields NaN (instead of
# raising) when a name contains no title.
train['Title'] = train['Name'].str.extract(TITLE_PATTERN, expand=False).astype('category')
predict_set['Title'] = predict_set['Name'].str.extract(TITLE_PATTERN, expand=False).astype('category')

def Frequency_table(data):
    """Count how often each value occurs in an iterable.

    Returns a dict mapping each distinct value to its number of occurrences,
    with keys in order of first appearance.
    """
    counts = {}
    for item in data:
        counts[item] = counts.get(item, 0) + 1
    return counts

# Display how many passengers in the training set carry each extracted title.
Frequency_table(train['Title'])



In [None]:
# Spelling variants that are folded into an existing English title.
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# Fixed, ordered title groups. Using the same category list for every frame
# guarantees identical dummy columns (Title_Miss, Title_Mr, Title_Mrs,
# Title_Other) with 'Master' as the dropped baseline, whatever titles occur.
TITLE_GROUPS = ['Master', 'Miss', 'Mr', 'Mrs', 'Other']


def clean_title(title):
    """Map a raw title onto one of TITLE_GROUPS.

    French/alternative spellings are converted to their English equivalent;
    every title that is not one of the main groups (rare titles, titles not
    seen in the training data, or a missing title) becomes 'Other'.
    """
    title = TITLE_SYNONYMS.get(title, title)
    return title if title in TITLE_GROUPS else 'Other'


def add_title_features(df):
    """Return a copy of `df` with 'Title_cleaned' and one-hot title columns added.

    Expects a 'Title' column. Columns created by a previous run of this cell
    are replaced rather than duplicated, so the cell can be re-run safely.
    """
    title_cleaned = (
        df['Title'].astype(str).map(clean_title)
        .astype(pd.CategoricalDtype(categories=TITLE_GROUPS))
    )
    title_dummies = pd.get_dummies(title_cleaned, drop_first=True, prefix='Title')
    stale = [c for c in ['Title_cleaned'] + list(title_dummies.columns) if c in df.columns]
    out = df.drop(columns=stale)
    out['Title_cleaned'] = title_cleaned
    return pd.concat([out, title_dummies], axis=1)


train = add_title_features(train)
predict_set = add_title_features(predict_set)

In [None]:
# Check dtypes and non-null counts of the prediction set after the title features were added.
predict_set.info()

Fill NaN of age with the mean age according to title group

In [None]:
# Title groups for which a mean age is computed; the order defines the order of age_mean_list.
AGE_TITLE_GROUPS = ['Master', 'Miss', 'Mr', 'Mrs', 'Other']

# Mean of the known ages per title group, computed from the training data.
age_mean_list = [
    train.loc[train['Title_cleaned'] == title, 'Age'].mean()
    for title in AGE_TITLE_GROUPS
]
# Individual names kept for any cell that still refers to them.
age_mean_0, age_mean_1, age_mean_2, age_mean_3, age_mean_4 = age_mean_list

# Fill missing ages with the mean of the passenger's title group.
# Boolean masks keep Age and Title_cleaned aligned on the same row regardless
# of the frame's index (rows were dropped earlier, so position != label), and
# the column is addressed by name instead of a hard-coded position.
for title, age_mean in zip(AGE_TITLE_GROUPS, age_mean_list):
    missing_age = train['Age'].isna() & (train['Title_cleaned'] == title)
    train.loc[missing_age, 'Age'] = age_mean

In [None]:
# Fill missing ages in the prediction set with the title-group means computed
# from the training data (age_mean_list is ordered Master, Miss, Mr, Mrs, Other).
# Boolean masks keep Age and Title_cleaned aligned on the same row regardless
# of the frame's index, and the column is addressed by name instead of a
# hard-coded position.
for title, age_mean in zip(['Master', 'Miss', 'Mr', 'Mrs', 'Other'], age_mean_list):
    missing_age = predict_set['Age'].isna() & (predict_set['Title_cleaned'] == title)
    predict_set.loc[missing_age, 'Age'] = age_mean



Drop duplicated entries, if any.

In [None]:
# Remove rows that are exact duplicates across all columns, then re-check
# the row count and non-null counts.
train=train.drop_duplicates()
train.info()

In [None]:
# Re-check the prediction set for remaining missing values after the age imputation.
predict_set.info()

In [None]:
# Fill the NaN values in predict_set['Fare'] with the column average.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form may act on a temporary copy and leave
# the frame unchanged (it does not update the frame under Copy-on-Write).
predict_set['Fare'] = predict_set['Fare'].fillna(predict_set['Fare'].mean())

In [None]:
# Preview the first three rows of the cleaned training frame.
train.head(3)

In [None]:
# Columns used as predictors in the logistic regression model.
feature = [
    # Original columns (Sex already recoded to 0/1, Age imputed)
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    # Cabin indicator
    'With_Cabin',
    # One-hot embarkation port
    'Embarked_Queenstown', 'Embarked_Southampton',
    # One-hot title group
    'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Other',
]


In [None]:
# Hyperparameter grid: 15 log-spaced values of the inverse regularisation
# strength C between 1e-5 and 1e8, each tried with L1 and L2 penalties.
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# liblinear is used because it supports both the 'l1' and 'l2' penalties.
logreg = LogisticRegression(solver='liblinear')

# Hold out 30% of the labelled data; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train[feature], train['Survived'], test_size=0.3, random_state=42
)

# 5-fold cross-validated grid search, fitted on the training part only.
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train, y_train)

# Report the best parameters and their mean cross-validated accuracy.
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

# Evaluate the refitted best model on the hold-out split, which was previously
# created but never used, to get an estimate not influenced by the tuning.
print("Hold-out Test Accuracy: {}".format(logreg_cv.score(X_test, y_test)))

In [None]:
# Predict survival for the prediction set and write the submission file.
submission_path = 'C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/Submission/logreg.csv'
predict_set['Survived'] = logreg_cv.predict(predict_set[feature])

# The submission keeps only the passenger id and the predicted label.
submission_columns = ['PassengerId', 'Survived']
predict_set[submission_columns].to_csv(submission_path, index=False)