Import packages


In [1]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt


import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Import data.

In [2]:
train=pd.read_csv("C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/data/train.csv")
predict_set=pd.read_csv("C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/data/test.csv")

# Combine Dataset for cleaning
dataset_cleaning=[train,predict_set]

Exploratory data analysis

Data cleaning and preprocessing:

1. Check for NaN values and decide the treatment e.g. removing the entries or imputing with meaningful values
2. Convert categorical data into numerical e.g. onehotencoder
3. Look for any abnormal data via a scatterplot matrix which also provide a very first impression of how the data looks like

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
#sns.pairplot(train.drop(['PassengerId', 'Name','Ticket'], axis=1), hue="Survived")

In [5]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Sex and Embarked mapping

# Define recode_sex()
def recode_Sex(value):
    # Return 0 if value is 'female'
    if value == 'female':
        return 0   
    # Return 1 if value is 'male'    
    elif value == 'male':
        return 1    
    # Return np.nan    
    else:
        return np.nan
    
# Define recode_Embarked()
def recode_Embarked(value):
    # Return 0 if value is 'C = Cherbourg'
    if value == 'C':
        return 'Cherbourg'   
    # Return 1 if value is 'Q = Queenstown'    
    elif value == 'Q':
        return 'Queenstown'
    # Return 2 if value is 'S = Southampton'    
    elif value == 'S':
        return 'Southampton'  
    # Return np.nan    
    else:
        return np.nan

for dataset in dataset_cleaning:
    dataset['Sex']=dataset['Sex'].apply(recode_Sex)
    dataset['Embarked']=dataset['Embarked'].apply(recode_Embarked)



In [7]:
train.loc[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",0,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",0,62.0,0,0,113572,80.0,B28,


Features that contain NaN values include: Age, Cabin, Embarked

For Cabin, NaN should be replaced by 0 to indicate that the passenger was travelling without a cabin

For Age, NaN value can be replaced by the mean value of the sub group based on sex and title (To be completed after further investigation in the dataset)

For Embarked, since the observations that are related to the 2 NaN values have most of the features repeated except for Age and Name, it is believed that there might be errors involved when entering the data. As such, removing the two particular entries should be acceptable


In [8]:
for dataset in dataset_cleaning:
    dataset['With_Cabin']=dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

train=train.dropna(subset=['Embarked'])
predict_set=predict_set.dropna(subset=['Embarked'])

In [9]:
train = pd.concat([train, pd.get_dummies(train['Embarked'],drop_first=True,prefix='Embarked')], axis=1)
predict_set = pd.concat([predict_set, pd.get_dummies(predict_set['Embarked'],drop_first=True,prefix='Embarked')], axis=1)

Extract and clean the titles from the passenger's name

In [10]:
train['Title']=train['Name'].apply(lambda x: re.search('([A-Za-z]+)\.', x).group(1))
train['Title']=train['Title'].astype('category')

predict_set['Title']=predict_set['Name'].apply(lambda x: re.search('([A-Za-z]+)\.', x).group(1))
predict_set['Title']=predict_set['Title'].astype('category')

def Frequency_table(data):
    frequencytable = {}
    for key in data:
        if key in frequencytable:
            frequencytable[key] += 1
        else:
            frequencytable[key] = 1
    return frequencytable

Frequency_table(train['Title'])



{'Mr': 517,
 'Mrs': 124,
 'Miss': 181,
 'Master': 40,
 'Don': 1,
 'Rev': 6,
 'Dr': 7,
 'Mme': 1,
 'Ms': 1,
 'Major': 2,
 'Lady': 1,
 'Sir': 1,
 'Mlle': 2,
 'Col': 2,
 'Capt': 1,
 'Countess': 1,
 'Jonkheer': 1}

In [11]:
# Duplicate the column of Title and named it as Title_cleaned
train['Title_cleaned']=train['Title']
# Converting French title to English title
train['Title_cleaned']=train['Title_cleaned'].replace(['Mlle','Ms'],'Miss')
train['Title_cleaned']=train['Title_cleaned'].replace(['Mme'],'Mrs')
# Group all other title as 'Other'
train['Title_cleaned']=train['Title_cleaned'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'],'Other')
    
# Duplicate the column of Title and named it as Title_cleaned
predict_set['Title_cleaned']=predict_set['Title']
# Converting French title to English title
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Mlle','Ms'],'Miss')
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Mme'],'Mrs')
# Group all other title as 'Other'
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'],'Other')

train = pd.concat([train, pd.get_dummies(train['Title_cleaned'],drop_first=True,prefix='Title')], axis=1)
predict_set = pd.concat([predict_set, pd.get_dummies(predict_set['Title_cleaned'],drop_first=True,prefix='Title')], axis=1)

In [12]:
predict_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 21 columns):
PassengerId             418 non-null int64
Pclass                  418 non-null int64
Name                    418 non-null object
Sex                     418 non-null int64
Age                     332 non-null float64
SibSp                   418 non-null int64
Parch                   418 non-null int64
Ticket                  418 non-null object
Fare                    417 non-null float64
Cabin                   91 non-null object
Embarked                418 non-null object
With_Cabin              418 non-null int64
Embarked_Queenstown     418 non-null uint8
Embarked_Southampton    418 non-null uint8
Title                   418 non-null category
Title_cleaned           418 non-null object
Title_Master            418 non-null uint8
Title_Miss              418 non-null uint8
Title_Mr                418 non-null uint8
Title_Mrs               418 non-null uint8
Title_Other             

Fill NaN of age with the mean age according to title group

In [13]:
age_mean_0=train[train['Title_cleaned']=='Master']['Age'].mean()
age_mean_1=train[train['Title_cleaned']=='Miss']['Age'].mean()
age_mean_2=train[train['Title_cleaned']=='Mr']['Age'].mean()
age_mean_3=train[train['Title_cleaned']=='Mrs']['Age'].mean()
age_mean_4=train[train['Title_cleaned']=='Other']['Age'].mean()

age_mean_list=[age_mean_0,age_mean_1,age_mean_2,age_mean_3,age_mean_4]


for row in range(0,len(train.index)):
    if np.isnan(train.iloc[row]['Age']):
        if train.loc[row,'Title_cleaned'] == 'Master':
             train.iat[row,5]=age_mean_list[0]
        elif train.loc[row,'Title_cleaned'] == 'Miss':
             train.iat[row,5]=age_mean_list[1]
        elif train.loc[row,'Title_cleaned'] == 'Mr':
             train.iat[row,5]=age_mean_list[2]
        elif train.loc[row,'Title_cleaned'] == 'Mrs':
             train.iat[row,5]=age_mean_list[3]
        elif train.loc[row,'Title_cleaned'] == 'Other':
             train.iat[row,5]=age_mean_list[4]

In [14]:
for row in range(0,len(predict_set.index)):
    if np.isnan(predict_set.iloc[row]['Age']):
        if predict_set.loc[row,'Title_cleaned'] == 'Master':
            predict_set.iat[row,4]=age_mean_list[0]
        elif predict_set.loc[row,'Title_cleaned'] == 'Miss':
            predict_set.iat[row,4]=age_mean_list[1]
        elif predict_set.loc[row,'Title_cleaned'] == 'Mr':
            predict_set.iat[row,4]=age_mean_list[2]
        elif predict_set.loc[row,'Title_cleaned'] == 'Mrs':
            predict_set.iat[row,4]=age_mean_list[3]
        elif predict_set.loc[row,'Title_cleaned'] == 'Other':
            predict_set.iat[row,4]=age_mean_list[4]



Drop duplicated entries, if any.

In [15]:
train=train.drop_duplicates()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 21 columns):
PassengerId             889 non-null int64
Survived                889 non-null int64
Pclass                  889 non-null int64
Name                    889 non-null object
Sex                     889 non-null int64
Age                     889 non-null float64
SibSp                   889 non-null int64
Parch                   889 non-null int64
Ticket                  889 non-null object
Fare                    889 non-null float64
Cabin                   202 non-null object
Embarked                889 non-null object
With_Cabin              889 non-null int64
Embarked_Queenstown     889 non-null uint8
Embarked_Southampton    889 non-null uint8
Title                   889 non-null category
Title_cleaned           889 non-null object
Title_Miss              889 non-null uint8
Title_Mr                889 non-null uint8
Title_Mrs               889 non-null uint8
Title_Other            

In [16]:
predict_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 21 columns):
PassengerId             418 non-null int64
Pclass                  418 non-null int64
Name                    418 non-null object
Sex                     418 non-null int64
Age                     418 non-null float64
SibSp                   418 non-null int64
Parch                   418 non-null int64
Ticket                  418 non-null object
Fare                    417 non-null float64
Cabin                   91 non-null object
Embarked                418 non-null object
With_Cabin              418 non-null int64
Embarked_Queenstown     418 non-null uint8
Embarked_Southampton    418 non-null uint8
Title                   418 non-null category
Title_cleaned           418 non-null object
Title_Master            418 non-null uint8
Title_Miss              418 non-null uint8
Title_Mr                418 non-null uint8
Title_Mrs               418 non-null uint8
Title_Other             

In [17]:
# Fill the NAN values in predict_set['Fare'] with the average
predict_set['Fare'].fillna(predict_set['Fare'].mean(), inplace = True)

In [18]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,With_Cabin,Embarked_Queenstown,Embarked_Southampton,Title,Title_cleaned,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,...,Southampton,0,0,1,Mr,Mr,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,...,Cherbourg,1,0,0,Mrs,Mrs,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,...,Southampton,0,0,1,Miss,Miss,1,0,0,0


In [19]:
#Select features to be included in the logistic model

feature=['Pclass','Sex','Age','SibSp','Parch','Fare','With_Cabin','Embarked_Queenstown','Embarked_Southampton','Title_Miss','Title_Mr','Title_Mrs','Title_Other']


In [20]:
# Create the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg
logreg = LogisticRegression(solver='liblinear')

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(train[feature],train['Survived'],test_size=0.3,random_state=42)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg,param_grid,cv=5)

# Fit it to the training data
logreg_cv.fit(X_train,y_train)

# Print the optimal parameters and best score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameter: {'C': 31.622776601683793, 'penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.8263665594855305


In [None]:
# Make prediction for sumbission 
predict_set['Survived']=logreg_cv.predict(predict_set[feature])
predict_set[['PassengerId', 'Survived']].to_csv('C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/Submission/logreg.csv', index=False)