In [2]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [3]:
train=pd.read_csv("C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/data/train.csv")
predict_set=pd.read_csv("C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/data/test.csv")

# Combine Dataset for cleaning
dataset_combined=[train,predict_set]


In [4]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
features_numerical=['Age','Fare']
features_ordinal=['SibSp','Parch']
features_categorical=['Pclass','Sex','Embarked']
features_other=['Name','Ticket','Cabin']


In [7]:
# Based on features_other, we create two new features:
# 1. Titles: 'Mr', 'Mrs', 'Miss', etc
# 2. With_Cabin: Whether the individual is staying in a cabin or not

train['Title']=train['Name'].apply(lambda x: re.search('([A-Za-z]+)\.', x).group(1))
train['Title']=train['Title'].astype('category')

predict_set['Title']=predict_set['Name'].apply(lambda x: re.search('([A-Za-z]+)\.', x).group(1))
predict_set['Title']=predict_set['Title'].astype('category')


# Duplicate the column of Title and named it as Title_cleaned
train['Title_cleaned']=train['Title']
# Converting French title to English title
train['Title_cleaned']=train['Title_cleaned'].replace(['Mlle','Ms'],'Miss')
train['Title_cleaned']=train['Title_cleaned'].replace(['Mme'],'Mrs')
# Group all other title as 'Other'
train['Title_cleaned']=train['Title_cleaned'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'],'Other')
    
# Duplicate the column of Title and named it as Title_cleaned
predict_set['Title_cleaned']=predict_set['Title']
# Converting French title to English title
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Mlle','Ms'],'Miss')
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Mme'],'Mrs')
# Group all other title as 'Other'
predict_set['Title_cleaned']=predict_set['Title_cleaned'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'],'Other')

#
for dataset in dataset_combined:
    dataset['With_Cabin']=dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)


In [8]:
# Update categorical features list

features_categorical=['Pclass','Sex','Embarked','Title_cleaned','With_Cabin']

In [9]:
# For numerical features, check for nan values and outlier
for dataset in dataset_combined:
    print(dataset[features_numerical].info())
    print(" ")
    print(dataset[features_numerical].describe())
    print(" ")

# With the info and describe, Age contains nan in both training set and test set while Fare contains 1 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
Age     714 non-null float64
Fare    891 non-null float64
dtypes: float64(2)
memory usage: 14.0 KB
None
 
              Age        Fare
count  714.000000  891.000000
mean    29.699118   32.204208
std     14.526497   49.693429
min      0.420000    0.000000
25%     20.125000    7.910400
50%     28.000000   14.454200
75%     38.000000   31.000000
max     80.000000  512.329200
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
Age     332 non-null float64
Fare    417 non-null float64
dtypes: float64(2)
memory usage: 6.6 KB
None
 
              Age        Fare
count  332.000000  417.000000
mean    30.272590   35.627188
std     14.181209   55.907576
min      0.170000    0.000000
25%     21.000000    7.895800
50%     27.000000   14.454200
75%     39.000000   31.500000
max     76.000000  512.329200
 


In [10]:
# Define frequency function to record the counts for each ordinal and categorical variables

def Frequency_table(data):
    frequencytable = {}
    for key in data:
        if key in frequencytable:
            frequencytable[key] += 1
        else:
            frequencytable[key] = 1
    return frequencytable


In [11]:
# For ordinal features, check for nan values and errors in the entries, if any

for i in features_ordinal:
    print(Frequency_table(train[i]))
    
for i in features_ordinal:
    print(Frequency_table(predict_set[i]))
    
train[features_ordinal]=train[features_ordinal].astype(float)
predict_set[features_ordinal]=predict_set[features_ordinal].astype(float)


{1: 209, 0: 608, 3: 16, 4: 18, 2: 28, 5: 5, 8: 7}
{0: 678, 1: 118, 2: 80, 5: 5, 3: 5, 4: 4, 6: 1}
{0: 283, 1: 110, 2: 14, 3: 4, 4: 4, 5: 1, 8: 2}
{0: 324, 1: 52, 3: 3, 2: 33, 4: 2, 6: 1, 5: 1, 9: 2}


In [12]:
# For categorical features, check for nan values and errors in the entries, if any

for i in features_categorical:
    print(Frequency_table(train[i]))
    
for i in features_categorical:
    print(Frequency_table(predict_set[i]))
#Based on the result from the frequency table, only the 2 nan values from 'Embarked' column from the test data set have to be taken care of

{3: 491, 1: 216, 2: 184}
{'male': 577, 'female': 314}
{'S': 644, 'C': 168, 'Q': 77, nan: 2}
{'Mr': 517, 'Mrs': 126, 'Miss': 185, 'Master': 40, 'Other': 23}
{0: 687, 1: 204}
{3: 218, 2: 93, 1: 107}
{'male': 266, 'female': 152}
{'Q': 46, 'S': 270, 'C': 102}
{'Mr': 240, 'Mrs': 72, 'Miss': 79, 'Master': 21, 'Other': 5, 'Dona': 1}
{0: 327, 1: 91}


In [13]:
# Our strategy is to create a pipeline that does the followings:
# For numerical features, we impute the nan values with the median of the corresponding features and the features are scaled properly before feeding into the model
# For ordinal features, we scale the feature
# For categorical features, impute nan values with the mode and encode the lables before feeding into the model

In [14]:
# We create the preprocessing pipelines for all three types of data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features_numerical),
        ('ordin', ordinal_transformer, features_ordinal),
        ('cat', categorical_transformer, features_categorical)])

# Define the pipeline with logistic classifier
logreg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='liblinear'))])


In [15]:
# Create the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'classifier__C': c_space, 'classifier__penalty': ['l1', 'l2']}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(train,train['Survived'],test_size=0.3,random_state=42)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg,param_grid,cv=5)

# Fit it to the training data
logreg_cv.fit(X_train,y_train)

# Print the optimal parameters and best score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameter: {'classifier__C': 3.727593720314938, 'classifier__penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.8362760834670947


In [16]:
# Make prediction for sumbission 
predict_set['Survived']=logreg_cv.predict(predict_set)
predict_set[['PassengerId', 'Survived']].to_csv('C:/Users/kpleu/Desktop/Git/Kaggle_Titanic/Submission/logreg1.csv', index=False)