# Titanic Dataset

In [646]:
# Import statements
import pandas as pd
import numpy as np

In [766]:
# Loading the train CSV
train = pd.read_csv('./train.csv')
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [684]:
train.shape

(891, 12)

In [685]:
# Checking out the null values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [686]:
# Checking out the unique values in each column
train.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [687]:
train_base = initial_cleaner(train)
train_base.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6608,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Baseline

In [688]:
# Baseline
print('Baseline: ', train['Survived'].value_counts(normalize=True)[0])

Baseline:  0.6161616161616161


## Baseline Logictic Regression

In [689]:
# For a baseline I will drop the Name and Cabin columns, drop the nan's and get dummies ready for modelling
def initial_cleaner(df):
    df = df.drop(['PassengerId', 'Name', 'Cabin'], axis=1)
    df = df.dropna()
    df = pd.get_dummies(df)
    return df

In [690]:
train_base = initial_cleaner(train)
train_base.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6608,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [691]:
# Running a Logistic Regression for a baseline model estimate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

y = train_base['Survived'].values
X = StandardScaler().fit_transform(train_base.drop(['Survived'], axis=1))

lr = LogisticRegression(solver='lbfgs')

scores = cross_val_score(lr, X, y, cv=5)
print('R2 scores:', scores)
print('Mean R2 score:', scores.mean())

R2 scores: [0.81118881 0.82517483 0.8041958  0.80985915 0.83687943]
Mean R2 score: 0.8174596056226262


## Feature Engineering

In [727]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### Name and Titles

In [767]:
def name_cleaner(df):
    
    """Code for cleaning the Name column"""
    names = df['Name'].str.split(", ", n=1, expand=True)
    names.columns = ['Last Name', 'First Name'] 
    titles = names['First Name'].str.split(". ", n=1, expand=True)
    titles.columns = ['Title', 'First Name'] 
    middle = titles['First Name'].str.split(" ", n=1, expand=True)
    middle.columns = ['First Name', 'Other Name']
    df['Title'] = titles['Title']
    df['First Name'] = middle['First Name']
    df['Other Name'] = middle['Other Name']
    df['Last Name'] = names['Last Name']
    df.drop('Name', axis=1, inplace=True)
    df['Other Name'].fillna('Na', inplace=True)
    
    """Code for cleaning the Title column"""
    titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Rev', 'Mlle', 'Col', 'Major']
    df['Title'] = df['Title'].apply(lambda x: x if x in titles else 'Other')
    
    return df

In [768]:
train = name_cleaner(train)
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen


### Status

In [769]:
def status_creator(df):
    
    """Code for creating a Status column"""
    df['Status'] = 'unknown'    
    single = df[(df['SibSp'] == 0) & (df['Parch'] == 0)].index.tolist()
    df['Status'].loc[single] = 'single'    
    with_sibsp_without_parch = df[(df['SibSp'] == 0) & (df['Parch'] != 0)].index.tolist()
    df['Status'].loc[with_sibsp_without_parch] = 'with_sibsp_without_parch'    
    with_parch_without_sibsp = df[(train['SibSp'] != 0) & (df['Parch'] == 0)].index.tolist()
    df['Status'].loc[with_parch_without_sibsp] = 'with_parch_without_sibsp'    
    families = df[(df['SibSp'] != 0) & (df['Parch'] != 0)].index.tolist()
    df['Status'].loc[families] = 'families'
    
    return df  

In [770]:
train = status_creator(train)
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name,Status
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund,with_parch_without_sibsp
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings,with_parch_without_sibsp
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen,single


### Sex

In [771]:
def sex_changer(df):
    
    """Code of binerising the Sex column"""
    df['Sex'] = df['Sex'].astype(str).apply(lambda x: 1 if x == 'male' else 0)
    
    return df

In [772]:
train = sex_changer(train)
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name,Status
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund,with_parch_without_sibsp
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings,with_parch_without_sibsp
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen,single


### Ticket and Fare

In [776]:
def ticket_checker(df):
    
    """Code for filling in the Ticket column"""
    train['Ticket'].fillna('Na', inplace=True)
    
    return df

In [777]:
train = ticket_checker(train)
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name,Status,Cabin Letter,Cabin No
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund,with_parch_without_sibsp,Na,Na
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings,with_parch_without_sibsp,C,85
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen,single,Na,Na


### Cabins

In [773]:
def cabin_counter(df):
    
    """Code for extracting info from the Cabin column"""
    df['Cabin Letter'] = df['Cabin'].str[:1]
    df['Cabin Letter'].fillna('Na', inplace=True)
    df['Cabin No'] = df['Cabin'].str[1:3]
    df['Cabin No'].fillna('Na', inplace=True)
    
    return df

In [774]:
train = cabin_counter(train)
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name,Status,Cabin Letter,Cabin No
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund,with_parch_without_sibsp,Na,Na
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings,with_parch_without_sibsp,C,85
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen,single,Na,Na


## Titanic Cleaner

In [807]:
def titanic_cleaner(df):
    
    """Amalgamating the Cleaners"""
    df = name_cleaner(df)
    df = status_creator(df)
    df = sex_changer(df)
    df = ticket_checker(df)
    df = cabin_counter(df)
    
    return df

In [818]:
train = pd.read_csv('./train.csv')

In [819]:
train = titanic_cleaner(train)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,First Name,Other Name,Last Name,Status,Cabin Letter,Cabin No
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,,S,Mr,Owen,Harris,Braund,with_parch_without_sibsp,Na,Na
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,John,Bradley (Florence Briggs Thayer),Cumings,with_parch_without_sibsp,C,85
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Laina,Na,Heikkinen,single,Na,Na
3,4,1,1,0,35.0,1,0,113803,53.1,C123,S,Mrs,Jacques,Heath (Lily May Peel),Futrelle,with_parch_without_sibsp,C,12
4,5,0,3,1,35.0,0,0,373450,8.05,,S,Mr,William,Henry,Allen,single,Na,Na


## Preprocessing

In [820]:
def titanic_preprocessor(df):
    
    """Removing Nans and Getting Dummies"""
    df.drop(['PassengerId', 'Cabin', 'Status'], axis=1, inplace=True)
    df.dropna(inplace=True)
    df = pd.get_dummies(train)
    
    return df

In [821]:
train = titanic_preprocessor(train)
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Ticket_110152,Ticket_110413,Ticket_110465,...,Cabin No_87,Cabin No_9,Cabin No_90,Cabin No_91,Cabin No_92,Cabin No_93,Cabin No_94,Cabin No_96,Cabin No_99,Cabin No_Na
0,0,3,1,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,0,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,0,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Modelling Logistic Regression

In [816]:
# Running Logitic Regression to see if the feature engineering made any improvments 
y = train['Survived'].values
X = StandardScaler().fit_transform(train)

lr = LogisticRegression(solver='lbfgs')

lr_scores = cross_val_score(lr, X, y, cv=5)
print('R2 scores:', lr_scores)
print('Mean R2 score:', lr_scores.mean())

R2 scores: [0.9020979  0.8951049  0.9020979  0.87323944 0.91489362]
Mean R2 score: 0.8974867505883388


## Modelling Random Forest

In [827]:
# Trying Random Forest
from sklearn.ensemble import RandomForestClassifier

y = train['Survived'].values
X = StandardScaler().fit_transform(train)

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X, y)

rfc_scores = cross_val_score(rfc, X, y, cv=5)
print('R2 scores:', rfc_scores)
print('Mean R2 score:', rfc_scores.mean())

R2 scores: [1.         0.97202797 0.99300699 0.99295775 1.        ]
Mean R2 score: 0.9915985423027676
