In [1]:
import pandas as pd
import numpy as np

import re # to extract text from string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

  from numpy.core.umath_tests import inner1d


In [2]:
#Read the csv
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#data Types in Numerical and categorical variable
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# let's drop non-important columns:
# name and ticket have too many different categories, and Passenger Id is a unique key, non informative

data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [5]:
# let's inspect the dataset
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [6]:
#Missing data

data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [7]:
# Embarked and Cabin have missing date and are categorical
# let's fill NA values of categorical variables with an additional label

data.Cabin.fillna('Missing', inplace=True)
data.Embarked.fillna('Missing', inplace=True)

In [8]:
# Now we extract the first letter of the cabin

data['Cabin'] = data['Cabin'].astype(str).str[0]
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,M,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,M,S
3,1,1,female,35.0,1,0,53.1,C,S
4,0,3,male,35.0,0,0,8.05,M,S


In [9]:
# Let's divide into train and test set

X_train, X_test, y_train, y_test = train_test_split(data, data.Survived,
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((623, 9), (268, 9))

In [10]:
#Deal with Numerical value in categorical Variable
# Let's replace null values in age by a value at right end of distribution

def impute_na(df, variable, extreme):
    df[variable].fillna(extreme, inplace=True)
    
impute_na(X_train, 'Age', X_train.Age.mean()+3*X_train.Age.std())
impute_na(X_test, 'Age', X_train.Age.mean()+3*X_train.Age.std())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [11]:
X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
857,1,1,male,51.0,0,0,26.55,E,S
52,1,1,female,49.0,1,0,76.7292,D,C
386,0,3,male,1.0,5,2,46.9,M,S
124,0,1,male,54.0,0,1,77.2875,D,S
578,0,3,female,73.43632,1,0,14.4583,M,C


In [12]:
# let's check that we have no missing data after NA imputation

X_train.isnull().sum(), X_test.isnull().sum()

(Survived    0
 Pclass      0
 Sex         0
 Age         0
 SibSp       0
 Parch       0
 Fare        0
 Cabin       0
 Embarked    0
 dtype: int64, Survived    0
 Pclass      0
 Sex         0
 Age         0
 SibSp       0
 Parch       0
 Fare        0
 Cabin       0
 Embarked    0
 dtype: int64)

In [13]:
#1. One Hot Encoding
#if random Forest is there so be Careful for this


def get_OHE(df):
    df_OHE = pd.concat([df[['Age', 'Fare', 'Pclass']], 
                         pd.get_dummies(df[[ 'Sex', 'Cabin', 'Embarked']], drop_first=True)],
                        axis=1
                       )
    return df_OHE

X_train_OHE = get_OHE(X_train)
X_test_OHE = get_OHE(X_test)

X_train_OHE.head()


Unnamed: 0,Age,Fare,Pclass,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,Embarked_Missing,Embarked_Q,Embarked_S
857,51.0,26.55,1,1,0,0,0,1,0,0,0,0,0,0,1
52,49.0,76.7292,1,0,0,0,1,0,0,0,0,0,0,0,0
386,1.0,46.9,3,1,0,0,0,0,0,0,1,0,0,0,1
124,54.0,77.2875,1,1,0,0,1,0,0,0,0,0,0,0,1
578,73.43632,14.4583,3,0,0,0,0,0,0,0,1,0,0,0,0


In [14]:
X_test_OHE.head()

Unnamed: 0,Age,Fare,Pclass,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Embarked_Q,Embarked_S
495,103.157714,14.4583,3,1,0,0,0,0,0,0,1,0,0
648,103.157714,7.55,3,1,0,0,0,0,0,0,1,0,1
278,7.0,29.125,3,1,0,0,0,0,0,0,1,1,0
31,103.157714,146.5208,1,0,1,0,0,0,0,0,0,0,0
255,29.0,15.2458,3,0,0,0,0,0,0,0,1,0,0


In [15]:
# for now, I will just generate the missing column in the test set and fill it with 0s
# given that no observation in the test set is positive for those labels

X_test_OHE.loc[:, 'Cabin_T'] = 0
X_test_OHE.loc[:, 'Embarked_Missing'] = 0

X_train.shape, X_test.shape

((623, 9), (268, 9))

In [16]:
#Categical To Counts

def categorical_to_counts(df_train, df_test):
    # make a temporary copy of the original dataframes
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    for col in ['Sex', 'Cabin', 'Embarked']:
        # make the dictionary mapping label to counts
        X_frequency_map = df_train_temp[col].value_counts().to_dict()
        
        # remap the labels to their counts
        df_train_temp[col] = df_train_temp[col].map(X_frequency_map)
        df_test_temp[col] = df_test_temp[col].map(X_frequency_map)
    
    # drop the target
    df_train_temp.drop(['Survived'], axis=1, inplace=True)
    df_test_temp.drop(['Survived'], axis=1, inplace=True)
    
    return df_train_temp, df_test_temp

In [17]:
X_train_count, X_test_count = categorical_to_counts(X_train, X_test)
X_train_count.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
857,1,408,51.0,0,0,26.55,27,457
52,1,215,49.0,1,0,76.7292,26,108
386,3,408,1.0,5,2,46.9,471,457
124,1,408,54.0,0,1,77.2875,26,457
578,3,215,73.43632,1,0,14.4583,471,108


In [18]:
#3. According to the target

def categories_to_ordered(df_train, df_test):
    # make a temporary copy of the datasets 
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    for col in [ 'Sex', 'Cabin', 'Embarked']:
        # order the labels according to target mean
        ordered_labels = df_train.groupby([col])['Survived'].mean().sort_values().index
        
        # create the dictionary to map the ordered labels to an ordinal number
        ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} 
        
        # remap the categories  to these ordinal numbers
        df_train_temp[col] = df_train[col].map(ordinal_label)
        df_test_temp[col] = df_test[col].map(ordinal_label)
    
    # remove the target
    df_train_temp.drop(['Survived'], axis=1, inplace=True)
    df_test_temp.drop(['Survived'], axis=1, inplace=True)
    
    return df_train_temp, df_test_temp
        
X_train_ordered, X_test_ordered = categories_to_ordered(X_train, X_test)
X_train_ordered.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
857,1,0,51.0,0,0,26.55,7,0
52,1,1,49.0,1,0,76.7292,6,2
386,3,0,1.0,5,2,46.9,1,0
124,1,0,54.0,0,1,77.2875,6,0
578,3,1,73.43632,1,0,14.4583,1,2


In [19]:
#categorical to risk factor

def categories_to_risk(df_train, df_test):
    # temporary copy of the original dataframes
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    for col in ['Sex', 'Cabin', 'Embarked']:
        # make a dictionary mapping labels to the mean target for that label
        risk_dict = df_train.groupby([col])['Survived'].mean().to_dict()
        
        # re-map the labels
        df_train_temp[col] = df_train[col].map(risk_dict)
        df_test_temp[col] = df_test[col].map(risk_dict)
    
    # drop the target
    df_train_temp.drop(['Survived'], axis=1, inplace=True)
    df_test_temp.drop(['Survived'], axis=1, inplace=True)        
    return df_train_temp, df_test_temp
        
X_train_risk, X_test_risk = categories_to_risk(X_train, X_test)
X_train_risk.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
857,1,0.196078,51.0,0,0,26.55,0.740741,0.341357
52,1,0.753488,49.0,1,0,76.7292,0.692308,0.564815
386,3,0.196078,1.0,5,2,46.9,0.303609,0.341357
124,1,0.196078,54.0,0,1,77.2875,0.692308,0.341357
578,3,0.753488,73.43632,1,0,14.4583,0.303609,0.564815


In [20]:
# weigth of Evidence

def categories_to_woe(df_train, df_test):
    # temporary copy of datafranes
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    for col in [ 'Sex', 'Cabin', 'Embarked']:
        # create df containing the different parts of the WoE equation
        prob_df = X_train.groupby([col])['Survived'].mean()
        prob_df = pd.DataFrame(prob_df)
        prob_df['Died'] = 1-prob_df.Survived
        
        # remove zero values as the log of zero does not exist
        prob_df.loc[prob_df.Died == 0, 'Died'] = 0.00001
        prob_df.loc[prob_df.Survived == 0, 'Survived'] = 0.00001
        
        # calculate WoE
        prob_df['WoE'] = np.log(prob_df.Survived/prob_df.Died)
        woe = prob_df['WoE'].to_dict()
        
        # re-map the labels to WoE
        df_train_temp[col] = df_train[col].map(woe)
        df_test_temp[col] = df_test[col].map(woe)
    
    # drop the target
    df_train_temp.drop(['Survived'], axis=1, inplace=True)
    df_test_temp.drop(['Survived'], axis=1, inplace=True)        
    return df_train_temp, df_test_temp
        
X_train_woe, X_test_woe = categories_to_woe(X_train, X_test)
X_train_woe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
857,1,-1.410987,51.0,0,0,26.55,1.049822,-0.657254
52,1,1.117304,49.0,1,0,76.7292,0.81093,0.260726
386,3,-1.410987,1.0,5,2,46.9,-0.830169,-0.657254
124,1,-1.410987,54.0,0,1,77.2875,0.81093,-0.657254
578,3,1.117304,73.43632,1,0,14.4583,-0.830169,0.260726


In [21]:
#Random Forest Performance

def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [22]:
# OHE
run_randomForests(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Random Forests roc-auc: 0.8823669768551659
Test set
Random Forests roc-auc: 0.8705059523809523


In [23]:
# counts
run_randomForests(X_train_count, X_test_count, y_train, y_test)

Train set
Random Forests roc-auc: 0.8935435240016485
Test set
Random Forests roc-auc: 0.8820833333333333


In [24]:
# ordered labels
run_randomForests(X_train_ordered, X_test_ordered, y_train, y_test)

Train set
Random Forests roc-auc: 0.8931530769397628
Test set
Random Forests roc-auc: 0.8794345238095238


In [25]:
# risk factor
run_randomForests(X_train_risk, X_test_risk, y_train, y_test)

Train set
Random Forests roc-auc: 0.8931422311880436
Test set
Random Forests roc-auc: 0.879375


In [26]:
# woe
run_randomForests(X_train_woe, X_test_woe, y_train, y_test)

Train set
Random Forests roc-auc: 0.8931422311880436
Test set
Random Forests roc-auc: 0.879375
