In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# The machine learning models.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# To evaluate the models.
from sklearn.metrics import roc_auc_score

# To separate data into train and test.
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('desktop/feature-engineering-for-machine-learning-main/feature-engineering-for-machine-learning-main/titanic.csv')

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
#categorical variables are Name, Sex, Ticket, Cabin and Embarked.

#Ticket and Cabin contain both letters and numbers, so they could be treated as Mixed Variables.
#In this demo, I will treat them as categorical.

In [5]:
# cardinality: the number of different labels.

print('Number of categories in the variable Name: {}'.format(
    len(data.name.unique())))

print('Number of categories in the variable Gender: {}'.format(
    len(data.sex.unique())))

print('Number of categories in the variable Ticket: {}'.format(
    len(data.ticket.unique())))

print('Number of categories in the variable Cabin: {}'.format(
    len(data.cabin.unique())))

print('Number of categories in the variable Embarked: {}'.format(
    len(data.embarked.unique())))

print('Total number of passengers in the Titanic: {}'.format(len(data)))

Number of categories in the variable Name: 1307
Number of categories in the variable Gender: 2
Number of categories in the variable Ticket: 929
Number of categories in the variable Cabin: 182
Number of categories in the variable Embarked: 4
Total number of passengers in the Titanic: 1309


In [6]:
# the variables Ticket, Name, and Cabin, as expected, contain a huge number of different labels (high cardinality).

#To demonstrate the effect of high cardinality on train and test sets and on machine learning performance,
#I will work with the variable cabin. I will create a new variable with reduced cardinality.

In [7]:
# explore the values of Cabin.
data.cabin.unique()

array(['B5', 'C22', 'E12', 'D7', 'A36', 'C101', nan, 'C62', 'B35', 'A23',
       'B58', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99', 'C52', 'T',
       'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4', 'E40',
       'B38', 'E24', 'B51', 'B96', 'C46', 'E31', 'E8', 'B61', 'B77', 'A9',
       'C89', 'A14', 'E58', 'E49', 'E52', 'E45', 'B22', 'B26', 'C85',
       'E17', 'B71', 'B20', 'A34', 'C86', 'A16', 'A20', 'A18', 'C54',
       'C45', 'D20', 'A29', 'C95', 'E25', 'C111', 'C23', 'E36', 'D34',
       'D40', 'B39', 'B41', 'B102', 'C123', 'E63', 'C130', 'B86', 'C92',
       'A5', 'C51', 'B42', 'C91', 'C125', 'D10', 'B82', 'E50', 'D33',
       'C83', 'B94', 'D49', 'D45', 'B69', 'B11', 'E46', 'C39', 'B18',
       'D11', 'C93', 'B28', 'C49', 'B52', 'E60', 'C132', 'B37', 'D21',
       'D19', 'C124', 'D17', 'B101', 'D28', 'D6', 'D9', 'B80', 'C106',
       'B79', 'C47', 'D30', 'C90', 'E38', 'C78', 'C30', 'C118', 'D36',
       'D48', 'D47', 'C105', 'B36', 'B30', 'D43', 'B24', 'C2', 'C65',


In [8]:
# Let's reduce the cardinality of the variable. 
#How? Instead of using the entire value (letter + number), I will only use the first letter.
#the first letter indicates the deck on which the cabin was located, 
#indicating both social class status and proximity to the Titanic's surface. 
#Both are known to improve the probability of survival.

In [9]:
# Let's capture the first letter of cabin.

data['Cabin_reduced'] = data['cabin'].astype(str).str[0]
# .astype(str) converts the values in the 'cabin' column to strings. 
#This step ensures that all values in the 'cabin' column are treated as strings,
#as they may originally be of different data types.

data[['cabin', 'Cabin_reduced']].head()

Unnamed: 0,cabin,Cabin_reduced
0,B5,B
1,C22,C
2,C22,C
3,C22,C
4,C22,C


In [10]:
print('Number of categories in the variable Cabin: {}'.format(
    len(data.cabin.unique())))

print('Number of categories in the variable Cabin reduced: {}'.format(
    len(data.Cabin_reduced.unique())))

Number of categories in the variable Cabin: 182
Number of categories in the variable Cabin reduced: 9


In [11]:
# Let's separate the data into training and testing sets.

use_cols = ['cabin', 'Cabin_reduced', 'sex']
X_train, X_test, y_train, y_test = train_test_split( data[use_cols], data['survived'],test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((916, 3), (393, 3))

In [14]:
## Uneven distribution of categories
#When a variable is highly cardinal, some categories appear only on the training set, and others only on the testing set.
#If present only in the training set, they may cause over-fitting.
#If present only on the testing set, the machine learning model will not know how to handle them, as they were not seen during training.


In [15]:
# Labels present only in the training set:

unique_to_train_set = [
    x for x in X_train.cabin.unique() if x not in X_test.cabin.unique()
]

len(unique_to_train_set)

113

In [16]:
# There are 113 Cabins only present in the training set.

In [18]:
 #Labels present only in the test set.

unique_to_test_set = [
    x for x in X_test.cabin.unique() if x not in X_train.cabin.unique()
]

len(unique_to_test_set)

36

In [19]:
# This will cause problems at the time of training (over-fitting) and
#scoring of new data (how will the model deal with unseen categories?).

#This problem can be mitigated by reducing the cardinality of the variable. 

In [20]:
# Labels present only in the training set for Cabin with reduced cardinality.

unique_to_train_set = [
    x for x in X_train['Cabin_reduced'].unique()
    if x not in X_test['Cabin_reduced'].unique()
]

len(unique_to_train_set)

1

In [21]:
# Labels present only in the test set
# for Cabin with reduced cardinality.

unique_to_test_set = [
    x for x in X_test['Cabin_reduced'].unique()
    if x not in X_train['Cabin_reduced'].unique()
]

len(unique_to_test_set)

0

In [22]:
# The impact of cardinality on the performance of machine learning models