# Feature Engineering with the Titanic Dataset

First, import the required libraries, then load the data and take a first look at it.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Read the Titanic training CSV into a DataFrame
titanic_raw = pd.read_csv(filepath_or_buffer='./data/titanic_train.csv')

# Preview the first rows of the freshly loaded data
titanic_raw.head()

Feature engineering:

In [None]:
# Work on a copy: a plain `titanic = titanic_raw` only binds a second name to
# the same DataFrame, so every engineered column would also be written into
# titanic_raw.
titanic = titanic_raw.copy()

# Create a new feature "FamilySize" as a combination of SibSp and Parch
# (the extra 1 makes a passenger with SibSp == Parch == 0 a family of one)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

# Create new feature "IsAlone" from "FamilySize": 1 when FamilySize == 1, else 0
titanic['IsAlone'] = (titanic['FamilySize'] == 1).astype('int64')

# Create "FarePerPerson" feature: the fare divided evenly by the family size
# (assumes SibSp and Parch are non-negative, so FamilySize is never zero)
titanic['FarePerPerson'] = titanic['Fare'] / titanic['FamilySize']

# Create a new feature "Title", this is extracted from the name feature
# We first define a function to extract titles from passenger names
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the next period of the
    segment that follows it, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Args:
        name: The passenger name. Non-string values (such as NaN for a
            missing name) are accepted.

    Returns:
        The title with surrounding whitespace removed, or ``'Unknown'`` when
        ``name`` is not a string or contains no period.
    """
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    segments = name.split(',')
    # With no comma there is no leading surname segment to skip, so fall back
    # to the whole name instead of raising IndexError on segments[1].
    title_segment = segments[1] if len(segments) > 1 else segments[0]
    return title_segment.split('.')[0].strip()
        
# Create a new "Title" feature
titanic['Title'] = titanic['Name'].apply(get_title)

# Simplify the titles, merge less common titles into the same category.
# 'the Countess' is listed next to 'Countess' because get_title keeps every
# word between the comma and the period, so a name written as
# "X, the Countess. of ..." yields 'the Countess' and would otherwise stay
# un-merged.
distinguished_titles = [
    'Lady', 'Countess', 'the Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
    'Rev', 'Sir', 'Jonkheer', 'Dona',
]
# Spelling variants that map onto a common title
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
titanic['Title'] = (
    titanic['Title']
    .replace(distinguished_titles, 'Distinguished')
    .replace(title_aliases)
)

# Convert "Title" into numerical values using one-hot encoding
one_hot = OneHotEncoder()
title_encoded = one_hot.fit_transform(titanic[['Title']]).toarray()
# Reuse titanic's index so the column-wise concat lines rows up by label even
# if the frame does not carry a default 0..n-1 index.
title_encoded_df = pd.DataFrame(
    title_encoded,
    columns=one_hot.get_feature_names_out(['Title']),
    index=titanic.index,
)
titanic = pd.concat([titanic, title_encoded_df], axis=1)

# Fill missing age values using median age.
# Assign the result back instead of calling fillna(..., inplace=True) on
# titanic['Age']: the in-place call acts on an intermediate Series, which
# triggers a chained-assignment warning in recent pandas and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(median_age)

# Create "AgeGroup" feature.
# right=False makes every bin closed on the left and open on the right
# ([0, 10), [10, 20), ...), which is what the labels describe: age 10 falls
# in '10-19' and age 0 in '0-9'. With the default right-closed bins, age 10
# would be labelled '0-9' and age 0 would fall outside every bin.
bins = [0, 10, 20, 30, 40, 50, 60, 70, np.inf]
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']
titanic['AgeGroup'] = pd.cut(titanic['Age'], bins=bins, labels=labels, right=False)

# Convert "AgeGroup" into numerical values using one-hot encoding
# (fit_transform refits the shared encoder on this column)
age_group_encoded = one_hot.fit_transform(titanic[['AgeGroup']]).toarray()
# Reuse titanic's index so the column-wise concat lines rows up by label even
# if the frame does not carry a default 0..n-1 index.
age_group_encoded_df = pd.DataFrame(
    age_group_encoded,
    columns=one_hot.get_feature_names_out(['AgeGroup']),
    index=titanic.index,
)
titanic = pd.concat([titanic, age_group_encoded_df], axis=1)

# Fill missing cabin values with "U" for unknown.
# Assign the result back instead of calling fillna(..., inplace=True) on
# titanic['Cabin']: the in-place call acts on an intermediate Series, which
# triggers a chained-assignment warning in recent pandas and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
titanic['Cabin'] = titanic['Cabin'].fillna('U')

# Create "CabinClass" feature: the first character of the cabin value
# ('U' for the rows filled above)
titanic['CabinClass'] = titanic['Cabin'].str[0]

# Convert CabinClass into numerical values using one-hot encoding
# (fit_transform refits the shared encoder on this column)
cabin_class_encoded = one_hot.fit_transform(titanic[['CabinClass']]).toarray()
# Reuse titanic's index so the column-wise concat lines rows up by label even
# if the frame does not carry a default 0..n-1 index.
cabin_class_encoded_df = pd.DataFrame(
    cabin_class_encoded,
    columns=one_hot.get_feature_names_out(['CabinClass']),
    index=titanic.index,
)
titanic = pd.concat([titanic, cabin_class_encoded_df], axis=1)

# Convert Sex into numerical values using one-hot encoding
# (fit_transform refits the shared encoder on this column)
sex_encoded = one_hot.fit_transform(titanic[['Sex']]).toarray()
# Reuse titanic's index so the column-wise concat lines rows up by label even
# if the frame does not carry a default 0..n-1 index.
sex_encoded_df = pd.DataFrame(
    sex_encoded,
    columns=one_hot.get_feature_names_out(['Sex']),
    index=titanic.index,
)
titanic = pd.concat([titanic, sex_encoded_df], axis=1)

# Convert Embarked into numerical values using one-hot encoding
# (fit_transform refits the shared encoder on this column).
# NOTE(review): Embarked is not imputed anywhere above; if it contains
# missing values the encoder either gives them a separate column or raises,
# depending on the scikit-learn version -- confirm which is intended.
embarked_encoded = one_hot.fit_transform(titanic[['Embarked']]).toarray()
# Reuse titanic's index so the column-wise concat lines rows up by label even
# if the frame does not carry a default 0..n-1 index.
embarked_encoded_df = pd.DataFrame(
    embarked_encoded,
    columns=one_hot.get_feature_names_out(['Embarked']),
    index=titanic.index,
)
titanic = pd.concat([titanic, embarked_encoded_df], axis=1)

# Remove the raw text/categorical columns that are no longer wanted now that
# their encoded counterparts have been added
titanic = titanic.drop(
    columns=[
        'Name',
        'Ticket',
        'Title',
        'Cabin',
        'Sex',
        'Embarked',
        'AgeGroup',
        'CabinClass',
    ]
)

# Preview the engineered DataFrame
titanic.head()
