In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each archive.
CHUNK_SIZE = 40960
# Comma-separated entries of the form "<directory>:<percent-encoded download URL>".
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it presumably stops working after that window - confirm before relying on it.
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240629%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240629T170755Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5d65c86f107fbec5298bfeed1fe2a8a3be0f9640746bed04b946490ddd8d69eb688ed3b6a92ebb73f550c754cb580ce9a512ff3fcdd35186da6f288601f1770cbee5789246898c5c8fbb913c97db349125b7dcac4b314f8032d47a3fd9c1e9a09e08520cd8db5d8f2ccb41b9c8d80c785f6a236dc9ef73deabd0a0028d53d2f36c0c9c81c912aa3bee945682d279abad94e1e4cc8863b0f56c79afc02e9082a881f99ad78241161f2afb6bc3a4c331b189bcf7c616bcb7572647f5bf5f2417082a9544516961045841210de6a8130b1f2ddb44c4fd354dc0384dd62aa263cc6b140ce5ea1cb14865bf20d100c73b4e1feda745f9e07c5c1757e0fdbf18edd9d9'

# Directories created below to hold the downloaded inputs and the notebook outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount whatever may be mounted there
# (errors are discarded), delete it, then recreate both directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working so the relative paths used
# by later cells resolve; an already existing link is left in place.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split only on the first
    # colon so the directory/URL boundary is unambiguous.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be absent; without a total we still download but
            # cannot draw a meaningful progress bar.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure everything is on disk before the archive is reopened by path.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # shadow the module and break any later use of it.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Imports

In [None]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import category_encoders as ce

In [None]:
# Read the training set, the test set and the sample submission in one pass.
df_train, df_test, df_sub = map(
    pd.read_csv,
    (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    ),
)

# Preprocessing The Data

In [None]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


First, let's see which features could be used to train the ML model to predict passenger survival. The passenger name (`Name`) probably can't be used right off the bat, as each passenger most likely has a unique name (this can be verified), and a feature with as many categories as there are entries in the dataset is unlikely to be useful for the model. One possible way to incorporate the passenger name as a feature would be to encode it into, say, a binary category, where `1` indicates that the passenger is a person of status (e.g. monarchy, president, etc.) and `0` otherwise.

The ticket number `Ticket` is similar in that sense, and is dropped for initial ML modeling. `PassengerId` is dropped along with it in the cells below.

In [None]:
# Remove the identifier-like columns from the training frame (modifies it in place).
df_train.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [None]:
# Remove the identifier-like columns from the test frame (modifies it in place).
df_test.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [None]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [None]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0,,S
2,2,male,62.0,0,0,9.6875,,Q
3,3,male,27.0,0,0,8.6625,,S
4,3,female,22.0,1,1,12.2875,,S


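Next, we encode the categorical features, which are strings. We start with `Cabin`: missing values are replaced with the placeholder `'UNKNOWN'`, and the column is then encoded with an `OrdinalEncoder` from `category_encoders` using an explicit mapping built from all cabin values in the train and test sets: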

In [None]:
# Give missing cabins an explicit 'UNKNOWN' label so they can be encoded as a
# category of their own. fillna() is the direct way to replace NaN; it also
# avoids the np.NaN alias, which no longer exists in NumPy 2.0.
df_train['Cabin'] = df_train['Cabin'].fillna('UNKNOWN')
df_test['Cabin'] = df_test['Cabin'].fillna('UNKNOWN')

In [None]:
# Sorted, de-duplicated cabin labels across both the training and the test frame.
train_cabins = df_train['Cabin'].tolist()
test_cabins = df_test['Cabin'].tolist()
cabin_vals = np.unique(train_cabins + test_cabins)
cabin_vals

array(['A10', 'A11', 'A14', 'A16', 'A18', 'A19', 'A20', 'A21', 'A23',
       'A24', 'A26', 'A29', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7',
       'A9', 'B10', 'B101', 'B102', 'B11', 'B18', 'B19', 'B20', 'B22',
       'B24', 'B26', 'B28', 'B3', 'B30', 'B35', 'B36', 'B37', 'B38',
       'B39', 'B4', 'B41', 'B42', 'B45', 'B49', 'B5', 'B50',
       'B51 B53 B55', 'B52 B54 B56', 'B57 B59 B63 B66', 'B58 B60', 'B61',
       'B69', 'B71', 'B73', 'B77', 'B78', 'B79', 'B80', 'B82 B84', 'B86',
       'B94', 'B96 B98', 'C101', 'C103', 'C104', 'C105', 'C106', 'C110',
       'C111', 'C116', 'C118', 'C123', 'C124', 'C125', 'C126', 'C128',
       'C130', 'C132', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C28',
       'C30', 'C31', 'C32', 'C39', 'C45', 'C46', 'C47', 'C49', 'C50',
       'C51', 'C52', 'C53', 'C54', 'C55 C57', 'C6', 'C62 C64', 'C65',
       'C68', 'C7', 'C70', 'C78', 'C80', 'C82', 'C83', 'C85', 'C86',
       'C87', 'C89', 'C90', 'C91', 'C92', 'C93', 'C95', 'C97', 'C99', 'D',
       'D

In [None]:
# Assign each cabin label its position in the sorted `cabin_vals` array.
mapping_d = {feature: i for i, feature in enumerate(cabin_vals)}

# Ordinal encoder for 'Cabin' that uses the explicit mapping above.
cabin_mapping_spec = [{'col': 'Cabin', 'mapping': mapping_d}]
encoder = ce.OrdinalEncoder(
    cols=['Cabin'],
    return_df=True,
    mapping=cabin_mapping_spec,
)

In [None]:
mapping_d

{'A10': 0,
 'A11': 1,
 'A14': 2,
 'A16': 3,
 'A18': 4,
 'A19': 5,
 'A20': 6,
 'A21': 7,
 'A23': 8,
 'A24': 9,
 'A26': 10,
 'A29': 11,
 'A31': 12,
 'A32': 13,
 'A34': 14,
 'A36': 15,
 'A5': 16,
 'A6': 17,
 'A7': 18,
 'A9': 19,
 'B10': 20,
 'B101': 21,
 'B102': 22,
 'B11': 23,
 'B18': 24,
 'B19': 25,
 'B20': 26,
 'B22': 27,
 'B24': 28,
 'B26': 29,
 'B28': 30,
 'B3': 31,
 'B30': 32,
 'B35': 33,
 'B36': 34,
 'B37': 35,
 'B38': 36,
 'B39': 37,
 'B4': 38,
 'B41': 39,
 'B42': 40,
 'B45': 41,
 'B49': 42,
 'B5': 43,
 'B50': 44,
 'B51 B53 B55': 45,
 'B52 B54 B56': 46,
 'B57 B59 B63 B66': 47,
 'B58 B60': 48,
 'B61': 49,
 'B69': 50,
 'B71': 51,
 'B73': 52,
 'B77': 53,
 'B78': 54,
 'B79': 55,
 'B80': 56,
 'B82 B84': 57,
 'B86': 58,
 'B94': 59,
 'B96 B98': 60,
 'C101': 61,
 'C103': 62,
 'C104': 63,
 'C105': 64,
 'C106': 65,
 'C110': 66,
 'C111': 67,
 'C116': 68,
 'C118': 69,
 'C123': 70,
 'C124': 71,
 'C125': 72,
 'C126': 73,
 'C128': 74,
 'C130': 75,
 'C132': 76,
 'C148': 77,
 'C2': 78,
 'C22 C26':

In [None]:
# Replace the 'Cabin' strings in the training frame with the integer codes from
# the explicit mapping passed to the encoder.
df_train = encoder.fit_transform(df_train)

  elif pd.api.types.is_categorical(cols):


In [None]:
# Apply the same 'Cabin' encoding to the test frame.
# NOTE(review): fit_transform (not transform) is called on the test data. The
# encoder was built with an explicit mapping, so this presumably learns nothing
# from the test set - confirm. df_test has one column fewer than df_train (no
# 'Survived'), which may be why the train-fitted encoder is not reused via transform().
df_test = encoder.fit_transform(df_test)

In [None]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,186,S
1,1,1,female,38.0,1,0,71.2833,106,C
2,1,3,female,26.0,0,0,7.925,186,S
3,1,1,female,35.0,1,0,53.1,70,S
4,0,3,male,35.0,0,0,8.05,186,S


In [None]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,186,Q
1,3,female,47.0,1,0,7.0,186,S
2,2,male,62.0,0,0,9.6875,186,Q
3,3,male,27.0,0,0,8.6625,186,S
4,3,female,22.0,1,1,12.2875,186,S


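For the other categorical variables, the EDA indicated no null values, so we proceed to encode them with Scikit-learn's `LabelEncoder`. (Note: in the scaled outputs further below, `Embarked` spans four encoded values in the training set but only three in the test set. This suggests the training column holds an additional value, possibly a few missing entries that were encoded as their own label, and is worth verifying.)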

In [None]:
# Turn the remaining string columns into integer labels. The label set is
# learned from the training frame only and then applied to both frames.
cols_to_encode = ['Sex', 'Embarked']
for col in cols_to_encode:
    le = LabelEncoder().fit(df_train[col])
    for frame in (df_train, df_test):
        frame[col] = le.transform(frame[col])

In [None]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,186,2
1,1,1,0,38.0,1,0,71.2833,106,0
2,1,3,0,26.0,0,0,7.925,186,2
3,1,1,0,35.0,1,0,53.1,70,2
4,0,3,1,35.0,0,0,8.05,186,2


In [None]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,34.5,0,0,7.8292,186,1
1,3,0,47.0,1,0,7.0,186,2
2,2,1,62.0,0,0,9.6875,186,1
3,3,1,27.0,0,0,8.6625,186,2
4,3,0,22.0,1,1,12.2875,186,2


Note that there are missing values in `Age` and `Fare`. For such cases, we impute them with `-1`.

In [None]:
# Impute missing 'Age' and 'Fare' values with the sentinel -1 in both frames.
# fillna() replaces NaN directly and avoids the np.NaN alias, which no longer
# exists in NumPy 2.0.
for frame in (df_train, df_test):
    for col in ('Age', 'Fare'):
        frame[col] = frame[col].fillna(-1)

A quick sanity check: after imputation, no rows with missing values should remain in either dataset.

In [None]:
df_train[df_train.isna().any(axis=1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


In [None]:
df_test[df_test.isna().any(axis=1)]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


# Feature Engineering

## Stratify Age Into Groups

Looking at our [EDA](https://www.kaggle.com/khoongweihao/part-1-exploratory-data-analysis-eda) for `Age`, we can see that most of the passengers are between 18-30 years of age. We can generate new features that group the passengers into different age groups.

In [None]:
def age_group(x):
    """Return the age-bracket label for a passenger's age.

    Parameters
    ----------
    x : float
        Age in years. Missing ages may arrive as NaN or as a negative
        sentinel (the notebook imputes missing ages with -1).

    Returns
    -------
    str
        'unknown' for NaN or negative input, 'under_18' for ages below 18,
        '18_to_30' for ages from 18 to 30 inclusive, 'above_30' otherwise.
    """
    # NaN fails every comparison, so `not (x >= 0)` catches both NaN and the
    # negative sentinel; without this guard imputed ages would be counted as
    # 'under_18' and NaN as 'above_30'.
    if not x >= 0:
        return 'unknown'
    if x < 18:
        return 'under_18'
    if x <= 30:
        return '18_to_30'
    return 'above_30'

In [None]:
# Derive the age-bracket feature for every training row, then preview the frame.
train_age_groups = df_train['Age'].apply(age_group)
df_train['Age_Group'] = train_age_groups
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,0,3,1,22.0,1,0,7.25,186,2,18_to_30
1,1,1,0,38.0,1,0,71.2833,106,0,above_30
2,1,3,0,26.0,0,0,7.925,186,2,18_to_30
3,1,1,0,35.0,1,0,53.1,70,2,above_30
4,0,3,1,35.0,0,0,8.05,186,2,above_30


In [None]:
# Derive the age-bracket feature for every test row, then preview the frame.
test_age_groups = df_test['Age'].apply(age_group)
df_test['Age_Group'] = test_age_groups
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,3,1,34.5,0,0,7.8292,186,1,above_30
1,3,0,47.0,1,0,7.0,186,2,above_30
2,2,1,62.0,0,0,9.6875,186,1,above_30
3,3,1,27.0,0,0,8.6625,186,2,18_to_30
4,3,0,22.0,1,1,12.2875,186,2,18_to_30


Note that we have to encode again!

In [None]:
# The new age-bracket column holds strings, so convert it to integer labels.
# The label set is learned from the training frame and applied to both frames.
cols_to_encode = ['Age_Group']
for col in cols_to_encode:
    le = LabelEncoder().fit(df_train[col])
    for frame in (df_train, df_test):
        frame[col] = le.transform(frame[col])

In [None]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,0,3,1,22.0,1,0,7.25,186,2,0
1,1,1,0,38.0,1,0,71.2833,106,0,1
2,1,3,0,26.0,0,0,7.925,186,2,0
3,1,1,0,35.0,1,0,53.1,70,2,1
4,0,3,1,35.0,0,0,8.05,186,2,1


In [None]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,3,1,34.5,0,0,7.8292,186,1,1
1,3,0,47.0,1,0,7.0,186,2,1
2,2,1,62.0,0,0,9.6875,186,1,1
3,3,1,27.0,0,0,8.6625,186,2,0
4,3,0,22.0,1,1,12.2875,186,2,0


# More Preprocessing

Before we begin to train our model for the classification task, we first scale the data with Scikit-learn's `MinMaxScaler`. Before scaling, we extract the target column from `df_train`.

In [None]:
# Keep the target labels in their own Series.
y = df_train['Survived']
# NOTE(review): the drop below is disabled, so 'Survived' remains a column of
# df_train - presumably so the saved training CSV still carries its target; confirm.
#df_train.drop('Survived', axis=1, inplace=True)

Now we are ready to scale the data and use them for model training.

In [None]:
# Min-max scale the listed feature columns.
# The scaler is fitted on the training frame only and then reused for the test
# frame, so both are mapped with the same per-column minimum and range. Fitting
# a second scaler on the test frame would map the same raw value to different
# scaled values in train and test. Test values outside the training range can
# consequently fall outside [0, 1].
cols_to_scale = ['Pclass', 'Age', 'Fare', 'Cabin', 'Embarked', 'Age_Group']
sc = MinMaxScaler()
df_train[cols_to_scale] = sc.fit_transform(df_train[cols_to_scale])
df_test[cols_to_scale] = sc.transform(df_test[cols_to_scale])

In [None]:
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,0,1.0,1,0.283951,1,0,0.014151,1.0,0.666667,0.0
1,1,0.0,0,0.481481,1,0,0.139136,0.569892,0.0,0.5
2,1,1.0,0,0.333333,0,0,0.015469,1.0,0.666667,0.0
3,1,0.0,0,0.444444,1,0,0.103644,0.376344,0.666667,0.5
4,0,1.0,1,0.444444,0,0,0.015713,1.0,0.666667,0.5


In [None]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,1.0,1,0.461039,0,0,0.0172,1.0,0.5,0.5
1,1.0,0,0.623377,1,0,0.015585,1.0,1.0,0.5
2,0.5,1,0.818182,0,0,0.02082,1.0,0.5,0.5
3,1.0,1,0.363636,0,0,0.018823,1.0,1.0,0.0
4,1.0,0,0.298701,1,1,0.025885,1.0,1.0,0.0


# Save Preprocessed Datasets For Modeling

In [None]:
# Write both preprocessed frames to the working directory without the index column.
outputs = (
    (df_train, 'train_preprocessed.csv'),
    (df_test, 'test_preprocessed.csv'),
)
for frame, csv_name in outputs:
    frame.to_csv(csv_name, index=False)

# Finishing Remarks

Thanks for reading and I welcome your feedback and suggestions for improvement. The notebook will be updated periodically as well.

Happy Kaggling!

---------------------------------------------------------------------
My notebooks in this series can be found in the links below:
- [Exploratory Data Analysis (EDA)](https://www.kaggle.com/khoongweihao/part-1-exploratory-data-analysis-eda)
- [Preprocessing & Feature Engineering](https://www.kaggle.com/khoongweihao/part-2-preprocessing-feature-engineering)
- [Model Training & Validation Strategies](https://www.kaggle.com/khoongweihao/part-3-model-training-validation-strategies)
- [Inference & Post-processing Techniques](https://www.kaggle.com/khoongweihao/part-4-inference-and-post-processing-techniques)

Bonus notebooks include adoption of recent research in terms of models, hyperparameter search, etc. They can be found in the links below:
- Hyperparameter optimization with Optuna
- TabNet