# Preprocessing

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
def preprocess_data(df):
    """Convert a raw passenger frame into an all-numeric feature frame.

    The input frame is never modified; every step runs on a copy, so the
    function can be called repeatedly on the same raw frame.

    Steps performed:
      * ``Title``: the first run of letters in ``Name`` that follows a space
        and is immediately followed by a period, encoded as Mr=1, Miss=2,
        Mrs=3, Master=4 and 0 for anything else (including no match).
      * ``Sex``: encoded as male=1, female=0.
      * ``Age``: missing values take the median age of rows sharing the same
        ``Title``; if that whole group has no age, the overall median is used.
      * ``FamilySize``: ``SibSp + Parch + 1``.
      * ``Fare``: missing values take the column median.
      * ``Embarked``: missing values take the most frequent value, then
        encoded as S=1, C=2, Q=3.
      * ``Name``, ``Ticket``, ``Cabin``, ``SibSp`` and ``Parch`` are dropped
        (silently skipped when absent).

    All imputation statistics are computed from the frame that is passed in,
    so separate calls (e.g. one per data split) impute independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
        ``Fare`` and ``Embarked``. Any other columns pass through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded/imputed columns plus ``Title`` and
        ``FamilySize``.

    Raises
    ------
    KeyError
        If a required column is missing.
    Exception
        The integer casts fail if ``Sex`` or ``Embarked`` contains a value
        outside the mappings above (it maps to NaN, which cannot be cast).
    """
    # Work on a copy: the column assignments below would otherwise leak into
    # the caller's frame and make a second call on it fail.
    df = df.copy()

    # Name
    # Raw string so that "\." is a regex escape, not an invalid string escape.
    df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand = False)
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4}
    df["Title"] = df["Title"].map(title_mapping).fillna(0).astype(int)

    # Sex
    df["Sex"] = df["Sex"].map({"male": 1, "female": 0}).astype(int)

    # Age
    # Capture the overall median before imputing so the fallback is not
    # influenced by the per-title fills.
    overall_age_median = df["Age"].median()
    title_age_median = df.groupby("Title")["Age"].transform("median")
    # A title group with no known ages has a NaN median; fall back to overall.
    df["Age"] = df["Age"].fillna(title_age_median).fillna(overall_age_median)

    # SibSp & Parch
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Fare
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # Embarked
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
    df["Embarked"] = df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)

    # Others
    df = df.drop(columns = ["Name", "Ticket", "Cabin", "SibSp", "Parch"], errors = "ignore")

    return df

In [3]:
# Read each raw CSV split and pass it straight through the shared
# preprocessing function, so both splits get identical feature engineering.
train_df = preprocess_data(pd.read_csv("../data/train.csv"))
test_df = preprocess_data(pd.read_csv("../data/test.csv"))

In [4]:
# Separate the target from the features; PassengerId is excluded from the
# feature matrices of both splits.
y_train = train_df["Survived"]
x_train = train_df.drop(columns = ["PassengerId", "Survived"])
x_test = test_df.drop(columns = ["PassengerId"])

# Fit the scaler on the training features only, then apply those same
# statistics to both splits. After this point x_train and x_test hold the
# scaler's array output rather than DataFrames.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [5]:
# Make sure the output directory exists; to_pickle does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
Path("../metadata").mkdir(parents = True, exist_ok = True)

# Preprocessed (unscaled) frames, including PassengerId and, for the training
# split, Survived.
train_df.to_pickle("../metadata/train.pkl")
test_df.to_pickle("../metadata/test.pkl")

# Scaled feature matrices and the target, each wrapped in a DataFrame before
# pickling.
pd.DataFrame(x_train).to_pickle("../metadata/x_train.pkl")
pd.DataFrame(x_test).to_pickle("../metadata/x_test.pkl")
pd.DataFrame(y_train).to_pickle("../metadata/y_train.pkl")