# Preprocessing

In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression

In [138]:
def preprocess_data(df, is_Train = True):
    """Build the model-ready feature table from a raw Titanic passenger frame.

    The function engineers several features (Title, a child-age flag,
    FamilySize, LogFare, FareGroup, an encoded Embarked) but returns only
    the columns selected at the end: ``PassengerId``, ``Sex``, ``FareGroup``
    and, for training data, ``Survived``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. The code reads the columns ``Name``, ``Sex``,
        ``Age``, ``Pclass``, ``SibSp``, ``Parch``, ``Fare``, ``Embarked``,
        ``PassengerId`` and (when ``is_Train`` is true) ``Survived``.
    is_Train : bool, default True
        When true the ``Survived`` column is kept in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.

    Notes
    -----
    ``Sex`` and ``Embarked`` values outside the mapped sets become NaN and
    make the following integer cast fail.
    """
    # Work on a copy: the column assignments below would otherwise write
    # into the caller's frame, and a second call on the same raw frame would
    # then fail because "Sex" is no longer a string column.
    df = df.copy()

    # Name and Title
    # Raw string so that "\." reaches the regex engine as an escaped dot.
    df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand = False)
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4}
    # Any title not listed above is grouped under 0.
    df["Title"] = df["Title"].map(title_mapping).fillna(0).astype(int)

    # Sex
    df["Sex"] = df["Sex"].map({"male": 1, "female": 0}).astype(int)

    # Age
    # Missing values are filled using a regression on Title + Sex + Ticket Class,
    # then Age is mapped to 1 if it is below 10 and to 0 otherwise.
    title_dummies = pd.get_dummies(df["Title"], prefix = "Title", drop_first = True)
    pclass_dummies = pd.get_dummies(df["Pclass"], prefix = "Pclass", drop_first = True)
    df = pd.concat([df, title_dummies, pclass_dummies], axis = 1)
    regress_features = ["Sex"] + [col for col in df.columns if col.startswith(("Title_", "Pclass_"))]
    age_missing = df["Age"].isna()
    # Only fit/predict when there is something to impute; predicting on an
    # empty frame raises inside scikit-learn.
    if age_missing.any():
        model = LinearRegression()
        model.fit(df.loc[~age_missing, regress_features], df.loc[~age_missing, "Age"])
        df.loc[age_missing, "Age"] = model.predict(df.loc[age_missing, regress_features])
    df["Age"] = (df["Age"] < 10).astype(int)

    # SibSp & Parch
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Fare
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())
    # Non-positive fares are replaced by 1 before the log so they come out as 0.
    df["LogFare"] = np.log(df["Fare"].where(df["Fare"] > 0, 1.0))
    # labels = False returns the quintile index (0-4) directly.
    # NOTE: the bin edges are computed from the frame passed in, so separate
    # calls for train and test data use different edges for the same code.
    df["FareGroup"] = pd.qcut(df["Fare"], 5, labels = False)

    # Embarked
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
    df["Embarked"] = df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)

    # Others
    # Only these columns are handed to the model; the remaining engineered
    # features are computed above but not returned.
    if is_Train:
        df = df[["PassengerId", "Survived","Sex", "FareGroup"]]
    else:
        df = df[["PassengerId", "Sex", "FareGroup"]]
    return df

In [139]:
# Read both raw splits first, then run each through the shared preprocessing.
# The test split has no "Survived" column, hence is_Train = False for it.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

train_df = preprocess_data(train_df, True)
test_df = preprocess_data(test_df, False)

In [140]:
# Separate the target from the predictors; PassengerId is an identifier,
# not a feature, so it is removed from both design matrices.
y_train = train_df["Survived"]
x_train = train_df.drop(columns = ["PassengerId", "Survived"])
x_test = test_df.drop(columns = ["PassengerId"])

# Optional standardisation step, currently disabled:
#scaler = StandardScaler()
#x_train = scaler.fit_transform(x_train)
#x_test = scaler.transform(x_test)

In [141]:
# Persist the preprocessed frames and the model inputs for later notebooks.
# The model inputs are wrapped in a DataFrame before pickling.
pickle_targets = (
    ("../metadata/train.pkl", train_df),
    ("../metadata/test.pkl", test_df),
    ("../metadata/x_train.pkl", pd.DataFrame(x_train)),
    ("../metadata/x_test.pkl", pd.DataFrame(x_test)),
    ("../metadata/y_train.pkl", pd.DataFrame(y_train)),
)
for pickle_path, frame in pickle_targets:
    frame.to_pickle(pickle_path)