# Spaceship Titanic Dataset with XGBoost


# Import the packages


In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

from xgboost import XGBClassifier

import optuna

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
  plt.style.use("seaborn-whitegrid")


In [2]:
# PATH = "/kaggle/input/spaceship-titanic/"
PATH = ""

1. Sleep and spent
2. Deck and number of survivers
3. Group and Destination/HomePlanet


# 1 - Date preprocessing


In [3]:
def load_data(PATH):
    # Read data
    data_dir = Path(PATH)
    df_train = pd.read_csv(data_dir / "train.csv")
    id_train = df_train.index
    df_test = pd.read_csv(data_dir / "test.csv")
    id_test = df_test.index + len(id_train)
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test], ignore_index=True)
    # Cleaning
    df = clean(df)
    df = encode(df)
    # Reform splits
    df_train = df.loc[id_train, :]
    df_test = df.loc[id_test, :]
    return df_train, df_test

## Clean data


In [4]:
def bool_to_int(x):
    if str(x) == "True":
        return 1
    elif str(x) == "False":
        return 0
    else:
        return x


def clean(df):
    df["Transported"] = df["Transported"].astype("bool")

    return df

## Encode


In [5]:
def encode(df):
    features_nom = df.select_dtypes(exclude=["number", "bool"])

    for name in features_nom:
        df[name] = df[name].astype("category")

        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    return df

## Impute values


In [6]:
def label_encode_keeping_nulls(df):
    df_encoded = df.copy()
    for col in df.select_dtypes("category").columns:
        indx = df.loc[~df[col].isna(), col].index
        df_encoded[col] = df[col].astype("object")
        df_encoded.loc[indx, col] = df[col].cat.codes[indx]
    return df_encoded


def impute_using_group(df):
    df_groups = df.copy()

    df_groups["Group"] = df_groups.PassengerId.str[:4]
    df_groups = df_groups[df_groups.groupby(["Group"])["Group"].transform("count") > 1]
    df_encoded = label_encode_keeping_nulls(df_groups)

    features_impute = ["Cabin", "Destination", "HomePlanet"]
    for feature in features_impute:
        features_knn = ["Group", feature]
        df_knn_encoded = df_encoded[features_knn]
        knn_imputer = KNNImputer(n_neighbors=1, weights="distance")
        df_knn_imputed = pd.DataFrame(
            knn_imputer.fit_transform(df_knn_encoded),
            columns=df_knn_encoded.columns,
            index=df_knn_encoded.index,
        )
        df_knn_imputed = df_knn_imputed.round(0)

        for name in features_knn[1:]:
            cat_items = dict(enumerate(df[name].cat.categories))
            df_knn_imputed[name] = df_knn_imputed[name].replace(cat_items)
            idx = df_groups[name].isna().index
            df.loc[idx, name] = df_knn_imputed.loc[idx, name]

    return df


def knn_impute(df, df_train=None):
    df_encoded = label_encode_keeping_nulls(df)

    std_scaler = StandardScaler()
    df_scaled = df_encoded.copy()
    df_scaled[df_scaled.columns] = std_scaler.fit_transform(df_encoded)

    knn_imputer = KNNImputer(weights="distance")
    knn_cat_features = ["VIP", "CryoSleep", "HomePlanet", "Destination"]
    if df_train is not None:
        knn_imputer.fit(df_scaled.loc[df_train.index])
    else:
        knn_imputer.fit(df_scaled)
    df_scaled[df_scaled.columns] = knn_imputer.transform(df_scaled)

    df_encoded[df_encoded.columns] = std_scaler.inverse_transform(df_scaled)

    df_encoded = df_encoded.round(0)
    for name in knn_cat_features:
        df_encoded[name] = (
            df_encoded[name].astype("category").cat.add_categories("None")
        )
        new_categories = df[name].astype("category").cat.categories
        df[name] = (
            df_encoded[name].astype("category").cat.rename_categories(new_categories)
        )

    num_features = df.select_dtypes("number").columns
    df[num_features] = df_encoded[num_features]

    return df


def impute(df_train, df_test=None):
    df = df_train.copy()
    if df_test is not None:
        df = pd.concat([df, df_test])
    y = df.pop(target)
    df = impute_using_group(df)
    feature_bfills = ["Cabin"]
    df[feature_bfills] = df[feature_bfills].fillna(method="bfill")
    if df_test is not None:
        df = knn_impute(df, df_train)
    else:
        df = knn_impute(df)
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0).astype("int")

    for name in df.select_dtypes(exclude=["number"]):
        if df[name].isna().sum() != 0:
            df[name] = df[name].fillna("None")
        else:
            df[name] = df[name].cat.remove_categories("None")

    df = pd.concat([df, y], axis=1)
    if df_test is not None:
        return df.loc[df_train.index, :], df.loc[df_test.index, :]

    return df

# 2 - Features engineering


In [7]:
def mathematical_transforms(df):
    X = pd.DataFrame()
    X["RS_FC_SM"] = df[["RoomService", "FoodCourt", "ShoppingMall"]].sum(axis=1)
    X["VR_SPA"] = df[["Spa", "VRDeck"]].sum(axis=1)
    X["Overall_spent"] = X["RS_FC_SM"] + X["VR_SPA"]
    X["Ratio_VR_SPA"] = X["VR_SPA"] / X["Overall_spent"]
    X["Ratio_RFS"] = X["RS_FC_SM"] / X["Overall_spent"]

    X["Sleep_Side"] = df["CryoSleep"].cat.codes * df["Side"].cat.codes

    return X


def interactions(df):
    X = pd.DataFrame()

    dummies_sleep = pd.get_dummies(df.CryoSleep, prefix="CryoSleep_VR_SPA")
    sleep_VR_SPA = dummies_sleep.mul(df.VR_SPA, axis=0)

    dummies_sleep = pd.get_dummies(df.CryoSleep, prefix="CryoSleep_RFM")
    sleep_RFS = dummies_sleep.mul(df.RS_FC_SM, axis=0)

    X = pd.concat([sleep_VR_SPA, sleep_RFS], axis=1)

    return X


def break_down(df):
    X = pd.DataFrame()

    X[["Group", "Id"]] = df.PassengerId.str.split("_", expand=True)
    X["Group"] = X.Group.astype("int")
    X.pop("Id")

    X[["Deck", "Cabin_num", "Side"]] = df.Cabin.str.split("/", expand=True)
    X[["Deck", "Side"]] = X[["Deck", "Side"]].fillna("None")
    X.pop("Cabin_num")

    X[["First Name", "Surname"]] = df.Name.str.split(" ", expand=True).fillna("None")
    X.pop("First Name")
    for name in ["Deck", "Side", "Surname"]:
        X[name] = X[name].astype("category")

        if "None" not in X[name].cat.categories:
            X[name] = X[name].cat.add_categories("None")

    return X


def group_transform(df):
    X = pd.DataFrame()

    X["Diff_VR_SPA"] = df["VR_SPA"] - df.groupby(by=["CryoSleep"])["VR_SPA"].transform(
        "median"
    )
    X["Diff_RS_FC_SM"] = df["RS_FC_SM"] - df.groupby(by=["CryoSleep"])[
        "RS_FC_SM"
    ].transform("median")

    X["in_group"] = (df.groupby(["Group"])["Group"].transform("count") > 1).astype(
        "int"
    )

    return X

In [8]:
def label_encode(df):
    X = df.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    return X


def create_dummies(df, features):
    X = pd.DataFrame()
    for name in features:
        X = pd.concat([X, pd.get_dummies(df[name], prefix=name).astype("int")], axis=1)
    return X

## K-Means clustering


In [9]:
def cluster_labels(df, features, n_clusters=20):
    X = df.copy()
    X_scaled = X.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    X_new = pd.DataFrame()
    X_new["Clusters"] = kmeans.fit_predict(X_scaled)
    return X_new

# Create final feature set


In [10]:
def create_features(df, df_test=None):
    X = df.copy()
    y = X.pop(target)

    # Combine splits if test data is given
    #
    # If we're creating features for test set predictions, we should
    # use all the data we have available. After creating our features,
    # we'll recreate the splits.
    if df_test is not None:
        X_test = df_test.copy()
        if target in X_test.columns:
            X_test.pop(target)
        X = pd.concat([X, X_test])

    # Transformations
    X = X.join(break_down(X))
    X = X.join(mathematical_transforms(X))
    X = X.join(interactions(X))
    X = X.join(group_transform(X))

    X = label_encode(X)

    # Clustering
    cluster_features = [
        "CryoSleep",
        "Side",
        "Deck",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    X = X.join(cluster_labels(X, cluster_features, n_clusters=10))

    # Create dummies
    features = ["Deck", "HomePlanet", "Destination", "Side"]
    X = X.join(create_dummies(X, features))
    X.drop(features, axis=1, inplace=True)

    # Drop categorical features that can be hardly match between train and test sets
    X.drop(["PassengerId", "Name", "Cabin", "Group", "Surname"], axis=1, inplace=True)

    # Reform splits
    if df_test is not None:
        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)

    if df_test is not None:
        return X, X_test
    else:
        return X

# Cross validation


In [11]:
def score_dataset(X, y, model=XGBClassifier(), features=None):
    score = []
    skf = StratifiedKFold(random_state=0, shuffle=True)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, X_test = impute(X_train, X_test)
        X_train, X_test = create_features(X_train, X_test)
        if features is not None:
            X_train, X_test = X_train[features], X_test[features]
        model_local = clone(model)
        model_local.fit(X_train, y_train)
        score.append(model_local.score(X_test, y_test))

    return score

# Feature selection


In [12]:
target = "Transported"

In [13]:
# optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial):
    df_train, _ = load_data(PATH)
    y_train = df_train.loc[:, target].astype("bool")

    xgb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 50, 5000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e1, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e1, log=True),
    )
    xgb = XGBClassifier(**xgb_params)

    return np.mean(score_dataset(df_train, y_train, xgb))


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
xgb_params = study.best_params

[I 2023-08-13 12:42:52,317] A new study created in memory with name: no-name-7103b35b-a896-42c3-9cf7-c8056e2e61eb
[I 2023-08-13 12:43:08,388] Trial 0 finished with value: 0.8043252639805292 and parameters: {'max_depth': 8, 'learning_rate': 0.026006211884542604, 'n_estimators': 611, 'min_child_weight': 7, 'colsample_bytree': 0.5082371597267532, 'subsample': 0.7965492062473178, 'reg_alpha': 0.012132052776496049, 'reg_lambda': 0.0006879316539184618}. Best is trial 0 with value: 0.8043252639805292.
[I 2023-08-13 12:43:55,708] Trial 1 finished with value: 0.8040945188265415 and parameters: {'max_depth': 10, 'learning_rate': 0.003280073588745461, 'n_estimators': 3823, 'min_child_weight': 7, 'colsample_bytree': 0.9510735348535224, 'subsample': 0.36310592535748437, 'reg_alpha': 8.651409573851279, 'reg_lambda': 2.136891976350812}. Best is trial 0 with value: 0.8043252639805292.
[I 2023-08-13 12:44:29,311] Trial 2 finished with value: 0.8057049704504593 and parameters: {'max_depth': 3, 'learning

In [14]:
print("The best parameters")
xgb_params

The best parameters


{'max_depth': 7,
 'learning_rate': 0.0017501003141621718,
 'n_estimators': 1093,
 'min_child_weight': 3,
 'colsample_bytree': 0.6763166567046347,
 'subsample': 0.5608526740013424,
 'reg_alpha': 0.23526952517465713,
 'reg_lambda': 0.01505626133733047}

In [15]:
df_train, _ = load_data(PATH)
y_train = df_train.loc[:, target].astype("bool")

# xgb_params = {
#     "max_depth": 3,
#     "learning_rate": 0.05772117276721077,
#     "n_estimators": 195,
#     "min_child_weight": 7,
#     "colsample_bytree": 0.538747301431327,
#     "subsample": 0.39029351866832696,
#     "reg_alpha": 0.45766359119706124,
#     "reg_lambda": 0.13300367058437046,
# }

xgb = XGBClassifier(**xgb_params)

print(np.mean(score_dataset(df_train, y_train, xgb)))

0.8091558909495887


In [16]:
from sklearn.model_selection import train_test_split

X, _ = load_data(PATH)
y = X.loc[:, target].to_numpy()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=0, test_size=0.25
)

X_train, X_val = impute(X_train, X_val)
X_train, X_val = create_features(X_train, X_val)

# xgb_params = {
#     "max_depth": 3,
#     "learning_rate": 0.009227844570475341,
#     "n_estimators": 3528,
#     "min_child_weight": 4,
#     "colsample_bytree": 0.9085390023706208,
#     "subsample": 0.34580167629578484,
#     "reg_alpha": 0.000349825137558448,
#     "reg_lambda": 0.2982102984707772,
# }

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)


from sklearn.inspection import permutation_importance

r = permutation_importance(
    xgb, X_val, y_val, n_repeats=20, random_state=0, scoring="f1"
)
most_predictive = []
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        most_predictive.append(X_train.columns[i])
        print(
            f"{X_train.columns[i]:<20}"
            f"{r.importances_mean[i]:.3f}"
            f" +/- {r.importances_std[i]:.3f}"
        )

Overall_spent       0.091 +/- 0.007
Ratio_VR_SPA        0.033 +/- 0.004
RoomService         0.028 +/- 0.005
FoodCourt           0.012 +/- 0.002
ShoppingMall        0.008 +/- 0.002
Ratio_RFS           0.006 +/- 0.002
Age                 0.005 +/- 0.002
VR_SPA              0.004 +/- 0.001
Sleep_Side          0.003 +/- 0.001
Deck_2              0.003 +/- 0.001
CryoSleep_VR_SPA_False0.002 +/- 0.001


# Hyperparameter Tuning


In [17]:
# optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial):
    df_train, _ = load_data(PATH)
    y_train = df_train.loc[:, target].astype("bool")

    xgb_params = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 50, 5000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e1, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e1, log=True),
    )
    xgb = XGBClassifier(**xgb_params)

    return np.mean(score_dataset(df_train, y_train, xgb, most_predictive))


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
xgb_params = study.best_params

[I 2023-08-13 12:54:18,331] A new study created in memory with name: no-name-61e29923-7ae9-4f38-ad32-89c3623b18b1
[I 2023-08-13 12:55:23,523] Trial 0 finished with value: 0.7959271859083332 and parameters: {'max_depth': 8, 'learning_rate': 0.0013016903635860314, 'n_estimators': 4826, 'min_child_weight': 3, 'colsample_bytree': 0.6205668277824345, 'subsample': 0.7060096255664352, 'reg_alpha': 0.27197146106142883, 'reg_lambda': 3.072614933681927}. Best is trial 0 with value: 0.7959271859083332.
[I 2023-08-13 12:55:51,901] Trial 1 finished with value: 0.7749899251649858 and parameters: {'max_depth': 7, 'learning_rate': 0.062142740553048574, 'n_estimators': 3396, 'min_child_weight': 1, 'colsample_bytree': 0.2478119072592147, 'subsample': 0.32670636414466203, 'reg_alpha': 0.004426514624991621, 'reg_lambda': 0.12287189215802419}. Best is trial 0 with value: 0.7959271859083332.
[I 2023-08-13 12:56:53,654] Trial 2 finished with value: 0.7945469500546258 and parameters: {'max_depth': 10, 'learni

In [18]:
print("The best parameters")
xgb_params

The best parameters


{'max_depth': 8,
 'learning_rate': 0.0011906903175755818,
 'n_estimators': 4817,
 'min_child_weight': 4,
 'colsample_bytree': 0.7017095558726285,
 'subsample': 0.7320179782809897,
 'reg_alpha': 7.481441915788187,
 'reg_lambda': 9.170063107558233}

In [19]:
df_train, _ = load_data(PATH)
y_train = df_train.loc[:, target].astype("bool")

# xgb_params = {
#     "max_depth": 3,
#     "learning_rate": 0.05772117276721077,
#     "n_estimators": 195,
#     "min_child_weight": 7,
#     "colsample_bytree": 0.538747301431327,
#     "subsample": 0.39029351866832696,
#     "reg_alpha": 0.45766359119706124,
#     "reg_lambda": 0.13300367058437046,
# }

xgb = XGBClassifier(**xgb_params)

print(np.mean(score_dataset(df_train, y_train, xgb, features=most_predictive)))

0.797077470683719


# 3- Train model and create submission


In [20]:
df_train, df_test = load_data(PATH)
target = "Transported"
X_train, X_test = impute(df_train, df_test)
X_train, X_test = create_features(X_train, X_test)
X_train, X_test = X_train[most_predictive], X_test[most_predictive]
y_train = df_train.loc[:, target].to_numpy()

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test).astype("bool")

output = pd.DataFrame(
    {"PassengerId": df_test.PassengerId, "Transported": predictions.squeeze()}
)

output.to_csv("submission.csv", index=False)