# Spaceship Titanic Dataset with XGBoost


# Import the packages


In [31]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer


from xgboost import XGBClassifier

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings("ignore")

In [32]:
# PATH = "/kaggle/input/spaceship-titanic/"
PATH = ""

1. Sleep and spent
2. Deck and number of survivers


# 1 - Date preprocessing


In [33]:
def load_data(PATH):
    # Read data
    data_dir = Path(PATH)
    df_train = pd.read_csv(data_dir / "train.csv")
    id_train = df_train.index
    df_test = pd.read_csv(data_dir / "test.csv")
    id_test = df_test.index + len(id_train)
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test], ignore_index=True)
    # Cleaning
    df = clean(df)
    df = encode(df)
    # Reform splits
    df_train = df.loc[id_train, :]
    df_test = df.loc[id_test, :]
    return df_train, df_test

## Clean data


In [34]:
def bool_to_int(x):
    if str(x) == "True":
        return 1
    elif str(x) == "False":
        return 0
    else:
        return x


def clean(df):
    df["Transported"] = df["Transported"].astype("bool")

    return df

## Encode


In [35]:
def encode(df):
    features_nom = df.select_dtypes(exclude=["number", "bool"])

    for name in features_nom:
        df[name] = df[name].astype("category")

        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    return df

## Imput values


In [45]:
def label_encode_keeping_nulls(df):
    df_encoded = df.copy()
    for col in df.select_dtypes("category").columns:
        indx = df.loc[~df[col].isna(), col].index
        categories = df[col].cat.categories
        df_encoded[col] = df[col].astype("object")
        df_encoded.loc[indx, col] = df[col].cat.codes[indx]
    return df_encoded


from sklearn.preprocessing import StandardScaler


def knn_impute(df, df_train=None):
    df_encoded = label_encode_keeping_nulls(df)

    std_scaler = StandardScaler()
    df_scaled = df_encoded.copy()
    df_scaled[df_scaled.columns] = std_scaler.fit_transform(df_encoded)

    knn_imputer = KNNImputer()
    knn_cat_features = ["VIP", "CryoSleep"]
    if df_train is not None:
        knn_imputer.fit(df_scaled.loc[df_train.index])
    else:
        knn_imputer.fit(df_scaled)
    df_scaled[df_scaled.columns] = knn_imputer.transform(df_scaled)

    df_encoded[df_encoded.columns] = std_scaler.inverse_transform(df_scaled)

    df_encoded = df_encoded.round(0)
    for name in knn_cat_features:
        df_encoded[name] = (
            df_encoded[name].astype("category").cat.add_categories("None")
        )
        new_categories = df[name].astype("category").cat.categories
        df[name] = (
            df_encoded[name].astype("category").cat.rename_categories(new_categories)
        )

    num_features = df.select_dtypes("number").columns
    df[num_features] = df_encoded[num_features]

    return df


def impute(df_train, df_test=None):
    df = df_train.copy()
    if df_test is not None:
        df = pd.concat([df, df_test])
    y = df.pop(target)

    feature_bfills = ["Cabin", "HomePlanet", "Destination"]
    df[feature_bfills] = df[feature_bfills].fillna(method="bfill")

    if df_test is not None:
        df = knn_impute(df, df_train)
    else:
        df = knn_impute(df)

    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0).astype("int")

    for name in df.select_dtypes(exclude=["number"]):
        if df[name].isna().sum() != 0:
            df[name] = df[name].fillna("None")
        else:
            df[name] = df[name].cat.remove_categories("None")

    df = pd.concat([df, y], axis=1)
    if df_test is not None:
        return df.loc[df_train.index, :], df.loc[df_test.index, :]

    return df

# 2 - Features engineering


In [37]:
def mathematical_transforms(df):
    X = pd.DataFrame()
    X["RS_FC_SM"] = df[["RoomService", "FoodCourt", "ShoppingMall"]].sum(axis=1)
    df["VR_SPA"] = df[["Spa", "VRDeck"]].sum(axis=1)

    return X


def interactions(df):
    X = pd.DataFrame()

    dummies_sleep = pd.get_dummies(df.CryoSleep, prefix="CryoSleep_VR_SPA")
    sleep_VR_SPA = dummies_sleep.mul(df.VR_SPA, axis=0)

    dummies_sleep = pd.get_dummies(df.CryoSleep, prefix="CryoSleep_RFM")
    sleep_RFS = dummies_sleep.mul(df.RS_FC_SM, axis=0)

    X = pd.concat([sleep_VR_SPA, sleep_RFS], axis=1)

    return X


def counts(df):
    X = pd.DataFrame()
    return X


def break_down(df):
    X = pd.DataFrame()

    X[["Group", "Id"]] = df.PassengerId.str.split("_", expand=True)
    X["Group"] = X.Group.astype("int")
    X.pop("Id")

    X[["Deck", "Cabin_num", "Side"]] = df.Cabin.str.split("/", expand=True)
    X[["Deck", "Side"]] = X[["Deck", "Side"]].fillna("None")
    X["Cabin_num"] = X.Cabin_num.fillna(0).astype("int")

    X[["First Name", "Surname"]] = df.Name.str.split(" ", expand=True).fillna("None")
    X.pop("First Name")
    for name in ["Deck", "Side", "Surname"]:
        X[name] = X[name].astype("category")

        if "None" not in X[name].cat.categories:
            X[name] = X[name].cat.add_categories("None")

    return X


def group_transform(df):
    X = pd.DataFrame()

    X["Diff_VR_SPA"] = df["VR_SPA"] - df.groupby(by=["CryoSleep"])["VR_SPA"].transform(
        "median"
    )
    X["Diff_RS_FC_SM"] = df["RS_FC_SM"] - df.groupby(by=["CryoSleep"])[
        "RS_FC_SM"
    ].transform("median")

    return X

In [38]:
def label_encode(df):
    X = df.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    return X

# k-Means Clustering


In [39]:
def cluster_labels(df, features, n_clusters=20):
    X = df.copy()
    X_scaled = X.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    X_new = pd.DataFrame()
    X_new["Clusters"] = kmeans.fit_predict(X_scaled)
    return X_new


def cluster_distance(df, features, n_clusters=20):
    X = df.copy()
    X_scaled = X.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=20, n_init=50, random_state=0)
    X_cd = kmeans.fit_transform(X_scaled)
    # Label features and join to dataset
    X_cd = pd.DataFrame(X_cd, columns=[f"Centroid_{i}" for i in range(X_cd.shape[1])])
    return X_cd

# Create final feature set


In [40]:
def create_features(df, df_test=None):
    X = df.copy()
    y = X.pop(target)
    #     y = df.loc[:, target]

    # Combine splits if test data is given
    #
    # If we're creating features for test set predictions, we should
    # use all the data we have available. After creating our features,
    # we'll recreate the splits.
    if df_test is not None:
        X_test = df_test.copy()
        if target in X_test.columns:
            X_test.pop(target)
        X = pd.concat([X, X_test])

    # Transformations
    X = X.join(break_down(X))
    X = X.join(mathematical_transforms(X))
    X = X.join(interactions(X))
    #     X = X.join(counts(X))
    X = X.join(group_transform(X))

    X.drop(["PassengerId", "Name", "Cabin", "Surname", "Group"], axis=1, inplace=True)

    # Clustering
    # cluster_features_1 = ["Spa", "VRDeck"]
    # X = X.join(cluster_labels(X, cluster_features_1, n_clusters=5))
    # X = X.join(cluster_distance(X, cluster_features, n_clusters=20))

    # PCA
    # pca_features = X.select_dtypes(exclude="category").columns
    # X = X.join(pca_inspired(X))
    # X = X.join(pca_components(X, pca_features))
    # X = X.join(indicate_outliers(X))

    X = label_encode(X)

    cluster_features = [
        "CryoSleep",
        "Side",
        "Deck",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
        "Deck",
    ]
    X = X.join(cluster_labels(X, cluster_features, n_clusters=10))

    # Reform splits
    if df_test is not None:
        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)

    # Target Encoder
    #     encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    #     X = X.join(encoder.fit_transform(X, y, cols=["MSSubClass"]))
    #     if df_test is not None:
    #         X_test = X_test.join(encoder.transform(X_test))

    if df_test is not None:
        return X, X_test
    else:
        return X

# Cross validation


In [41]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from pprint import pprint


def score_dataset(X, y, model=XGBClassifier()):
    score = []
    skf = StratifiedKFold(random_state=0, shuffle=True)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train, X_test = impute(X_train, X_test)
        X_train, X_test = create_features(X_train, X_test)

        model_local = clone(model)
        model_local.fit(X_train, y_train)
        score.append(model_local.score(X_test, y_test))

    return score

In [42]:
df_train, _ = load_data(PATH)
target = "Transported"
y_train = df_train.loc[:, target].astype("bool")


xgb_params = dict(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=400,
    min_child_weight=2,
    colsample_bytree=0.4,
    subsample=0.6,
    reg_alpha=7,
    reg_lambda=5,
)
xgb = XGBClassifier(**xgb_params)

print(np.mean(score_dataset(df_train, y_train, xgb)))

0.8096161901440653


In [43]:
# df_train, _ = load_data(PATH)
# target = "Transported"
# y_train = df_train.loc[:, target].astype("bool")


# for p in np.arange(0, 10, 1):
# xgb_params = dict(
#     max_depth=8,
#     learning_rate=0.1,
#     n_estimators=400,
#     min_child_weight=2,
#     colsample_bytree=0.4,
#     subsample=0.6,
#     reg_alpha=7,
#     reg_lambda=5,
# )
#     xgb = XGBClassifier(**xgb_params)

#     print(p)
#     print(np.mean(score_dataset(df_train, y_train, xgb)))
#     print('--------------------------------')

# 3- Train model and create submission


In [44]:
df_train, df_test = load_data(PATH)
target = "Transported"
X_train, X_test = impute(df_train, df_test)
X_train, X_test = create_features(X_train, X_test)
y_train = df_train.loc[:, target].to_numpy()

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test).astype("bool")

output = pd.DataFrame(
    {"PassengerId": df_test.PassengerId, "Transported": predictions.squeeze()}
)

output.to_csv("submission.csv", index=False)