# Spaceship Titanic Dataset with XGBoost


# Import the packages


In [88]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier

# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so try the new name first and fall
# back to the legacy one on older installations.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings("ignore")

In [89]:
# Directory that holds train.csv and test.csv; load_data() joins the file
# names onto it. An empty string makes Path(PATH) resolve to the current
# working directory. On Kaggle use the competition input directory instead:
# PATH = "/kaggle/input/spaceship-titanic/"
PATH = ""

# 1 - Data preprocessing


In [195]:
def load_data(PATH):
    """Read ``train.csv`` and ``test.csv`` from ``PATH`` and return both splits.

    The two files are stacked into one frame so that ``clean`` and ``encode``
    see them together, then split apart again. No imputation happens here.

    Parameters
    ----------
    PATH : str or path-like
        Directory containing the two CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` taken from the combined, processed frame.
    """
    root = Path(PATH)
    train_raw = pd.read_csv(root / "train.csv")
    test_raw = pd.read_csv(root / "test.csv")

    # Row labels each split will carry once the frames are stacked under a
    # fresh 0..n-1 index: train rows first, test rows shifted behind them.
    train_rows = train_raw.index
    test_rows = test_raw.index + len(train_rows)

    combined = pd.concat([train_raw, test_raw], ignore_index=True)
    combined = encode(clean(combined))

    return combined.loc[train_rows, :], combined.loc[test_rows, :]

## Clean data


In [319]:
def bool_to_int(x):
    """Map true/false-like values to 1/0 and pass everything else through.

    A value whose ``str()`` form is exactly ``"True"`` becomes ``1``, one whose
    form is exactly ``"False"`` becomes ``0``; any other value is returned
    unchanged.
    """
    return {"True": 1, "False": 0}.get(str(x), x)


def clean(df):
    """Cast the ``Transported`` column to pandas' nullable ``Int8`` dtype.

    The frame is modified in place and the same object is returned. A wider
    conversion that also ran ``CryoSleep`` and ``VIP`` through ``bool_to_int``
    is currently disabled.
    """
    transported = df["Transported"]
    df["Transported"] = transported.astype("Int8")
    return df

## Encode


In [92]:
def encode(df):
    """Turn every non-numeric column into a categorical with a ``"None"`` level.

    Each column not selected as numeric is cast to ``category`` dtype, and the
    category ``"None"`` is appended when it is not already one of the levels.
    The frame is modified in place and returned.
    """
    nominal_columns = df.select_dtypes(exclude="number").columns

    for column in nominal_columns:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category

    return df

## Impute values


In [367]:
# Load both splits from PATH and show summary statistics of the numeric
# columns of the training split.
df_train, df_test = load_data(PATH)

df_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,0.503624
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,0.500016
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,1.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1.0


In [368]:
# Impute missing values in both splits; the returned frames no longer contain
# the target column because impute() removes it.
# NOTE(review): impute() and the module-level `target` it reads are defined
# further down in this file, so this cell only works after those have run.
X_train, X_test = impute(df_train, df_test)
# print(df.isna().sum())
# df.info()
X_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,28.803865,224.002071,455.789026,174.057863,309.931209,303.188082
std,14.387553,660.899537,1597.315777,600.458174,1127.043346,1135.97171
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,55.0,89.0,32.0,70.0,53.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [363]:
from sklearn.impute import SimpleImputer, KNNImputer


def label_encode_keeping_nulls(df):
    """Return a copy of ``df`` whose categorical columns hold category codes.

    Every ``category`` column is converted to ``object`` dtype; its non-missing
    entries are replaced by their integer category code, while missing entries
    stay missing. All other columns are copied unchanged and ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``category`` columns.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    df_encoded = df.copy()
    for col in df.select_dtypes("category").columns:
        # Use a positional boolean mask rather than index labels so the
        # write-back hits exactly the non-missing rows, even when the index
        # contains duplicate labels.
        present = df[col].notna().to_numpy()
        df_encoded[col] = df[col].astype("object")
        df_encoded.loc[present, col] = df[col].cat.codes.to_numpy()[present]
    return df_encoded


def knn_impute(df, df_train=None):
    """Fill missing values using a ``KNNImputer`` run on a label-encoded copy.

    ``df`` is modified in place and returned: the ``VIP`` and ``CryoSleep``
    columns and every numeric column are overwritten with values taken from
    the imputed copy. Other columns of ``df`` are left as they were.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute; must contain ``VIP`` and ``CryoSleep`` columns.
        NOTE(review): presumably those columns already carry a ``"None"``
        category (as added by ``encode``) — confirm against the caller.
    df_train : pandas.DataFrame, optional
        When given, the imputer is fitted only on the rows of the encoded
        frame selected by ``df_train.index``; otherwise on all rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after modification.
    """
    # Categorical columns become integer codes; missing entries stay missing.
    df_encoded = label_encode_keeping_nulls(df)

    knn_imputer = KNNImputer()
    # Categorical columns whose imputed values are written back to ``df``.
    knn_cat_features = ["VIP", "CryoSleep"]
    if df_train is not None:
        knn_imputer.fit(df_encoded.loc[df_train.index])
    else:
        knn_imputer.fit(df_encoded)

    # Impute every column of the encoded frame, then round to whole numbers
    # (presumably so that imputed category codes land on valid codes).
    df_encoded[df_encoded.columns] = knn_imputer.transform(df_encoded)
    df_encoded = df_encoded.round(0)
    for name in knn_cat_features:
        # Give the imputed column an extra "None" level before its levels are
        # renamed to the original column's category labels.
        df_encoded[name] = (
            df_encoded[name].astype("category").cat.add_categories("None")
        )
        new_categories = df[name].astype("category").cat.categories
        # NOTE(review): rename_categories with a list-like needs as many new
        # names as there are existing categories, so this appears to rely on
        # every original level still being present after imputation — confirm.
        df[name] = (
            df_encoded[name].astype("category").cat.rename_categories(new_categories)
        )

    # Copy the imputed (and rounded) numeric columns back into ``df``.
    num_features = df.select_dtypes("number").columns
    df[num_features] = df_encoded[num_features]

    return df


def impute(df_train, df_test=None):
    """Fill missing values in the training split and, optionally, the test split.

    The target column is removed, ``Cabin``/``HomePlanet``/``Destination`` are
    back-filled, the remaining gaps are handled by ``knn_impute``, and anything
    still missing afterwards becomes ``0`` (numeric columns, which are also
    cast to ``int``) or ``"None"`` (all other columns).

    Relies on the module-level name ``target`` (the target column name), which
    must be defined before this function is called.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training split, including the target column.
    df_test : pandas.DataFrame, optional
        Test split, including the target column. When given, both splits are
        processed together and the KNN imputer is fitted on the training rows.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The imputed training frame, or ``(train, test)`` when ``df_test`` is
        given. The target column is absent from the result.
    """
    df = df_train.copy()
    if df_test is not None:
        df = pd.concat([df, df_test])
    df.pop(target)

    feature_bfills = ["Cabin", "HomePlanet", "Destination"]
    # DataFrame.bfill() is the supported spelling of fillna(method="bfill"),
    # which recent pandas releases deprecate.
    df[feature_bfills] = df[feature_bfills].bfill()

    if df_test is not None:
        df = knn_impute(df, df_train)
    else:
        df = knn_impute(df)

    # Whatever is still missing gets a neutral filler value.
    for name in df.select_dtypes("number").columns:
        df[name] = df[name].fillna(0).astype("int")

    for name in df.select_dtypes(exclude=["number"]).columns:
        df[name] = df[name].fillna("None")

    if df_test is not None:
        return df.loc[df_train.index, :], df.loc[df_test.index, :]

    return df

# 2 - Feature engineering


In [102]:
def mathematical_transforms(df):
    """Build numeric features derived from arithmetic on existing columns.

    Returns a new frame, indexed like ``df``, with a single column
    ``Overall_spent``: the row-wise sum of the five spending columns.
    """
    spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    total_spent = df[spending_columns].sum(axis=1)
    return total_spent.to_frame(name="Overall_spent")


def interactions(df):
    """Placeholder for interaction features; returns an empty frame for now."""
    return pd.DataFrame()


def counts(df):
    """Placeholder for count-based features; returns an empty frame for now."""
    return pd.DataFrame()


def break_down(df):
    """Split the composite ``PassengerId``, ``Cabin`` and ``Name`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued ``PassengerId``, ``Cabin`` and ``Name``
        columns.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df`` with the columns ``Group`` (int, the part
        of ``PassengerId`` before ``"_"``), ``Deck``, ``Cabin_num`` (int) and
        ``Side`` (the three ``"/"``-separated parts of ``Cabin``) and
        ``Surname`` (the last space-separated token of ``Name``). Missing
        pieces become ``"None"`` (or ``0`` for ``Cabin_num``); ``Deck``,
        ``Side`` and ``Surname`` are categoricals that include a ``"None"``
        level.
    """
    X = pd.DataFrame()

    # Only the part before the underscore is kept.
    X["Group"] = df.PassengerId.str.split("_", expand=True)[0].astype("int")

    # reindex guarantees three columns even when no row has all three parts.
    cabin_parts = df.Cabin.str.split("/", expand=True).reindex(columns=range(3))
    X["Deck"] = cabin_parts[0].fillna("None")
    # to_numeric turns the string cabin numbers (and gaps) into numbers before
    # the gaps are filled, so no number is ever mixed into a string column.
    X["Cabin_num"] = pd.to_numeric(cabin_parts[1]).fillna(0).astype("int")
    X["Side"] = cabin_parts[2].fillna("None")

    # Split on the last space only: names with more than two tokens would
    # otherwise expand to more than two columns and break the assignment.
    name_parts = df.Name.str.rsplit(" ", n=1, expand=True).reindex(columns=range(2))
    X["Surname"] = name_parts[1].fillna("None")

    for name in ["Deck", "Side", "Surname"]:
        X[name] = X[name].astype("category")

        if "None" not in X[name].cat.categories:
            X[name] = X[name].cat.add_categories("None")

    return X


def group_transform(df):
    """Placeholder for group-level features; returns an empty frame for now.

    Returns the frame (like the other feature stubs) instead of ``None`` so
    the result can be joined onto the feature matrix.
    """
    X = pd.DataFrame()
    return X

In [95]:
def label_encode(df):
    """Return a copy of ``df`` with categorical columns replaced by their codes.

    Every ``category`` column is swapped for its integer category codes
    (missing values become ``-1``); other columns are copied unchanged and
    ``df`` itself is not modified.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(["category"]).columns
    for column in categorical_columns:
        encoded[column] = encoded[column].cat.codes
    return encoded

# Mutual information


In [96]:
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X, y, index):
    """Compute mutual-information scores between the features and ``y``.

    Only the rows of ``X`` selected by ``index`` are used. Object and
    categorical columns are factorized to integer codes first, and every
    integer-typed column is declared discrete to the estimator.

    NOTE(review): ``mutual_info_regression`` is meant for a continuous target,
    while the target used in this file is a 0/1 column; ``mutual_info_classif``
    may be the better fit — confirm before relying on the scores.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name and sorted from
        highest to lowest.
    """
    features = X.loc[index, :].copy()
    for column in features.select_dtypes(["object", "category"]).columns:
        features[column] = features[column].factorize()[0]

    # After factorizing, every discrete feature has an integer dtype.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    named_scores = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return named_scores.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores, smallest first.

    ``scores`` is a pandas Series; its index supplies the bar labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


def drop_uninformative(df, mi_scores, limit=0.0):
    """Keep only the columns of ``df`` whose score is strictly above ``limit``.

    ``mi_scores`` is a Series indexed by column name; the boolean selection is
    matched to the columns of ``df`` by label.
    """
    informative = mi_scores > limit
    return df.loc[:, informative]

# Create final feature set


In [97]:
def create_features(df, df_test=None):
    """Build the model-ready feature matrix for the training (and test) data.

    Steps, in order: remove the target column, stack the test rows under the
    training rows when ``df_test`` is given, add the ``break_down`` and
    ``mathematical_transforms`` features, drop ``Name`` and ``PassengerId``,
    drop features whose mutual-information score (computed on the training
    rows only) is not above zero, and label-encode the categoricals.

    The interaction, count, group, clustering, PCA, outlier and target-encoding
    steps are not applied at the moment.

    Relies on the module-level name ``target`` (the target column name).

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The training features, or ``(train_features, test_features)`` when
        ``df_test`` is given.
    """
    has_test = df_test is not None

    features = df.copy()
    labels = features.pop(target)

    # Features for test-set predictions should use all available data, so the
    # splits are processed together and separated again at the end.
    if has_test:
        test_part = df_test.copy()
        test_part.pop(target)
        features = pd.concat([features, test_part])

    # Transformations
    features = features.join(break_down(features))
    features.pop("Name")
    features = features.join(mathematical_transforms(features))

    # Mutual information, scored on the training rows
    features.pop("PassengerId")
    scores = make_mi_scores(features, labels, df.index)
    features = label_encode(drop_uninformative(features, scores))

    if not has_test:
        return features

    # Reform splits
    test_features = features.loc[df_test.index, :]
    train_features = features.drop(df_test.index)
    return train_features, test_features

# Cross validation


In [134]:
from sklearn.model_selection import StratifiedKFold


def _select_rows(data, positions):
    """Return the rows of ``data`` at the given integer positions."""
    # pandas objects need .iloc for positional access; arrays index directly.
    return data.iloc[positions] if hasattr(data, "iloc") else data[positions]


def score_dataset(X, y, model=None):
    """Score ``model`` on ``X``/``y`` with shuffled stratified k-fold CV.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.Series or array-like
        Target values, aligned row-by-row with ``X``.
    model : estimator, optional
        Object with ``fit`` and ``score`` methods. A fresh ``XGBClassifier()``
        is created when omitted.

    Returns
    -------
    list of float
        One ``model.score`` value per fold.
    """
    if model is None:
        # Built per call: a default instance in the signature would be created
        # once at definition time and shared by every call.
        model = XGBClassifier()

    scores = []
    skf = StratifiedKFold(random_state=0, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        # split() yields row positions, not index labels, so select rows
        # positionally; label-based lookup is only right for a 0..n-1 index.
        X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
        y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    return scores

In [135]:
# Cross-validate the default model on the training features; one score per fold.
score_dataset(X_train, y_train)

[0.8021851638872916,
 0.7906843013225991,
 0.8016101207590569,
 0.8032220943613348,
 0.7940161104718066]

In [228]:
# Reload the training split (the test split is discarded here).
df_train, _ = load_data(PATH)
# Name of the target column; read as a module-level name by impute() and
# create_features().
target = "Transported"
# df_train = impute(df_train)
# X_train = create_features(df_train)
# y_train = df_train.loc[:, target]
# X_train.head()
# Inspect the Cabin column of the training split.
df_train.Cabin

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: category
Categories (9826, object): ['A/0/P', 'A/0/S', 'A/1/P', 'A/1/S', ..., 'T/3/P', 'T/3/S', 'T/4/P', 'None']

In [153]:
# List the distinct values present in the VIP column.
# NOTE(review): the recorded output below shows integers 0/1; presumably it
# comes from an earlier notebook state in which VIP was integer-encoded —
# confirm by re-running.
df_train["VIP"].unique()

array([0, 1])

In [65]:
# Hyper-parameters of the tuned XGBoost classifier; reused for the final fit.
xgb_params = {
    "max_depth": 10,
    "learning_rate": 0.01,
    "n_estimators": 8000,
    "min_child_weight": 7,
    "colsample_bytree": 1,
    "subsample": 0.2,
    "reg_alpha": 5,
    "reg_lambda": 0,
}

xgb = XGBClassifier(**xgb_params)

# Mean cross-validation score of the tuned model
np.mean(score_dataset(X_train, y_train, model=xgb))

0.7510722999276729

# 3 - Train model and create submission


In [66]:
# Rebuild the feature matrices from freshly loaded data, fit the tuned model
# on the whole training split and predict the test split.
# NOTE(review): load_data() does not impute, so the features may still contain
# missing values here — confirm that make_mi_scores() and the model cope.
df_train, df_test = load_data(PATH)
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, target]

xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
# Cast the predicted class labels to booleans for the submission file.
predictions = xgb.predict(X_test).astype("bool")

# One row per test passenger: its id and the predicted Transported flag.
output = pd.DataFrame(
    {"PassengerId": df_test.PassengerId, "Transported": predictions.squeeze()}
)

# Written to the current working directory, without the index column.
output.to_csv("submission.csv", index=False)