# Spaceship Titanic Dataset with XGBoost


# Import the packages


In [38]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score

from xgboost import XGBClassifier

# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so apply whichever spelling
# this installation actually provides (preferring the current one).
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [39]:
# Directory that holds train.csv and test.csv; load_data() wraps it in a
# pathlib.Path and reads both files from it.
# Kaggle location of the competition data:
# PATH = "/kaggle/input/spaceship-titanic/"
# An empty string makes the files resolve relative to the working directory.
PATH = ""

# 1 - Data preprocessing


In [40]:
def load_data(PATH):
    """Read train.csv and test.csv from PATH, preprocess them jointly, and re-split.

    Both files are stacked into a single frame so that clean(), encode() and
    impute() see the same columns and categories for the two splits.

    Returns:
        A ``(df_train, df_test)`` tuple of preprocessed DataFrames. The test
        rows keep the labels they received in the stacked frame, i.e. they
        start at ``len(train)``.
    """
    base_dir = Path(PATH)
    raw_train = pd.read_csv(base_dir / "train.csv")
    raw_test = pd.read_csv(base_dir / "test.csv")

    # Row labels each split will carry once the frames are stacked and
    # re-indexed from zero.
    train_rows = raw_train.index
    test_rows = raw_test.index + len(train_rows)

    # Process both splits together, then cut them apart again.
    combined = pd.concat([raw_train, raw_test], ignore_index=True)
    combined = impute(encode(clean(combined)))

    return combined.loc[train_rows, :], combined.loc[test_rows, :]

## Clean data


In [41]:
def bool_to_int(x):
    """Return 1/0 for values whose string form is "True"/"False".

    Any other value (including missing values) is handed back unchanged.
    """
    return {"True": 1, "False": 0}.get(str(x), x)


def clean(df):
    """Convert the boolean-like columns to nullable 8-bit integers, in place.

    "Transported", "CryoSleep" and "VIP" are mapped element-wise through
    bool_to_int() and then cast to pandas' nullable ``Int8`` dtype, so
    missing entries stay missing. The same (mutated) frame is returned.
    """
    flag_columns = ["Transported", "CryoSleep", "VIP"]
    as_numbers = df[flag_columns].apply(lambda column: column.apply(bool_to_int))
    df[flag_columns] = as_numbers.astype("Int8")
    return df

## Encode


In [42]:
def encode(df):
    """Turn every non-numeric column into a categorical, in place.

    Each converted column is guaranteed to have a "None" category so that
    missing values can later be filled with that label. The same (mutated)
    frame is returned.
    """
    for column in df.select_dtypes(exclude="number").columns:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category
    return df

## Impute values


In [43]:
def impute(df):
    """Fill missing values column by column, in place, and return the frame.

    Integer columns are filled with 0 and cast to ``int``; floating-point
    columns are filled with 0 and cast to ``float``; every non-numeric
    column is filled with the label "None".
    """
    # (dtype selector, dtype to cast to after filling), handled in this order.
    numeric_rules = (("integer", "int"), ("inexact", "float"))
    for selector, cast_to in numeric_rules:
        for column in df.select_dtypes(selector).columns:
            df[column] = df[column].fillna(0).astype(cast_to)

    for column in df.select_dtypes(exclude=["number"]).columns:
        df[column] = df[column].fillna("None")

    return df

# 2 - Feature engineering


In [59]:
def mathematical_transforms(df):
    """Build arithmetic features derived from existing columns.

    Returns a new DataFrame, indexed like ``df``, with one column:
    "Overall_spent", the row-wise sum of the five spending columns.
    """
    spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    total_spent = df[spending_columns].sum(axis=1)

    features = pd.DataFrame()
    features["Overall_spent"] = total_spent
    return features


def interactions(df):
    """Placeholder for interaction features; currently returns an empty DataFrame."""
    return pd.DataFrame()


def counts(df):
    """Placeholder for count-based features; currently returns an empty DataFrame."""
    return pd.DataFrame()


def break_down(df):
    """Split the composite "PassengerId" and "Cabin" strings into separate features.

    Returns a new DataFrame, indexed like ``df``, with the columns:
      * "Group"     - integer part of PassengerId before the underscore
      * "Deck"      - first "/"-separated part of Cabin (categorical)
      * "Cabin_num" - second part of Cabin as an integer, 0 when absent
      * "Side"      - third part of Cabin (categorical)
    Missing "Deck"/"Side" values are replaced by the category "None".
    """
    features = pd.DataFrame()

    # PassengerId is "<group>_<id>"; only the group number is kept.
    id_parts = df.PassengerId.str.split("_", expand=True)
    features[["Group", "Id"]] = id_parts
    features["Group"] = features.Group.astype("int")
    del features["Id"]

    # Cabin is "<deck>/<number>/<side>".
    cabin_parts = df["Cabin"].str.split("/", expand=True)
    features[["Deck", "Cabin_num", "Side"]] = cabin_parts

    categorical_parts = ["Deck", "Side"]
    for column in categorical_parts:
        as_category = features[column].astype("category")
        # The fill label must be a known category before fillna can use it.
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        features[column] = as_category
    features[categorical_parts] = features[categorical_parts].fillna("None")

    features["Cabin_num"] = features.Cabin_num.fillna(0).astype("int")

    return features


def group_transform(df):
    """Placeholder for group-level features; currently returns an empty DataFrame.

    The frame is returned explicitly so the result can be joined onto the
    feature matrix like the other feature builders; without the return the
    function would yield ``None``.
    """
    X = pd.DataFrame()
    return X

In [60]:
def label_encode(df):
    """Return a copy of ``df`` with each categorical column replaced by its integer codes.

    Missing categorical values receive pandas' code -1. The input frame is
    left untouched.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(["category"]).columns
    for column in categorical_columns:
        encoded[column] = encoded[column].cat.codes
    return encoded

# Mutual information


In [61]:
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X, y, index):
    """Score each feature's mutual information with ``y`` on the rows in ``index``.

    Object and categorical columns are replaced by integer codes first;
    columns with an integer dtype are then declared discrete to the
    estimator.

    Returns:
        A Series named "MI Scores", indexed by column name and sorted from
        the highest score to the lowest.
    """
    rows = X.loc[index, :].copy()
    for column in rows.select_dtypes(["object", "category"]):
        codes, _uniques = rows[column].factorize()
        rows[column] = codes

    # After factorizing, every discrete feature has an integer dtype.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in rows.dtypes]

    # NOTE(review): the notebook's target is a 0/1 label, yet the regression
    # estimator is used here; mutual_info_classif may fit better - confirm.
    raw_scores = mutual_info_regression(
        rows, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=rows.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current Matplotlib axes.

    Scores are sorted in ascending order before plotting, and each bar is
    labelled with its entry from the Series index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


def drop_uninformative(df, mi_scores, limit=0.0):
    """Keep only the columns of ``df`` whose score is strictly greater than ``limit``.

    ``mi_scores`` is a Series indexed by the column names of ``df``.
    """
    is_informative = mi_scores > limit
    return df.loc[:, is_informative]

# Create final feature set


In [62]:
def create_features(df, df_test=None):
    """Build the model-ready feature matrix for the training (and optionally test) rows.

    Reads the module-level ``target`` name, which must be defined before the
    call. The target column is removed from the features.

    When ``df_test`` is supplied, the test rows are stacked under the
    training rows so that every feature is computed on all available data,
    and the two splits are separated again at the end.

    Returns:
        ``X`` when ``df_test`` is None, otherwise the tuple ``(X, X_test)``.
    """
    with_test = df_test is not None

    features = df.copy()
    labels = features.pop(target)

    if with_test:
        test_part = df_test.drop(columns=[target])
        features = pd.concat([features, test_part])

    # Engineered features (further builders are currently disabled):
    #     interactions, counts, group transforms
    features = features.join(break_down(features))
    features = features.join(mathematical_transforms(features))

    # The raw identifier is not a feature; drop it before scoring.
    features = features.drop(columns="PassengerId")

    # Mutual information is scored on the training rows only, then used to
    # discard columns that carry no information about the target.
    mi_scores = make_mi_scores(features, labels, df.index)
    features = drop_uninformative(features, mi_scores)

    # Disabled experiments, kept for reference:
    #     cluster_labels / cluster_distance (n_clusters=20)
    #     pca_inspired / pca_components / indicate_outliers
    #     CrossFoldEncoder(MEstimateEncoder, m=1) target encoding

    features = label_encode(features)

    if not with_test:
        return features

    # Undo the stacking: pull out the test rows, then remove them from X.
    test_features = features.loc[df_test.index, :]
    features = features.drop(df_test.index)
    return features, test_features

# Cross validation


In [63]:
def score_dataset(X, y, model=None):
    """Return the per-fold accuracy of ``model`` under 5-fold cross-validation.

    Args:
        X: Feature matrix.
        y: Target values.
        model: Estimator to evaluate. When omitted, a default
            ``XGBClassifier()`` is created for this call, so no estimator
            instance is built at definition time or shared between calls.

    Returns:
        The array of five accuracy scores produced by ``cross_val_score``.
    """
    if model is None:
        model = XGBClassifier()
    return cross_val_score(model, X, y, cv=5, scoring="accuracy")

In [64]:
# Only the preprocessed training split is used in this cell; the test split is discarded.
df_train, _ = load_data(PATH)
# create_features() reads this module-level name, so it has to be set before the call.
target = "Transported"
X_train = create_features(df_train)
y_train = df_train.loc[:, target]
# Preview the first rows of the engineered feature matrix.
X_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,Deck,Cabin_num,Side,Overall_spent
0,1,0,208,2,39.0,0,0.0,0.0,0.0,0.0,0.0,7819,1,1,0,0,0.0
1,0,0,3241,2,24.0,0,109.0,9.0,25.0,549.0,44.0,6688,2,5,0,1,736.0
2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,669,3,0,0,1,10383.0
3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,10688,3,0,0,1,5176.0
4,0,0,3243,2,16.0,0,303.0,70.0,151.0,565.0,2.0,12400,4,5,1,1,1091.0


In [70]:
# Average of the engineered "Overall_spent" feature over the training rows.
X_train.Overall_spent.mean()

1440.8663292304152

In [71]:
# Mean of every feature, computed separately for each value of the VIP flag.
X_train.groupby(by=["VIP"]).mean()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,Deck,Cabin_num,Side,Overall_spent
VIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.704026,0.355074,4901.123146,1.521074,28.02284,214.290794,417.781728,167.887097,294.081822,276.900518,6463.101719,4627.826701,4.415352,594.238404,0.539557,1370.941959
1,1.366834,0.105528,1896.296482,1.266332,37.261307,464.095477,1756.778894,241.502513,753.065327,1210.035176,6456.854271,4870.834171,2.331658,261.61809,0.517588,4425.477387


In [65]:
# Hyperparameters forwarded unchanged to XGBClassifier; the same dict is
# reused for the final model below.
xgb_params = dict(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=8000,
    min_child_weight=7,
    colsample_bytree=1,
    subsample=0.2,
    reg_alpha=5,
    reg_lambda=0,
)

xgb = XGBClassifier(**xgb_params)

# score_dataset() returns one accuracy per fold (cv=5); report their mean.
np.mean(score_dataset(X_train, y_train, xgb))

0.7510722999276729

# 3 - Train model and create submission


In [66]:
# Reload both splits; passing df_test makes create_features() build the
# features on the stacked train + test rows before splitting them again.
df_train, df_test = load_data(PATH)
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, target]

# Fit a fresh model on all training rows with the hyperparameters defined above.
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
# clean() mapped the target to 0/1, so cast the predicted labels to booleans.
predictions = xgb.predict(X_test).astype("bool")

output = pd.DataFrame(
    {"PassengerId": df_test.PassengerId, "Transported": predictions.squeeze()}
)

# Write the two-column submission file without the DataFrame index.
output.to_csv("submission.csv", index=False)