In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from xgboost import XGBClassifier

# EDA

## Missingness

In [None]:
# Load the training data and summarise, per feature, how much of it is missing.
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

# The identifier and the target are excluded from the missingness analysis.
feature_cols = df.columns.drop(['PassengerId', 'Transported'])

# Fraction of missing rows per feature (a value in [0, 1], not a percentage).
missingness_rates = df[feature_cols].isna().mean()

# One figure with two panels. No extra `plt.figure()` call here: it would open
# a second, empty figure and make `tight_layout` act on that one instead.
fig, axs = plt.subplots(1, 2, figsize=(14, 4))

# Left panel: missingness rate per feature, sorted ascending.
missingness_rates.sort_values().plot(kind='bar', ax=axs[0])
axs[0].set(title='Missing rates', xlabel='Feature', ylabel='Missingness rate (fraction)')

# Right panel: correlation between the 0/1 "is missing" indicators of the features.
missing_indicators = df[feature_cols].isna().astype(int)
missing_corr = missing_indicators.corr()
im = axs[1].imshow(
    missing_corr,
    vmin=-1,
    vmax=1
)

axs[1].set_title("Missingness correlation")
axs[1].set_xticks(range(len(missing_corr)))
axs[1].set_yticks(range(len(missing_corr)))
axs[1].set_xticklabels(missing_corr.columns, rotation=90)
axs[1].set_yticklabels(missing_corr.columns)

fig.colorbar(
    im,
    ax=axs[1],
    fraction=0.046,
    pad=0.04,
    label="Correlation"
)

# Lay out the two-panel figure itself rather than whatever figure is "current".
fig.tight_layout()
plt.show()

- Missingness is uniformly distributed across features
- No strong missingness correlation between features
- No strong evidence of MAR (missing at random) dependencies between features

## Possible MAR between features

### CryoSleep
- Probably related to the missingness of RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

In [None]:
# Spending columns whose missingness is compared between the two CryoSleep groups.
features = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Row selectors for the two groups; rows where CryoSleep is missing match neither.
in_cryo = df["CryoSleep"] == True
not_in_cryo = df["CryoSleep"] == False

# One record per spending column: share of missing values within each group.
rates = [
    {
        "feature": col,
        "CryoSleep=True": df.loc[in_cryo, col].isna().mean(),
        "CryoSleep=False": df.loc[not_in_cryo, col].isna().mean(),
    }
    for col in features
]

rates_df = pd.DataFrame(rates).set_index("feature")

# Grouped bar chart: one pair of bars per spending column.
rates_df.plot(kind="bar", figsize=(8, 4))

plt.title("Missingness of spending features conditioned on CryoSleep")
plt.ylabel("Missingness rate")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

- Weak MAR for most spending features
- Stronger MAR for ShoppingMall

In [None]:
# Distinct ShoppingMall values observed for passengers in cryo sleep
# (row filter and column selection done in a single .loc call).
df.loc[df['CryoSleep'] == True, 'ShoppingMall'].unique()

- If the passengers are in cryo sleep, their spending is exactly 0, so later imputation of 0 would make sense here

### Cabin
- Maybe related to the age of the passenger (e.g. children sharing the same cabin), to specific home planets for which no cabin has been assigned yet, or to passengers idle in cryo sleep

In [None]:
# Show the passengers whose Cabin value is missing.
df[df['Cabin'].isna()]

In [None]:
# Features whose missingness is compared between rows with and without a Cabin.
features = [
    "Age", "CryoSleep", "VIP", "RoomService",
    "FoodCourt", "ShoppingMall", "Spa", "VRDeck"
]

# Boolean row selectors: Cabin missing vs. Cabin present.
cabin_missing = df["Cabin"].isna()
cabin_present = ~cabin_missing

# One record per feature: share of missing values within each Cabin group.
rates = [
    {
        "feature": col,
        "Cabin missing": df.loc[cabin_missing, col].isna().mean(),
        "Cabin present": df.loc[cabin_present, col].isna().mean(),
    }
    for col in features
]

rates_df = pd.DataFrame(rates).set_index("feature")

# Grouped bar chart: one pair of bars per feature.
ax = rates_df.plot(kind="bar", figsize=(9, 4), width=0.8)

ax.set_title("Missingness of features conditioned on Cabin availability")
ax.set_ylabel("Missingness rate")
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()

- Contextual MAR with respect to CryoSleep and VIP: flag these rows with `cabin_missing` and keep the NaN values

# Feature engineering / missing flags

In [None]:
# Preview the first five rows before engineering features.
df.head(5)

In [None]:
def add_engineered_features(frame):
    """Return a copy of ``frame`` with the missing-value flag and derived columns.

    The same transformation is applied to the training and the test data so
    both frames are guaranteed to end up with identical engineered columns.

    Steps:
      * ``cabin_missing``: 1 where ``Cabin`` is NaN, else 0 (contextual MAR flag).
      * ``ShoppingMall``: missing values are set to 0.0 where ``CryoSleep`` is True.
      * ``deck``, ``num``, ``side``: the three '/'-separated parts of ``Cabin``.
      * ``group_id``, ``passenger_num``: the two '_'-separated parts of ``PassengerId``.
      * ``group_size``: number of rows of this frame sharing the same ``group_id``.
      * ``SoloTraveler``: 1 where ``group_size`` equals 1, else 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``Cabin``, ``ShoppingMall``, ``CryoSleep``
        and ``PassengerId``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = frame.copy()

    # Flag first, so the information "Cabin was missing" is kept explicitly.
    out['cabin_missing'] = out['Cabin'].isna().astype(int)

    # Always 0 in case CryoSleep is True, so fill the gap with 0.0.
    cryo_without_value = out['ShoppingMall'].isna() & (out['CryoSleep'] == True)
    out.loc[cryo_without_value, 'ShoppingMall'] = 0.0

    # Cabin can be split into deck, num, side (all stay NaN when Cabin is NaN).
    out[['deck', 'num', 'side']] = out['Cabin'].str.split('/', expand=True)

    # PassengerId can be split into its group and the passenger number in that group.
    out[['group_id', 'passenger_num']] = out['PassengerId'].str.split('_', expand=True)

    # Solo travelers are in non-repeating groups (group size counted within this frame).
    out['group_size'] = (
        out.groupby('group_id')['PassengerId'].transform('size').astype('Int64')
    )
    out['SoloTraveler'] = (out['group_size'] == 1).astype(int)

    return out


# Training data: same engineered columns as before, built by the shared helper.
df = add_engineered_features(df)

# Test data: loaded here and run through exactly the same feature engineering.
df_test = add_engineered_features(
    pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
)

# Predictions

## Train, Validation splits

In [None]:
# Identifier-like columns that should not be fed to the models as categorical
# features:
#   * PassengerId - already decomposed into group_id / passenger_num,
#   * Cabin       - already decomposed into deck / num / side,
#   * group_id    - only needed to derive group_size / SoloTraveler,
#   * Name        - assumed to be a free-text passenger name column; TODO confirm.
# Levels of such columns seen in validation/test data are mostly new, so they
# add thousands of uninformative categories (and one-hot columns) without
# helping generalisation.
ID_LIKE_COLS = ["PassengerId", "Name", "Cabin", "group_id"]

# errors="ignore" keeps this cell working if one of the columns is absent.
X = df.drop(columns=["Transported"]).drop(columns=ID_LIKE_COLS, errors="ignore")
y = df["Transported"].astype(int)

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Sanity check: target rate and size of each part.
print(y_train.mean(), y_val.mean())
print(len(X_train), len(X_val))

## CatBoost

In [None]:
# X / y are not rebuilt here: this cell only works on the X_train / X_val /
# y_train / y_val produced by the train/validation split.

# Columns handed to CatBoost as categorical features.
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

def prep_cb(df_):
    """Return a copy of ``df_`` whose categorical columns contain no NaN.

    Every column listed in ``cat_cols`` is cast to ``object`` and its missing
    values are replaced by the explicit level ``"__MISSING__"``. Columns not
    in ``cat_cols`` are left untouched (their NaN values are kept).
    """
    df_ = df_.copy()
    for c in cat_cols:
        df_[c] = df_[c].astype("object").fillna("__MISSING__")
    return df_

X_train_cb = prep_cb(X_train)
X_val_cb   = prep_cb(X_val)

train_pool = Pool(X_train_cb, y_train, cat_features=cat_cols)
val_pool   = Pool(X_val_cb, y_val, cat_features=cat_cols)

# Candidate values sampled by the randomized search.
param_distributions = {
    "learning_rate": [0.01, 0.02, 0.05, 0.1],
    "depth": [4, 5, 6, 7, 8],
    "l2_leaf_reg": [1, 3, 5, 8, 12, 20],
    "bagging_temperature": [0.0, 0.2, 0.5, 1.0, 2.0],
    "random_strength": [0.0, 0.5, 1.0, 2.0],
}

model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Logloss",
    iterations=5000,
    random_seed=42,
    od_type="Iter",
    od_wait=200,
    logging_level="Silent"
)

# 30 sampled configurations, each scored with stratified 5-fold CV on the
# training pool only (the validation pool is not touched here).
search_result = model.randomized_search(
    param_distributions=param_distributions,
    X=train_pool,
    n_iter=30,
    cv=5,
    partition_random_seed=42,
    stratified=True,
    shuffle=True,
    search_by_train_test_split=False,
    verbose=False
)

best_params = search_result["params"]
print("Best params:", best_params)
print("Best CV logloss:", min(search_result["cv_results"]["test-Logloss-mean"]))

In [None]:
# The overfitting detector (od_type / od_wait) needs a validation set to act
# on; `fit(train_pool)` alone would always train the full 5000 iterations.
# Instead of early-stopping on the hold-out set (which would bias the
# validation scores reported below), the number of boosting rounds is taken
# from the cross-validation curve of the best parameters: the iteration with
# the lowest mean CV logloss.
cv_results = search_result["cv_results"]
cv_losses = list(cv_results["test-Logloss-mean"])
best_idx = min(range(len(cv_losses)), key=cv_losses.__getitem__)
# "iterations" is 0-based, hence the +1 to turn it into a tree count.
best_iterations = int(list(cv_results["iterations"])[best_idx]) + 1
print("Iterations selected from CV:", best_iterations)

final_model = CatBoostClassifier(
    **best_params,
    loss_function="Logloss",
    eval_metric="Logloss",
    iterations=best_iterations,
    random_seed=42,
    verbose=False
)

# Train on the training pool only; the validation pool is used purely for scoring.
final_model.fit(train_pool)
preds = final_model.predict(val_pool)
val_probs = final_model.predict_proba(val_pool)[:, 1]
acc = (preds == y_val.values).mean()
print("VAL Logloss:", log_loss(y_val, val_probs))
print("VAL accuracy:", acc)

## XGBoost

In [None]:
# Split the training columns into categorical (one-hot encoded) and the rest
# (passed through unchanged), keeping the original column order in both lists.
cat_cols = list(
    X_train.select_dtypes(include=["object", "category", "bool"]).columns
)
num_cols = list(X_train.columns[~X_train.columns.isin(cat_cols)])

# Categories unseen during fit are encoded as all-zero rows.
preprocess = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)

xgb = XGBClassifier(
    tree_method="hist",
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42,
)

# Preprocessing lives inside the pipeline, so it is re-fit within every CV fold.
pipe = Pipeline(steps=[("prep", preprocess), ("model", xgb)])

# Search space; the "model__" prefix routes each parameter to the XGBoost step.
param_distributions = dict(
    model__learning_rate=[0.01, 0.02, 0.05, 0.1],
    model__max_depth=[3, 4, 5, 6, 7, 8],
    model__min_child_weight=[1, 2, 5, 10],
    model__subsample=[0.6, 0.8, 1.0],
    model__colsample_bytree=[0.6, 0.8, 1.0],
    model__reg_lambda=[1, 3, 5, 10, 20],
    model__reg_alpha=[0, 0.1, 0.5, 1.0],
    model__n_estimators=[300, 600, 1000, 2000],
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 30 sampled configurations, scored by (negated) logloss on stratified 5-fold CV.
search = RandomizedSearchCV(
    pipe,
    param_distributions,
    n_iter=30,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=cv,
    random_state=42,
)
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
# best_score_ is the negated logloss, so flip the sign for reporting.
print("Best CV logloss:", -search.best_score_)

In [None]:
# `search` was built with the default refit=True, so `best_estimator_` is
# already the best pipeline re-trained on the whole of (X_train, y_train).
# Fitting it a second time on the same data would only repeat that work.
final_model_xgboost = search.best_estimator_

# Score the hold-out set: probability of the positive class, thresholded at 0.5.
val_probs = final_model_xgboost.predict_proba(X_val)[:, 1]
preds = (val_probs >= 0.5).astype(int)

acc = (preds == y_val.values).mean()
print("VAL Logloss:", log_loss(y_val, val_probs))
print("VAL accuracy:", acc)

# Results
- XGBoost: slightly higher validation accuracy but also slightly higher logloss. The competition is evaluated on classification accuracy, so XGBoost is the chosen model
- CatBoost: lower on both metrics (better logloss, but worse accuracy); since validation accuracy is what matters here, it is not selected

In [None]:
# Probability of the positive class for every test passenger, thresholded at
# 0.5 to obtain the boolean Transported prediction.
test_probs = final_model_xgboost.predict_proba(df_test)[:, 1]
test_preds = test_probs >= 0.5

# Two-column submission frame: passenger identifier plus predicted label.
submission = df_test[["PassengerId"]].assign(Transported=test_preds)

submission.to_csv("/kaggle/working/submission.csv", index=False)

In [None]:
# Preview the first five rows of the submission file.
submission.head(5)