In [1]:
"""
Referred to https://www.kaggle.com/code/alamebarham/spaceship-titanic-super-understandable-edition
https://www.kaggle.com/competitions/spaceship-titanic/discussion/585514 
Observations:
1. PassengerId -> group size
2. total spent == 0 -> CryoSleep can be inferred to be true

*** Only 2 is applied in this version for experimentation.

Version Info:

Gradient Boosting (GradientBoostingClassifier); Logistic score kept below for comparison

Missing Value Imputation Strategies:
spending columns → 0
age -> median age of the passenger's VIP group (VIP / non-VIP); overall median when VIP is Unknown
categorical → 'Unknown'

New Features:
total spent > 0
age<14
Cabin -> Deck

Deleted Features:
Name
"""

"\nReferred to https://www.kaggle.com/code/alamebarham/spaceship-titanic-super-understandable-edition\nhttps://www.kaggle.com/competitions/spaceship-titanic/discussion/585514 \nObservations:\n1. PassengerId -> group size\n2. total spent == 0 -> CryptoSleep can be inferred to true\n\n*** Only 2 is applied in this version for experimentation.\n\nVersion Info:\n\nLogistic\n\nMissing Value Imputation Strategies:\nspending columns → 0\nage -> median of age of VIP, non-VIP, and Unknown\ncategorical → 'Unknown'\n\nNew Features:\ntotal spent > 0\nage<14\nCabin -> Deck\n\nDeleted Features:\nName\n"

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
## Load datasets

In [4]:
# Read the train and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [5]:
# Separate the features from the target; the label is cast to integers.
X_train = train.drop(columns="Transported")
y_train = train["Transported"].astype(int)
# Work on a copy so the raw test frame (read again later for PassengerId)
# is not modified by the feature engineering below.
X_test = test.copy()

In [6]:
# Columns removed from both splits before any feature engineering.
drop_cols = ['Name']
X_train, X_test = (frame.drop(columns=drop_cols) for frame in (X_train, X_test))

In [7]:
## Missing Value Imputation

In [8]:
# Missing categorical values become an explicit 'Unknown' level in both splits.
cat_cols = ['HomePlanet','CryoSleep','Destination','Cabin','VIP']
for frame in (X_train, X_test):
    for col in cat_cols:
        frame[col] = frame[col].fillna('Unknown')

In [9]:
# A missing amount in any spending column is treated as zero spending.
spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
zero_fill = dict.fromkeys(spend_cols, 0)
X_train = X_train.fillna(zero_fill)
X_test = X_test.fillna(zero_fill)


In [10]:
# Observation 2: a passenger whose CryoSleep value is missing/'Unknown' and
# whose total spending is 0 is assumed to be in CryoSleep.
for frame in (X_train, X_test):
    status = frame['CryoSleep']
    status_missing = status.isna() | (status == 'Unknown')
    spent_nothing = frame[spend_cols].sum(axis=1) == 0
    frame.loc[status_missing & spent_nothing, 'CryoSleep'] = True

In [11]:
# Age imputation. All medians are computed on the training split only and
# reused for the test split. Each row is filled according to its VIP value:
# VIP median, non-VIP median, or the overall median when VIP is 'Unknown'.
train_age = X_train['Age']
train_vip = X_train['VIP']
med_age_all = train_age.median()
med_age_vip = train_age[train_vip == True].median()
med_age_non_vip = train_age[train_vip == False].median()

age_fill_by_vip = (
    (True, med_age_vip),
    (False, med_age_non_vip),
    ('Unknown', med_age_all),
)
for frame in (X_train, X_test):
    for vip_value, fill_age in age_fill_by_vip:
        rows = frame['VIP'] == vip_value
        frame.loc[rows, 'Age'] = frame.loc[rows, 'Age'].fillna(fill_age)

In [12]:
## New Features

In [13]:
# Binary flag: 1 when the summed spending columns are greater than 0, else 0.
spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for frame in (X_train, X_test):
    frame['has_spent'] = frame[spend_cols].sum(axis=1).gt(0).astype(int)

In [14]:
# Remove the raw spending amounts from both splits; only the has_spent flag
# derived from them is kept as a feature. Dropping via spend_cols keeps this
# cell in sync with the column list used to build has_spent, instead of
# repeating each column name in ten separate drop calls.
X_train = X_train.drop(columns=spend_cols)
X_test = X_test.drop(columns=spend_cols)

In [15]:
# Binary flag: 1 for passengers younger than 14, else 0.
for frame in (X_train, X_test):
    frame['is_kid'] = frame['Age'].lt(14).astype(int)

In [16]:
# Deck is the part of Cabin before the first '/'; the full Cabin string is
# dropped once the deck has been extracted.
X_train, X_test = (
    frame.assign(Deck=frame['Cabin'].str.split('/').str[0]).drop(columns='Cabin')
    for frame in (X_train, X_test)
)

In [17]:
# Keep the test PassengerId values for the submission file; the id column
# itself is removed from the features.
pid_test = test["PassengerId"]

# One-hot encode each split after removing the id column.
X_train, X_test = (
    pd.get_dummies(frame.drop(columns=["PassengerId"]))
    for frame in (X_train, X_test)
)

# Give the test frame exactly the training columns: columns absent from test
# are added and filled with 0, and columns only present in test are dropped.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Hold out 20% of the training rows for validation (fixed seed).
# Note: X_train / y_train are rebound to the remaining 80% from here on.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Display the feature columns after one-hot encoding and the train/validation split.
X_train.columns

Index(['Age', 'has_spent', 'is_kid', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'HomePlanet_Unknown', 'CryoSleep_False',
       'CryoSleep_True', 'CryoSleep_Unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_Unknown', 'VIP_False', 'VIP_True', 'VIP_Unknown', 'Deck_A',
       'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T',
       'Deck_Unknown'],
      dtype='object')

In [None]:
# Logistic: 0.7360552041403106

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Gradient boosting hyperparameters, collected in one place.
gb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.8,
    "random_state": 42,  # fixed seed for repeatable runs
}
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_train, y_train)

# Score the fitted model on the held-out validation split.
val_pred = gb.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)
print("Validation Accuracy:", val_accuracy)


Validation Accuracy: 0.7349051178838413


In [22]:
# Predict on the test features and write the submission file.
test_pred = gb.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": pid_test,
    # The training target was cast to int, so predictions are cast back to bool.
    "Transported": test_pred.astype(bool)
})

# to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
submission_path = Path("./submission/v4_gb.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)


In [23]:
# Rank the features by the fitted model's importance scores, highest first.
importances = pd.Series(gb.feature_importances_, index=X_train.columns)
feat_imp = importances.sort_values(ascending=False)

# Show the 15 highest-ranked features.
print(feat_imp.head(15))


has_spent                    0.479136
CryoSleep_True               0.189962
HomePlanet_Earth             0.094855
Age                          0.073761
Deck_E                       0.046577
Deck_C                       0.017782
Deck_G                       0.017544
CryoSleep_False              0.012238
HomePlanet_Mars              0.011305
Destination_55 Cancri e      0.009240
HomePlanet_Europa            0.009188
Destination_TRAPPIST-1e      0.009138
HomePlanet_Unknown           0.005946
Deck_F                       0.004105
Destination_PSO J318.5-22    0.003397
dtype: float64
