In [None]:
"""
Result: 0.79565

Referred to https://www.kaggle.com/code/alamebarham/spaceship-titanic-super-understandable-edition
https://www.kaggle.com/competitions/spaceship-titanic/discussion/585514 
Observations:
1. PassengerId -> group size
2. total spent == 0 -> CryoSleep can be inferred to be True

*** Only 2 is applied in this version for experimentation.

Gradient Boosting

Missing Value Imputation Strategies:
spending columns → 0
age -> median
categorical → 'Unknown'

New Features:
None

Deleted Features:
Name, Cabin, PassengerId
"""

"\nReferred to https://www.kaggle.com/code/alamebarham/spaceship-titanic-super-understandable-edition\nhttps://www.kaggle.com/competitions/spaceship-titanic/discussion/585514 \nObservations:\n1. PassengerId -> group size\n2. total spent == 0 -> CryptoSleep can be inferred to true\n\n*** Only 2 is applied in this version for experimentation.\n\nLogistic\n\nMissing Value Imputation Strategies:\nspending columns → 0\nage -> median\ncategorical → 'Unknown'\n\nNew Features:\nNone\n\nDeleted Features:\nName\n"

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
import missingno as msno

In [3]:
# Read the raw train / test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# Test passenger ids are kept aside for the submission file.
pid_test = test.loc[:, "PassengerId"]

# Target as 0/1 integers; the feature frame is every other training column.
y = train.loc[:, "Transported"].astype(int)
X_train = train.drop("Transported", axis=1)

# Work on a copy so the raw test frame stays untouched.
X_test = test.copy()

In [4]:
# Columns handled by the imputation step.
spend_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cat_cols = ['HomePlanet','CryoSleep','Destination','Cabin','VIP']


def impute_missing(df, age_median):
    """Return a copy of ``df`` with its missing values filled in.

    Steps, in order:
      1. spending columns -> 0
      2. Age -> ``age_median``
      3. CryoSleep that is missing on a row whose total spend is 0 -> True
      4. any categorical value still missing -> 'Unknown'

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the ``spend_cols``, ``cat_cols`` and ``Age`` columns.
    age_median : float
        Replacement value for missing ages (the training median, so that
        train and test are filled with the same number).

    Returns
    -------
    pd.DataFrame
        A new, imputed frame; the input frame is not modified.
    """
    filled = df.copy()

    # spending columns → 0
    filled[spend_cols] = filled[spend_cols].fillna(0)

    # Age → median
    filled['Age'] = filled['Age'].fillna(age_median)

    # total spent == 0 -> CryoSleep can be inferred to be True.
    # This has to run before the 'Unknown' fill below, while the missing
    # CryoSleep entries are still NaN.
    total_spent = filled[spend_cols].sum(axis=1)
    infer_asleep = filled['CryoSleep'].isna() & (total_spent == 0)
    filled.loc[infer_asleep, 'CryoSleep'] = True

    # categorical → 'Unknown'
    for col in cat_cols:
        filled[col] = filled[col].fillna('Unknown')

    return filled


# The median is taken from the training data only, once, before any filling,
# and is then reused for the test set.
age_median = X_train['Age'].median()

X_train = impute_missing(X_train, age_median)
X_test = impute_missing(X_test, age_median)

In [5]:
# Remove the columns listed in drop_cols from both feature frames.
drop_cols = ['Name', 'Cabin']
X_train, X_test = (
    frame.drop(columns=drop_cols)
    for frame in (X_train, X_test)
)

In [6]:
# Test passenger ids, taken from the raw test frame for the submission file.
pid_test = test.loc[:, "PassengerId"]

# Drop the identifier column, then one-hot encode what remains.
X_train, X_test = (
    pd.get_dummies(frame.drop(columns=["PassengerId"]))
    for frame in (X_train, X_test)
)

# Align train / test columns: the training columns are the reference
# (join='left'); columns missing from test are added and filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [7]:

# Hold out 20% of the rows for validation, with a fixed seed.
# Note: X_train is rebound here to the training portion of the split.
split_parts = train_test_split(X_train, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Model settings gathered in one place so they are easy to tweak.
gb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.8,
    "random_state": 42,
}

# fit() returns the estimator itself, so `gb` is the trained model.
gb = GradientBoostingClassifier(**gb_params).fit(X_train, y_train)

# Accuracy on the held-out validation rows.
val_pred = gb.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)
print("Validation Accuracy:", val_accuracy)


Validation Accuracy: 0.7814836112708453


In [9]:
from pathlib import Path

# Predict on the encoded test features with the fitted model.
test_pred = gb.predict(X_test)

# Submission frame: passenger ids paired with boolean predictions.
submission = pd.DataFrame({
    "PassengerId": pid_test,
    "Transported": test_pred.astype(bool)
})

# to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
submission_path = Path("./submission/v5_gb.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)


In [10]:
# Pair each fitted importance with its column name, largest first.
raw_importances = pd.Series(gb.feature_importances_, index=X_train.columns)
feat_imp = raw_importances.sort_values(ascending=False)

# Show the 15 most important features.
print(feat_imp.iloc[:15])


CryoSleep_True             0.382856
Spa                        0.127988
VRDeck                     0.103887
RoomService                0.099392
FoodCourt                  0.091292
HomePlanet_Earth           0.068189
ShoppingMall               0.050503
Age                        0.039925
HomePlanet_Europa          0.018409
Destination_55 Cancri e    0.005116
Destination_TRAPPIST-1e    0.004205
HomePlanet_Mars            0.002564
HomePlanet_Unknown         0.002131
CryoSleep_False            0.001165
VIP_Unknown                0.000907
dtype: float64
