In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib
import tensorflow as tf
from tensorflow import keras
import torch
import torch.nn as nn


In [2]:
# Load the raw training split; the path is relative to the notebook's working directory.
df = pd.read_csv('spaceship_titanic/train.csv')  # Replace with your actual file path if different


In [3]:
# Preview the first 30 rows of the raw training frame.
df.head(30)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [4]:
def preprocess_spaceship_data(df):
    """Reduce a raw passenger frame to the flag / one-hot columns used for modelling.

    The input is never modified; all work happens on a copy. Each step is
    skipped when the column it needs is absent.

    Steps, in order:
      1. ``PassengerId`` -> number built from its first run of digits.
      2. ``Cabin`` -> ``Deck``, ``CabinNum``, ``Side`` (only when splitting on
         ``/`` yields exactly three parts); ``Cabin`` itself is dropped.
      3. ``CryoSleep`` / ``VIP`` -> bool, gaps filled with the column mode
         (``False`` when there is no mode).
      4. ``Side`` gaps filled with its mode (``'S'`` when there is no mode).
      5. ``CabinNum`` gaps filled with the per-deck median, then the overall median.
      6. ``Deck`` and ``Side`` one-hot encoded with the first level dropped.
      7. A fixed list of columns is dropped, including ``PassengerId`` and
         ``CabinNum`` computed above.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; any subset of the columns named above may be present.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns (e.g. ``CryoSleep``, ``VIP``,
        ``Transported`` when present, ``Deck_*``, ``Side_*``).
    """
    import pandas as pd

    frame = df.copy()

    # PassengerId: keep only the first run of digits, as a number.
    if 'PassengerId' in frame.columns:
        leading_digits = frame['PassengerId'].astype(str).str.extract(r'(\d+)')[0]
        frame['PassengerId'] = pd.to_numeric(leading_digits, errors='coerce')

    # Cabin: split on '/' into three pieces.
    # The cast to str happens before splitting, so a missing cabin is split as
    # text too (on the pandas version that produced this notebook's outputs it
    # shows up as its own 'nan' deck level).
    if 'Cabin' in frame.columns:
        pieces = frame['Cabin'].astype(str).str.split('/', expand=True)
        if pieces.shape[1] == 3:
            frame['Deck'] = pieces[0]
            frame['CabinNum'] = pd.to_numeric(pieces[1], errors='coerce')
            frame['Side'] = pieces[2]
        frame = frame.drop(columns=['Cabin'])

    # CryoSleep / VIP: normalise to real booleans, imputing with the mode.
    truth_table = {'True': True, 'False': False, True: True, False: False}
    for flag in ('CryoSleep', 'VIP'):
        if flag not in frame.columns:
            continue
        mapped = frame[flag].map(truth_table)
        flag_modes = mapped.mode()
        filler = flag_modes[0] if not flag_modes.empty else False
        frame[flag] = mapped.fillna(filler).astype(bool)

    # Side: impute gaps with the most frequent side.
    if 'Side' in frame.columns:
        side_modes = frame['Side'].mode()
        side_filler = side_modes[0] if not side_modes.empty else 'S'
        frame['Side'] = frame['Side'].fillna(side_filler)

    # CabinNum: per-deck median first, overall median for whatever is left.
    if {'CabinNum', 'Deck'} <= set(frame.columns):
        def _fill_with_own_median(values):
            return values.fillna(values.median())

        frame['CabinNum'] = frame.groupby('Deck')['CabinNum'].transform(_fill_with_own_median)
        overall_median = frame['CabinNum'].median()
        frame['CabinNum'] = frame['CabinNum'].fillna(overall_median)

    # One-hot encode Deck, then Side; the first level of each is the dropped baseline.
    for categorical in ('Deck', 'Side'):
        if categorical in frame.columns:
            frame = pd.get_dummies(frame, columns=[categorical], drop_first=True)

    # Columns that are not used as model features.
    unused_columns = [
        'HomePlanet', 'Destination', 'Name', 'Age',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'PassengerId', 'CabinNum',
    ]
    return frame.drop(columns=unused_columns, errors='ignore')


In [16]:
# Build the modelling frame from the raw training data.
train_df = preprocess_spaceship_data(df)


In [18]:
# Inspect the column dtypes of the preprocessed training frame.
train_df.dtypes

CryoSleep      bool
VIP            bool
Transported    bool
Deck_B         bool
Deck_C         bool
Deck_D         bool
Deck_E         bool
Deck_F         bool
Deck_G         bool
Deck_T         bool
Deck_nan       bool
Side_S         bool
dtype: object

In [20]:
# Separate the label column from the predictor columns.
y = train_df['Transported']
X = train_df.drop(columns='Transported')

# Tally the predictor dtypes as a quick sanity check.
print(X.dtypes.value_counts())


bool    11
Name: count, dtype: int64


In [24]:
# Cast the features and the target to integers before fitting.
X, y = X.astype(int), y.astype(int)


In [31]:
from xgboost import XGBClassifier

# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores it
# and only emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
# The seed is fixed so that refitting gives the same model.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X, y)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [39]:
import pandas as pd
from xgboost import XGBClassifier
import joblib

# Load the raw test split (path is relative to the notebook's working directory,
# same data folder as the training CSV).
test_df_raw = pd.read_csv("spaceship_titanic/test.csv")

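# Test-time preprocessing. NOTE: this redefines (and shadows) the training-time
# function of the same name defined earlier in the notebook.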
def preprocess_spaceship_data(df):
    """Build the test-time feature matrix in the column layout the model was fit on.

    NOTE: this shares its name with the training-time preprocessing function
    defined earlier in the notebook and shadows it once this cell has run.

    Missing values are encoded the same way the training features were:
    a missing ``Cabin`` sets ``Deck_nan`` and its side is imputed with the most
    frequent side of ``df`` (``'S'`` if there is none). Deck ``A`` and side
    ``P`` have no column of their own; they are the dropped baseline levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger frame. Must contain ``PassengerId``, ``CryoSleep``,
        ``VIP`` and ``Cabin``; other columns are ignored. Not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``PassengerId`` with exactly these columns, in this order:
        ``CryoSleep``, ``VIP``, ``Deck_B`` ... ``Deck_T``, ``Deck_nan``, ``Side_S``.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    df = df.copy()

    # Boolean flags: accept real booleans and their string spellings; anything
    # else (including missing values) becomes False. Assigning the result back
    # avoids chained `fillna(..., inplace=True)`, which stops updating the frame
    # once pandas Copy-on-Write is active.
    bool_lookup = {'True': True, 'False': False, True: True, False: False}
    for col in ['CryoSleep', 'VIP']:
        df[col] = df[col].map(lambda value: bool_lookup.get(value, False)).astype(bool)

    # Cabin is "deck/num/side". Missing cabins stay NaN through the split so they
    # can be encoded explicitly below instead of being mistaken for a real level.
    cabin_parts = df['Cabin'].astype(object).str.split('/')

    # Missing deck -> 'nan' level, so the trained `Deck_nan` feature fires.
    deck = cabin_parts.str[0].fillna('nan')

    # Missing side -> most frequent side, as the training-time function does.
    # NOTE(review): this uses the mode of the frame being processed; reuse the
    # training-set mode here if the two ever differ.
    side = cabin_parts.str[-1]
    side_mode = side.mode()
    side = side.fillna(side_mode[0] if not side_mode.empty else 'S')

    # One-hot encoding
    deck_dummies = pd.get_dummies(deck, prefix='Deck')
    side_dummies = pd.get_dummies(side, prefix='Side')

    # Combine features
    features = pd.concat(
        [df[['PassengerId', 'CryoSleep', 'VIP']], deck_dummies, side_dummies],
        axis=1,
    )

    # Levels that do not occur in this frame still need their (all-False) column;
    # the final selection also discards levels the model never saw.
    expected_columns = [
        'CryoSleep', 'VIP',
        'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_nan',
        'Side_S'
    ]
    for col in expected_columns:
        if col not in features.columns:
            features[col] = False

    return features.set_index('PassengerId')[expected_columns]

# Turn the raw test frame into the feature matrix the classifier expects.
X_test = preprocess_spaceship_data(test_df_raw)

# Score the test passengers with the model fitted earlier in the notebook.
# If it is not in memory, load it first with: model = joblib.load('model.pkl')
y_pred = model.predict(X_test)

# The submission expects True/False labels rather than the model's raw output.
y_pred_bool = y_pred.astype(bool)

# One row per passenger: the id (carried on the index of X_test) and its prediction.
submission_columns = {
    'PassengerId': X_test.index,
    'Transported': y_pred_bool,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission file without the positional index.
submission_df.to_csv("submission.csv", index=False)
print("✅ Saved predictions with True/False to 'submission.csv'")


✅ Saved predictions with True/False to 'submission.csv'


In [41]:
import joblib
# Persist the fitted classifier to the working directory so it can be reused without refitting.
joblib.dump(model, "model.pkl")
# And load it with:
# model = joblib.load("model.pkl")
# NOTE: joblib files are pickle-based; only load model files from a trusted source.


['model.pkl']