# 🚀 Spaceship Titanic: Optimized Classification Pipeline

## Project Goal
Predict which passengers were transported by the anomaly using various Machine Learning models.

## Methodology
1. **Feature Engineering**: Handling missing values, splitting 'Cabin' feature, creating financial features.
2. **Preprocessing**: Using Scikit-Learn Pipelines for scaling and encoding.
3. **Model Selection**: Comparing 6 different models (Logistic Regression, KNN, SVC, Random Forest, Gradient Boosting, Naive Bayes).
4. **Hyperparameter Tuning**: Using GridSearchCV to find optimal parameters.

## 1. Environment Setup and Library Imports

In [157]:
import pandas as pd
import numpy as np
import time
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that may be
# informative (e.g. convergence or deprecation notices) — consider narrowing it.
warnings.filterwarnings('ignore')
# Lift the column limit so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

## 2. Data Loading

In [158]:
# Load Data
# Both splits are read from CSV files in the local "data/" directory,
# resolved relative to the notebook's working directory.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# --- Feature Engineering Functions ---
def process_data(df):
    """Engineer spend and cabin features without modifying the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five amenity spend columns (``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``) and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``TotalSpend``, ``Deck`` and ``Side`` added and the
        original ``Cabin`` column removed. The input ``df`` is left untouched.

    Notes
    -----
    A missing ``Cabin`` is replaced by the placeholder ``'Z/9999/Z'``, so such
    rows get ``Deck == 'Z'`` and ``Side == 'Z'``. A cabin string with fewer
    than three ``/``-separated parts yields NaN for the missing part instead
    of raising.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Work on a copy: assigning columns directly on `df` would silently alter
    # the frame the caller passed in.
    out = df.copy()

    # 1. Create TotalSpend feature (a missing amount counts as zero spend)
    out['TotalSpend'] = out[spend_cols].fillna(0).sum(axis=1)

    # 2. Split Cabin into Deck/Side: first and third '/'-separated parts
    cabin_parts = out['Cabin'].fillna('Z/9999/Z').str.split('/')
    out['Deck'] = cabin_parts.str[0]
    out['Side'] = cabin_parts.str[2]

    # Drop original Cabin column to prevent high cardinality issues
    return out.drop(columns='Cabin')

# Run the feature engineering on both splits (train first, then test)
train_df, test_df = process_data(train_df), process_data(test_df)

# Separate the label from the predictors; the label and the two
# identifier-like columns are excluded from the feature matrix
non_feature_cols = ["Transported", "PassengerId", "Name"]
y = train_df["Transported"]
X = train_df.drop(columns=non_feature_cols)

# Column groups handed to the numeric / categorical preprocessing branches
cat_attribs = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]
num_attribs = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend"]

print("Data processed successfully.")
X.head()

Data processed successfully.


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,F,S


## 3. Preprocessing Pipeline

In [159]:
# Numeric branch: fill gaps with the column median, then standardise
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen at fit time are ignored at transform time)
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch
preprocess_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Hold out 20% of the rows as a validation split (seeded for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the preprocessing statistics on the training rows only, then apply
# the fitted transformer to the validation rows
X_train_processed = preprocess_pipeline.fit_transform(X_train)
X_val_processed = preprocess_pipeline.transform(X_val)

print(f"Training shape: {X_train_processed.shape}")

Training shape: (6954, 29)


## 4. Model Setup & Grids

In [160]:
# One entry per candidate: (name, estimator, hyperparameter grid).
# Keeping the estimator next to its grid makes it hard for the two to drift apart.
model_specs = [
    (
        "LogisticRegression",
        LogisticRegression(random_state=42, solver='liblinear'),
        {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 1.5], 'max_iter': [100]},
    ),
    (
        "KNN",
        KNeighborsClassifier(),
        {'n_neighbors': [5, 7, 9], 'p': [1, 2]},
    ),
    (
        "SVC",
        SVC(random_state=42, probability=True),
        {'C': [0.5, 1, 1.5], 'kernel': ['rbf']},  # Simplified for speed
    ),
    (
        "RandomForest",
        RandomForestClassifier(random_state=42),
        {'n_estimators': [100, 200], 'max_depth': [8, 10, 12]},
    ),
    (
        "GradientBoosting",
        HistGradientBoostingClassifier(random_state=42),
        {'learning_rate': [0.05, 0.1], 'max_depth': [None, 10], 'max_iter': [100, 200]},
    ),
    (
        "NaiveBayes",
        GaussianNB(),
        {'var_smoothing': [1e-9, 1e-8]},
    ),
]

# Name -> estimator instance
classifiers = {spec_name: estimator for spec_name, estimator, _params in model_specs}

# Name -> hyperparameter grid
grid = {spec_name: params for spec_name, _estimator, params in model_specs}

## 5. Training & Evaluation Loop

In [163]:
# Grid-search every candidate model with 5-fold CV and collect one summary row each.
# NOTE: X_train_processed comes from a preprocessor fitted on all of X_train, so the
# imputation/scaling statistics have already seen each fold's held-out rows; the
# CV scores may therefore be slightly optimistic.
results_table = []

for model_name, model in classifiers.items():
    # Models without a parameter grid are skipped rather than fitted with defaults
    if model_name not in grid:
        continue

    # perf_counter is a monotonic clock intended for measuring durations;
    # time.time() can jump if the system clock is adjusted mid-run
    start_time = time.perf_counter()

    # GridSearch
    gs = GridSearchCV(model, grid[model_name], cv=5, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train_processed, y_train)

    elapsed = time.perf_counter() - start_time

    results_table.append({
        'Model': model_name,
        'Best Score': gs.best_score_,
        'Best Params': gs.best_params_,
        'Time (s)': round(elapsed, 2)
    })
    print(f"{model_name}: {round(gs.best_score_, 4)} | Time: {round(elapsed, 2)}s")

# Display Results (best mean CV accuracy first)
results_df = pd.DataFrame(results_table).sort_values(by='Best Score', ascending=False)
results_df

LogisticRegression: 0.7934 | Time: 4.49s
KNN: 0.7909 | Time: 3.4s
SVC: 0.8033 | Time: 86.65s
RandomForest: 0.8056 | Time: 19.59s
GradientBoosting: 0.8132 | Time: 19.86s
NaiveBayes: 0.7182 | Time: 0.07s


Unnamed: 0,Model,Best Score,Best Params,Time (s)
4,GradientBoosting,0.813202,"{'learning_rate': 0.1, 'max_depth': 10, 'max_i...",19.86
3,RandomForest,0.80558,"{'max_depth': 10, 'n_estimators': 200}",19.59
2,SVC,0.80328,"{'C': 1.5, 'kernel': 'rbf'}",86.65
0,LogisticRegression,0.793358,"{'C': 0.5, 'max_iter': 100, 'penalty': 'l1'}",4.49
1,KNN,0.790912,"{'n_neighbors': 9, 'p': 2}",3.4
5,NaiveBayes,0.718154,{'var_smoothing': 1e-08},0.07


## 6. Final Prediction & Submission

In [171]:
# Retrain the Best Model
# NOTE(review): these hyperparameters are hard-coded; presumably they were copied
# from the grid-search results table — confirm they still match after a re-run.
best_model = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=10,
    max_iter=100,
    random_state=42
)
best_model.fit(X_train_processed, y_train)

# Final check on the hold-out split, which took no part in fitting the model
# or the preprocessing pipeline
val_accuracy = best_model.score(X_val_processed, y_val)
print(f"Validation accuracy: {val_accuracy:.4f}")

# Load and Preprocess Test Data
# Read from the same data/ directory as the loading cell so a fresh
# Restart & Run All needs only one copy of the file.
test_df_raw = pd.read_csv('data/test.csv')
test_df_eng = process_data(test_df_raw)  # Apply Feature Engineering
X_test_processed = preprocess_pipeline.transform(test_df_eng) # Apply Scaling/Encoding

# Predict and Create Submission
final_predictions = best_model.predict(X_test_processed)

submission = pd.DataFrame({
    'PassengerId': test_df_raw['PassengerId'],
    'Transported': final_predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission saved successfully.")
submission.head()

Submission saved successfully.


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [172]:
# # Submission
# !kaggle competitions submit -c spaceship-titanic -f submission.csv -m "Spaceship Titanic"

100% 56.3k/56.3k [00:00<00:00, 94.9kB/s]
Successfully submitted to Spaceship Titanic