# Overview

Here we are using emsemble models to help us predict who out of a list of passengers would be "Transported".

Thank you to Nikhil R for his workbook, [here](https://www.kaggle.com/code/nikhilramlukan/spaceship-titanic/notebook). Which was the inspiration for my approach below.

I took it one step further by doing Hyperparameter Tuning, Kfold cross validation. And having 2 models at the second layer to work with the predictions of the base layer models.

In [1]:
import numpy as np 
import pandas as pd 
import os

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn import preprocessing

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [2]:
def load_data(train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train_res = train['Transported'].astype(int)
    test_id = test['PassengerId']
    return train, test, train_res, test_id

# Preprocess Data

In [3]:
def preprocess_data(train, test):
    mappings = {"Europa": 1, "Earth": 2, "Mars": 3}
    train['HomePlanet'] = train['HomePlanet'].map(mappings).fillna(0)
    test['HomePlanet'] = test['HomePlanet'].map(mappings).fillna(0)
    
    train[['Cabin_1', 'Cabin_2', 'Cabin_3']] = train['Cabin'].str.split('/', expand=True)
    test[['Cabin_1', 'Cabin_2', 'Cabin_3']] = test['Cabin'].str.split('/', expand=True)
    
    cabin_1_mapping = {"B": 1, "F": 2, "A": 3, "G": 4, "E": 5, "D": 6, "C": 7, "T": 8}
    train['Cabin_1'] = train['Cabin_1'].map(cabin_1_mapping).fillna(0)
    test['Cabin_1'] = test['Cabin_1'].map(cabin_1_mapping).fillna(0)
    
    cabin_3_mapping = {"P": 1, "S": 2}
    train['Cabin_3'] = train['Cabin_3'].map(cabin_3_mapping).fillna(0)
    test['Cabin_3'] = test['Cabin_3'].map(cabin_3_mapping).fillna(0)
    
    cols = ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in cols:
        train[col] = train[col].fillna(train[col].median())
        test[col] = test[col].fillna(test[col].median())

    # Convert categorical columns to 'category' dtype
    categorical_columns = ['CryoSleep', 'VIP']
    for col in categorical_columns:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

        # Add 'Missing' category if it's not already present
        if 'Missing' not in train[col].cat.categories:
            train[col] = train[col].cat.add_categories('Missing')
        if 'Missing' not in test[col].cat.categories:
            test[col] = test[col].cat.add_categories('Missing')

        # Now, fill NaN values with 'Missing'
        train[col] = train[col].fillna('Missing')
        test[col] = test[col].fillna('Missing')

    # Advanced Feature Engineering
    train['FamilySize'] = train['PassengerId'].apply(lambda x: int(x.split('_')[0]))
    test['FamilySize'] = test['PassengerId'].apply(lambda x: int(x.split('_')[0]))
    
    train['Spending'] = train[cols].sum(axis=1)
    test['Spending'] = test[cols].sum(axis=1)
    
    train['MissingValues'] = train.isnull().sum(axis=1)
    test['MissingValues'] = test.isnull().sum(axis=1)
    
    train.drop(['Name', 'Transported', 'PassengerId', 'Cabin', 'Cabin_2', 'Destination'], axis=1, inplace=True)
    test.drop(['Name', 'PassengerId', 'Cabin', 'Cabin_2', 'Destination'], axis=1, inplace=True)
    
    return train, test

# Hyperparameter Tuning

In [4]:
def tune_xgb(train, train_res):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }
    xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=42, enable_categorical=True)
    grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(train, train_res)
    print("Best parameters for XGBoost:", grid_search.best_params_)
    return grid_search.best_estimator_

# Train Base Models

In [5]:
def train_base_models(train, train_res):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models = {
        "XGB": tune_xgb(train, train_res),
        "LGB": LGBMClassifier(random_state=42),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
    }
    oof_preds = np.zeros((train.shape[0], len(models)))
    
    # List of categorical columns
    cat_features = ['CryoSleep', 'VIP']

    for fold, (train_idx, val_idx) in enumerate(kf.split(train, train_res)):
        X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]
        y_train, y_val = train_res.iloc[train_idx], train_res.iloc[val_idx]
        
        for i, (name, model) in enumerate(models.items()):
            if name == "CatBoost":
                model.fit(X_train, y_train, cat_features=cat_features)
            else:
                model.fit(X_train, y_train)
            
            oof_preds[val_idx, i] = model.predict_proba(X_val)[:, 1]
    
    return models, oof_preds

# Train Meta Models

In [6]:
def train_meta_models(oof_preds, train_res):
    oof_preds_df = pd.DataFrame(oof_preds, columns=["XGB", "LGB", "CatBoost"])
    meta_models = {
        "LogisticRegression": LogisticRegression(),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
    }
    
    meta_model_scores = {}
    for name, model in meta_models.items():
        scores = cross_val_score(model, oof_preds_df, train_res, cv=5, scoring='accuracy')
        meta_model_scores[name] = scores.mean()
        print(f"{name} CV Accuracy: {scores.mean():.4f}")
    
    best_meta_model = max(meta_model_scores, key=meta_model_scores.get)
    meta_models[best_meta_model].fit(oof_preds_df, train_res)
    
    return meta_models, best_meta_model

# Generate Submission File

In [7]:
def predict(models, meta_models, best_meta_model, test, test_id):
    test_preds = np.column_stack([
        models[name].predict_proba(test)[:, 1] for name in models.keys()
    ])
    final_preds = meta_models[best_meta_model].predict(test_preds)
    
    submission = pd.DataFrame({'PassengerId': test_id, 'Transported': final_preds.astype(bool)})
    submission.to_csv('submission.csv', index=False)
    print("Submission file saved!")

# Main Execution

In [8]:
train, test, train_res, test_id = load_data("/kaggle/input/spaceship-titanic/train.csv", "/kaggle/input/spaceship-titanic/test.csv")
train, test = preprocess_data(train, test)
models, oof_preds = train_base_models(train, train_res)
meta_models, best_meta_model = train_meta_models(oof_preds, train_res)
predict(models, meta_models, best_meta_model, test, test_id)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters for XGBoost: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1893
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

