# SpaceShip Titanic

## Imports

In [23]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor, RandomForestRegressor

## Functions

In [24]:
time_start = time.time()

def fit_and_predict(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    return model.predict(X_train), model.predict(X_test)

def create_model(X_train, features, model, ordinal=False):
    numeric_features = []
    categorical_features = []
    for feature in features:
        if X_train[feature].dtypes in ['int64', 'float64']:
            numeric_features.append(feature)
        else:
            categorical_features.append(feature)
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    if ordinal:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
            
        ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

def print_table(table: list):

    display_size = 50

    col_size = []
    to_verif_index = []
    tab_size = 0
    for i in range(len(table[0])):
        col_size.append(0)
        for j in range(len(table)):
            col_size[i] = max(len(str(table[j][i])), col_size[i])
        if (col_size[i] + 6) > (display_size/(len(table[0]))):
            to_verif_index.append(i)
        else:
            tab_size += col_size[i]
    display_size -= tab_size + 4 * len(col_size)
    nb = len(to_verif_index)
    for index in to_verif_index:
        col_size[index] = min(col_size[index], int(display_size/nb))
        tab_size += col_size[index]

    buffer = ""
    line = "╞"
    first_line = "╭"
    end_line = "╰"

    for i in range(len(table[0])):
        elt = table[0][i]
        if len(str(elt)) > col_size[i]:
            elt = elt[0:(col_size[i]-1)] + "…"
        buffer += ("│ {:^" + str(col_size[i]) + "} ").format(elt)
        line += ("═{:^" + str(col_size[i]) + "}═╪").format(col_size[i]*'═')
        first_line += ("─{:^" + str(col_size[i]) + "}─┬").format(col_size[i]*'─')
        end_line += ("─{:^" + str(col_size[i]) + "}─┴").format(col_size[i]*'─')
    buffer += "│"
    line = line[:-1] + "╡"
    first_line = first_line[:-1] + "╮"
    end_line = end_line[:-1] + "╯"

    print(first_line)
    print(buffer)
    print(line)
    buffer = ""

    table = table[1:]
    for i in range(len(table)):
        for j in range(len(table[i])):
            elt = str(table[i][j])
            if len(elt) > col_size[j]:
                elt = elt[:(col_size[j]-1)] + "…"
            buffer += ("│ {:" + str(col_size[j]) + "} ").format(str(elt))
        buffer += "│\n"
    if len(buffer) > 0:
        print(buffer[:-1])
    print(end_line)

## Parameters

In [None]:
## Paths
data_folder = 'resources/data/'
output_folder = 'output/'
plots_folder = 'output/plots/'

## Multithreading parameters
num_threads = os.cpu_count()

## Target
target = 'Transported'

## Parameters of output
predict_mode = True # If True, the output is the submission file for Kaggle competition
plot_activated = False
boxenplot_activated = False
save_plots = False # If True, the plots are saved in the plots folder

## Parameters of the data
poly_features_activated = False
outliers_removal = True 
random_split = 5
trunc = 0.8

#Parameters of features selection
dynamic_features = False
max_features = 15

#Parameters of the models
dynamic_hyperparameters = True

## Train data loading

In [26]:
df = pd.read_csv(data_folder + 'train.csv')
df[df.select_dtypes(exclude='number').columns] = df.select_dtypes(exclude='number').astype('category')

print(df.shape)
print(df.dtypes)
print('-------')
features = ['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Name','Transported']
print(df.isna().sum())


(8693, 14)
PassengerId     category
HomePlanet      category
CryoSleep       category
Cabin           category
Destination     category
Age              float64
VIP             category
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Name            category
Transported     category
dtype: object
-------
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


# Missing values

In [27]:
df_filled = df.copy()
    
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        mediane = df[col].median()
        df_filled[col].fillna(mediane, inplace=True)
    else:
        mode = df[col].mode()[0]
        df_filled[col].fillna(mode, inplace=True)

print(df_filled.isna().sum())

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled[col].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled[col].fillna(mediane, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

## Test data loading

In [28]:
if predict_mode:
    # Load the test features and drop the column ID, which is only useful for the submission
    X_test = pd.read_csv(data_folder + 'test.csv')
    X_test_id = X_test['PassengerId']
    X_test = X_test.drop(columns='PassengerId')
    X_train = df_filled.drop(columns=target)
    y_train = df_filled[target]

## Test data splitting

In [29]:
# Load the training dataset and define the features and target
if not predict_mode:
    X_train, X_test, y_train, y_test = train_test_split(df_filled.drop(target, axis=1), df_filled[target], train_size=trunc, test_size=1-trunc, shuffle=True, random_state=random_split)


## Feature selection

In [37]:
'''
if outliers_removal:
    df = pd.concat([X_train, y_train], axis=1)
    df = df[df[target] < 80000]
    X_train = df.drop(target, axis=1)
    y_train = df[target]
'''
 
# Dynamic selection of features
def select_features(df_filled, target):
    model = create_model(df_filled, df_filled.drop(target, axis=1).columns.to_list(), model=RandomForestRegressor(n_estimators=500, n_jobs=num_threads), ordinal=True)
    model.fit(X_train, y_train)
    feature_importances = model.named_steps['model'].feature_importances_
    feature_importances = pd.DataFrame({
        'feature': X_train.columns.tolist(),
        'importance': feature_importances
    })
    feature_importances = feature_importances.sort_values(by='importance', ascending=False)
    selected_features = feature_importances['feature'].head(n=max_features).tolist()
    return selected_features

# 'HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Name','Transported'
if dynamic_features:
    selected_features = select_features(df_filled, target)
else:
    # Best features after lot of tests (boxenplots, etc.)
    selected_features = ['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Name']

## BoxenPlot

In [38]:
if plot_activated and boxenplot_activated:
    num_quantiles = 20
    for column in df_filled.columns.tolist():
        if column in selected_features:
            plt.figure(figsize=(20, 12))
            if df_filled[column].dtype in ['int64', 'float64']:
                categories = pd.cut(df_filled[column], bins=num_quantiles)
                sns.boxenplot(x=categories, y=df_filled[target], color='blue')
            else:
                sns.boxenplot(x=X_train[column], y=df_filled[target], color='blue')
            plt.title(f"Boxenplot of {column}")
            if save_plots:
                plt.savefig(f"{plots_folder}/boxenplot_{column}.png")
            else:
                plt.show()
            plt.clf()

## Preprocessing

In [40]:
def get_params(X_train, y_train, model, grid_params):
    estimator = create_model(X_train, selected_features, RandomizedSearchCV(model, grid_params, cv=5, n_jobs=num_threads, n_iter=200, verbose=1, scoring='neg_mean_squared_error'))
    estimator.fit(X_train, y_train)
    print(estimator.get_params()['model'].best_params_)
    return estimator.get_params()['model'].best_params_


if dynamic_hyperparameters:
    lasso_grid_params = {
        'eps': [0.0001, 0.0005, 0.001, 0.005, 0.01],  # Valeurs de l'hyperparamètre epsilon
        'n_alphas': [10, 100, 200, 300, 400, 500],  # Nombre de valeurs d'alpha à tester
        'cv': [3, 4, 5, 6, 7, 8, 9, 10],  # Nombre de folds pour la validation croisée
    }

    gb_grid_params =  {
        'n_estimators': [150, 200, 250, 300, 350, 400, 500],  # Nombre d'arbres
        'learning_rate': [0.01, 0.05, 0.1, 0.02, 0.03, 0.4, 0.06, 0.07],  # Taux d'apprentissage
        'max_depth': [6, 7, 8,  9, 10, 11, 12, 13, None],  # Profondeur maximale des arbres
        'max_features': ['sqrt'],  # Nombre de features à considérer pour chaque split
        'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17,29],  # Nombre minimum d'échantillons pour effectuer un split
        'min_samples_leaf': [1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20],  # Nombre minimum d'échantillons dans une feuille
        'subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],  # Pourcentage d'échantillons à utiliser pour chaque arbre
        'random_state': [1, 2, 3, 4, 5, 42]
    }
    gb_params = get_params(X_train, y_train, GradientBoostingRegressor(), gb_grid_params)
    lasso_params = get_params(X_train, y_train, LassoCV(), lasso_grid_params)
else:
    lasso_params = {'n_alphas': 300, 'eps': 0.0001, 'cv': 10}
    gb_params = {'subsample': 0.95, 'random_state': 3, 'n_estimators': 150, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 11, 'learning_rate': 0.05}

estimators = [
    ('lasso', create_model(X_train, selected_features, LassoCV(**lasso_params))),
    ('boosting', create_model(X_train, selected_features, GradientBoostingRegressor(**gb_params))),
]
model = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(cv=5), n_jobs=num_threads)

KeyError: 'HomePlanet'

## Fitting and prediction

In [36]:
print(X_train.dtypes)

y_train_pred, y_pred = fit_and_predict(model, X_train, y_train, X_test)

Age                       float64
RoomService               float64
FoodCourt                 float64
ShoppingMall              float64
Spa                       float64
                           ...   
Name_Zosmark Unaasor         bool
Name_Zosmas Ineedeve         bool
Name_Zosmas Mormonized       bool
Name_Zubeneb Flesping        bool
Name_Zubeneb Pasharne        bool
Length: 23735, dtype: object


ValueError: A given column is not a column of the dataframe

## Submission

In [34]:
if predict_mode:
    submission = pd.DataFrame({
        'ID': X_test_id,
        target: y_pred,
    })
    submission.to_csv(output_folder+'submission.csv', index=False)
    print(submission)

NameError: name 'y_pred' is not defined