Spaceship Titanic
=================

https://www.kaggle.com/competitions/spaceship-titanic/code?competitionId=34377

In [1]:
import pandas as pd
import joblib

import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import Spaceship_Titanic_data

(train) Number of rows = 8693 and Number of cols = 14
(test) Number of rows = 4277 and Number of cols = 13


Create and train a model
------------------------

Create a model with the processing pipeline and one classifier.

In [2]:
model = Pipeline(
    [
        ('preproc', Spaceship_Titanic_data.preproc),
        ('drop_target', Spaceship_Titanic_data.drop_target),
        ('cla', LogisticRegression())
    ]
).set_output(transform='pandas')

Create a `GridSearchCV` to try many variants of the model, with different strategies and parameters, and find the combination with the best score.

In [3]:
param_grid = [
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (LogisticRegression(),),
        'cla__C': [0.5, 1.0, 5.0],
        'cla__max_iter': [1000],
        'cla__class_weight': [None, 'balanced']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (KNeighborsClassifier(),),
        'cla__n_neighbors': [3, 5, 7],
        'cla__weights': ['uniform', 'distance']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (MLPClassifier(),),
        'cla__hidden_layer_sizes': [(20,), (25,), (30,)],
        'cla__activation': ['logistic', 'relu'],
        'cla__max_iter': [1500]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (DecisionTreeClassifier(),),
        'cla__criterion': ['gini', 'entropy'],
        'cla__max_depth': [5, 8, 10]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (RandomForestClassifier(),),
        'cla__n_estimators': [50, 100, 150],
        'cla__max_depth': [5, 8, 10]
    }
]   

gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    error_score='raise',
    cv=5,
    verbose=1,  # Set to 10 to print traces and know the % progress (very verbose)
    n_jobs=-1   # -1 uses all CPU cores; you can give a number > 0 to use that number of cores
)

Fit all variants and display the scores.

In [4]:
gs.fit(Spaceship_Titanic_data.train_data,
       Spaceship_Titanic_data.train_data.Transported)

result = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').reset_index(drop=True)

result

Fitting 5 folds for each of 132 candidates, totalling 660 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cla,param_cla__C,param_cla__class_weight,param_cla__max_iter,param_preproc__imputer__num_imputer__strategy,param_preproc__scale_encode__minmax_scaler__feature_range,...,param_cla__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.771168,1.171175,0.054181,0.011710,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,mean,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.876891,0.876379,0.873895,0.884220,0.887344,0.879746,0.005128,1
1,6.194542,1.349712,0.059627,0.015667,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,mean,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.876465,0.879384,0.872711,0.881952,0.887468,0.879596,0.004995,2
2,9.307644,0.938977,0.043057,0.001795,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(0, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.879648,0.873247,0.872034,0.878514,0.893754,0.879439,0.007733,3
3,6.599462,1.868093,0.060165,0.020765,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.870675,0.879425,0.876958,0.879827,0.889973,0.879372,0.006230,4
4,10.219923,1.365393,0.045205,0.001667,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(0, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.878566,0.873286,0.872421,0.878114,0.892982,0.879074,0.007381,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,0.167929,0.058305,0.053930,0.018985,DecisionTreeClassifier(),,,,mean,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.341983,0.703208,0.733690,0.842233,0.641190,0.652461,0.168351,128
128,0.144176,0.013922,0.054543,0.024999,DecisionTreeClassifier(),,,,median,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.385767,0.654293,0.731162,0.837144,0.608891,0.643452,0.150282,129
129,0.153028,0.015440,0.041838,0.003846,DecisionTreeClassifier(),,,,median,"(-1, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.383105,0.651302,0.729035,0.836746,0.609778,0.641993,0.150766,130
130,0.162532,0.026810,0.044388,0.003968,DecisionTreeClassifier(),,,,mean,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.350918,0.646818,0.724119,0.817722,0.601386,0.628193,0.156888,131


Choose the optimal model

In [5]:
optimal_model = gs.best_estimator_

optimal_model

Save the optimal model
----------------------

In [6]:
with open('model.jlb', 'wb') as file:
    joblib.dump(optimal_model, file)

Use the model to predict the test data
----------------------------------

In [7]:
processor = optimal_model.steps[0][1]
drop_target = optimal_model.steps[1][1]
classifier = optimal_model.steps[2][1]

# This call is necessary for the predict to work. For an unknown reason, if we don't do this, the model expects that
# the data has a 'Transported' column, and fails if not.
dummy = classifier.predict(drop_target.fit_transform(processor.fit_transform(Spaceship_Titanic_data.test_data)))

In [8]:
prediction = optimal_model.predict(Spaceship_Titanic_data.test_data)

Generate the output file as required by the Kaggle competition

In [9]:
output = pd.DataFrame({'PassengerId': Spaceship_Titanic_data.test_data['PassengerId'],
                       'Transported': prediction})
output.to_csv('submission.csv', index=False)

Output the result including passenger names

In [10]:
pd.DataFrame({'PassengerId': Spaceship_Titanic_data.test_data['PassengerId'],
              'Name': Spaceship_Titanic_data.test_data['Name'],
              'Transported': prediction})

Unnamed: 0,PassengerId,Name,Transported
0,0013_01,Nelly Carsoning,False
1,0018_01,Lerome Peckers,False
2,0019_01,Sabih Unhearfus,True
3,0021_01,Meratz Caltilter,True
4,0023_01,Brence Harperez,False
...,...,...,...
4272,9266_02,Jeron Peter,True
4273,9269_01,Matty Scheron,False
4274,9271_01,Jayrin Pore,True
4275,9273_01,Kitakan Conale,True
