Spaceship Titanic
=================

https://www.kaggle.com/competitions/spaceship-titanic/code?competitionId=34377

In [1]:
import pandas as pd
import joblib

import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# For this to work, you need to "File / Save and export notebook as... / Executable Script" the notebook
import Spaceship_Titanic_data

(train) Number of rows = 8693 and Number of cols = 14
(test) Number of rows = 4277 and Number of cols = 13


Create and train a model
------------------------

Create a model with the processing pipeline and one classifier.

In [2]:
model = Pipeline(
    [
        ('preproc', Spaceship_Titanic_data.preproc),
        ('drop_target', Spaceship_Titanic_data.drop_target),
        ('cla', LogisticRegression())
    ]
).set_output(transform='pandas')

Create a `GridSearchCV` to try many variants of the model, with different strategies and parameters, and find the combination with the best score.

In [3]:
param_grid = [
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (LogisticRegression(),),
        'cla__C': [0.5, 1.0, 5.0],
        'cla__max_iter': [1000],
        'cla__class_weight': [None, 'balanced']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (KNeighborsClassifier(),),
        'cla__n_neighbors': [3, 5, 7],
        'cla__weights': ['uniform', 'distance']
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (MLPClassifier(),),
        'cla__hidden_layer_sizes': [(20,), (25,), (30,)],
        'cla__activation': ['logistic', 'relu'],
        'cla__max_iter': [1500]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (DecisionTreeClassifier(),),
        'cla__criterion': ['gini', 'entropy'],
        'cla__max_depth': [5, 8, 10]
    },
    {
        'preproc__imputer__num_imputer__strategy': ['mean', 'median'],
        'preproc__scale_encode__minmax_scaler__feature_range': [(0, 1), (-1, 1)],
        'cla': (RandomForestClassifier(),),
        'cla__n_estimators': [50, 100, 150],
        'cla__max_depth': [5, 8, 10]
    }
]   

gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    error_score='raise',
    cv=5,
    verbose=1,  # Set to 10 to print traces and know the % progress (very verbose)
    n_jobs=-1   # -1 uses all CPU cores; you can give a number > 0 to use that number of cores
)

Fit all variants and display the scores.

In [4]:
gs.fit(Spaceship_Titanic_data.train_data,
       Spaceship_Titanic_data.train_data.Transported)

result = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').reset_index(drop=True)

result

Fitting 5 folds for each of 132 candidates, totalling 660 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cla,param_cla__C,param_cla__class_weight,param_cla__max_iter,param_preproc__imputer__num_imputer__strategy,param_preproc__scale_encode__minmax_scaler__feature_range,...,param_cla__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.306261,0.367482,0.053406,0.021042,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.875204,0.878358,0.878823,0.884467,0.890100,0.881391,0.005280,1
1,7.174269,2.495586,0.042048,0.003015,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.871647,0.879488,0.876928,0.881838,0.890283,0.880037,0.006141,2
2,5.931722,1.075285,0.044203,0.004016,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,mean,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.879403,0.880314,0.871409,0.881478,0.886975,0.879916,0.005001,3
3,6.887770,1.401121,0.065462,0.013582,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,mean,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.872786,0.879585,0.874292,0.883199,0.888205,0.879614,0.005685,4
4,5.398352,1.075818,0.046823,0.008779,"MLPClassifier(hidden_layer_sizes=(25,), max_it...",,,1500,median,"(-1, 1)",...,,"{'cla': MLPClassifier(hidden_layer_sizes=(25,)...",0.875559,0.878640,0.872843,0.882006,0.888344,0.879478,0.005387,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,0.176391,0.073374,0.055217,0.012147,DecisionTreeClassifier(),,,,mean,"(-1, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.343963,0.704079,0.735242,0.844829,0.640109,0.653644,0.168427,128
128,0.179746,0.028448,0.062618,0.016472,DecisionTreeClassifier(),,,,median,"(-1, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.386391,0.652375,0.731697,0.836830,0.604667,0.642392,0.150227,129
129,0.170226,0.029372,0.053767,0.008847,DecisionTreeClassifier(),,,,median,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.382723,0.652560,0.731365,0.837375,0.606221,0.642049,0.151508,130
130,0.144712,0.012184,0.044735,0.004225,DecisionTreeClassifier(),,,,mean,"(0, 1)",...,,"{'cla': DecisionTreeClassifier(), 'cla__criter...",0.350007,0.646195,0.722504,0.819396,0.600013,0.627623,0.157453,131


Choose the optimal model

In [5]:
optimal_model = gs.best_estimator_

optimal_model

Save the optimal model
----------------------

In [6]:
with open('model.jlb', 'wb') as file:
    joblib.dump(optimal_model, file)

Use the model to predict the test data
----------------------------------

In [7]:
processor = optimal_model.steps[0][1]
drop_target = optimal_model.steps[1][1]
classifier = optimal_model.steps[2][1]

# This call is necessary for the predict to work. For an unknown reason, if we don't do this, the model expects that
# the data has a 'Transported' column, and fails if not.
dummy = classifier.predict(drop_target.fit_transform(processor.fit_transform(Spaceship_Titanic_data.test_data)))

In [8]:
prediction = optimal_model.predict(Spaceship_Titanic_data.test_data)

Generate the output file as required by the Kaggle competition

In [9]:
output = pd.DataFrame({'PassengerId': Spaceship_Titanic_data.test_data['PassengerId'],
                       'Transported': prediction})
output.to_csv('submission.csv', index=False)

Output the result including passenger names

In [10]:
pd.DataFrame({'PassengerId': Spaceship_Titanic_data.test_data['PassengerId'],
              'Name': Spaceship_Titanic_data.test_data['Name'],
              'Transported': prediction})

Unnamed: 0,PassengerId,Name,Transported
0,0013_01,Nelly Carsoning,False
1,0018_01,Lerome Peckers,False
2,0019_01,Sabih Unhearfus,True
3,0021_01,Meratz Caltilter,True
4,0023_01,Brence Harperez,False
...,...,...,...
4272,9266_02,Jeron Peter,True
4273,9269_01,Matty Scheron,False
4274,9271_01,Jayrin Pore,True
4275,9273_01,Kitakan Conale,True
