# Subject: space Titanic prediction 
# Link: https://www.kaggle.com/competitions/spaceship-titanic/data?select=sample_submission.csv
# Author: Pierre Mulliez 
# Date start: 28/11/2022
# Description: classify passengers as transported successfully (TRUE) or not (FALSE)

In [466]:
# import packages 
import pandas as pd 
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [467]:
# import data and print summaries
titanic_df = pd.read_csv("data/train.csv")
raw_Kaggle = pd.read_csv("data/test.csv")
print(titanic_df.head(5))
print('------------------')
print('dataframe shape: {}'.format(titanic_df.shape))
print('------------------')
print(titanic_df.describe())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
--

In [468]:
# baseline model without feature engineering all TRUE 
test = [1 for count in range(0,len(titanic_df))]
no_feature_eng_error = accuracy_score(titanic_df.loc[:,'Transported'].factorize()[0],test)
print('baseline accuracy without feature engineering is {}'.format(round(no_feature_eng_error,2)))

baseline accuracy without feature engineering is 0.5


In [469]:
# null values 
print(titanic_df.isnull().sum())
#titanic_df = titanic_df.dropna()
titanic_df.loc[titanic_df['Destination'].isnull() == True,:].head(15)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
47,0045_02,Mars,True,F/10/P,,19.0,False,0.0,0.0,0.0,0.0,0.0,Mass Chmad,True
128,0138_02,Earth,False,E/5/P,,34.0,False,0.0,22.0,0.0,564.0,207.0,Monah Gambs,False
139,0152_01,Earth,False,F/32/P,,41.0,False,0.0,0.0,0.0,0.0,607.0,Andan Estron,False
347,0382_01,,False,G/64/P,,23.0,False,348.0,0.0,0.0,4.0,368.0,Blanie Floydendley,False
430,0462_01,Earth,True,G/67/S,,50.0,False,0.0,0.0,0.0,0.0,0.0,Ronia Sosanturney,False
468,0504_03,Europa,True,B/19/S,,24.0,False,0.0,0.0,0.0,0.0,0.0,Sterops Unpasine,True
529,0564_03,Europa,False,B/24/P,,28.0,False,0.0,410.0,2.0,1354.0,3695.0,Aldibi Dinsprody,False
540,0571_04,Europa,True,C/21/P,,30.0,False,0.0,0.0,0.0,0.0,,Izark Hednigic,True
547,0576_01,Earth,False,F/107/S,,21.0,False,0.0,,625.0,110.0,0.0,Melice Herry,False
570,0597_01,Earth,True,G/91/P,,23.0,False,0.0,0.0,0.0,0.0,0.0,Arllia Roachoanand,True


In [470]:
# train test split
# drop unessesary collumns
y = titanic_df.loc[:,'Transported']
X = titanic_df.loc[:,(titanic_df.columns != 'Transported')]

In [471]:
#target encoding
def encode(X):
    # cabin work 
    X['cabin_1letter'] = [X.loc[nber,'Cabin'].split('/')[0] if type(X.loc[nber,'Cabin']) == str else next for nber in range(0,len(X['Cabin']))]
    X['cabin_number'] = [int(X.loc[nber,'Cabin'].split('/')[1]) if type(X.loc[nber,'Cabin']) == str else 0 for nber in range(0,len(X['Cabin']))]
    X['cabin_3letter'] = [X.loc[nber,'Cabin'].split('/')[2] if type(X.loc[nber,'Cabin']) == str else next for nber in range(0,len(X['Cabin'])) ]
    X = X.drop('Cabin',axis = 1)
   #id work 
    X['id_1'] = [int(X.loc[nber,'PassengerId'].split('_')[0]) if type(X.loc[nber,'PassengerId']) == str else 0 for nber in range(0,len(X['PassengerId']))]
    X['id_2'] = [int(X.loc[nber,'PassengerId'].split('_')[1]) if type(X.loc[nber,'PassengerId']) == str else 0 for nber in range(0,len(X['PassengerId']))]
    X = X.drop('PassengerId',axis = 1)
    
    #family names 
    X['FamilyName'] = [X.loc[nber,'Name'].split(' ')[0] if type(X.loc[nber,'Name']) == str else next for nber in range(0,len(X['Name']))]
    X = X.drop('Name',axis = 1)
    
    #boolean to int
    for colname in ('CryoSleep','VIP'):
        X[colname] = [int(1) if X.loc[el,colname] == 'True' else 0 for el in range(0,len(X[colname]))]
        
    #factorize the rest
    for colname in X.select_dtypes("object"):
        X[colname], _ = X[colname].factorize()
        
    return X

In [472]:
def kmean_pipe(X):
    for colname in X.select_dtypes("object"):
        X[colname], _ = X[colname].factorize()
    for colname in X.select_dtypes("float"):
        X[colname] = scale(X[colname])
    return X

In [473]:
### null values 

# values to predict as keys, independent varaibles as values 
xy_nulls = {'Age':('RoomService','Spa','FoodCourt'),
            'VIP':('Age','RoomService','Spa'),
             'HomePlanet':('Age','RoomService','VIP'),
            'Destination':('HomePlanet','Age'),
            'CryoSleep':('HomePlanet', 'Destination','Age'),
             'VRDeck':('Age','RoomService','VIP'),
            'ShoppingMall':('RoomService','Spa','FoodCourt','VIP','Age')
               }
ys = list(xy_nulls.keys())

def replace_nulls(data,train_variables,collumn_predict):
    
    # if nulls in train variable mean 
    for train_var in train_variables:
        if  type(data.loc[data[train_var].isnull() == False,train_var][0] ) == str:
            data.loc[data[train_var].isnull() == True,train_var] = data[train_var].mode()
        else:
            data.loc[data[train_var].isnull() == True,train_var] = data[train_var].mean()
    
    # get variable to predict
    data_pred = data.loc[data[collumn_predict].isnull() == True,:]
    data_train = data.loc[data[collumn_predict].isnull() == False,:]
    
    if len(data_pred) == 0:
        #escape if no null values
        return data
    
    X = data_train.loc[:,train_variables]
    y = data_train[collumn_predict]
    X = kmean_pipe(X)
    if  type(data_train.loc[0,collumn_predict]) == str:
        null_prediction = KMeans(n_clusters=6).fit(X, y)
    else:
        null_prediction = LinearRegression().fit(X, y)
    
    data_pred = kmean_pipe(data_pred.loc[:,train_variables].copy())
    #replace null values 
    data.loc[data[collumn_predict].isnull() == True,collumn_predict] = null_prediction.predict(data_pred.loc[:,train_variables])
    return data

In [474]:
# scaling
def scale(X):
    X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
    return X_scaled

In [475]:
def clustering(X,col = ('Age','VIP','cabin_1letter')):
    kmeans = KMeans(n_clusters=6)
    data_pred = kmean_pipe(X.loc[:,col].copy())
    X["Cluster"] = kmeans.fit_predict(data_pred.loc[:,col])
    X["Cluster"] = X["Cluster"].astype("category")
    return X

In [476]:
X = encode(X)
for ys_unit in ys:
    X_cleaned = replace_nulls(X, xy_nulls[ys_unit],ys_unit)
X_enriched = clustering(X_cleaned)

In [477]:
X_train, X_test, y_train, y_test = train_test_split(X_enriched, y, test_size=0.3, random_state=42)

In [478]:
#grid search for optimal parameters
param_grid = { 
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,3,4,6],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv= 4)

In [479]:
CV_rfc.fit(X_train,y_train)
CV_rfc.best_params_

{'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt'}

In [480]:
bestforest = RandomForestClassifier(criterion  = CV_rfc.best_params_['criterion'], 
                                    max_depth = CV_rfc.best_params_['max_depth'], 
                                    max_features = CV_rfc.best_params_['max_features'])
bestforest.fit(X_train,y_train)
predictions = bestforest.predict(X_test)
first_model_error = accuracy_score(y_test,predictions)
print('Random forest accuracy:  {}'.format(round(first_model_error,2)))

Random forest accuracy:  0.79


In [486]:
titanic_df_Kaggle = encode(raw_Kaggle)
for ys_unit in ys:
    titanic_df_Kaggle = replace_nulls(titanic_df_Kaggle, xy_nulls[ys_unit],ys_unit)
titanic_df_Kaggle_clustered = clustering(titanic_df_Kaggle)
titanic_df_Kaggle_submit = bestforest.predict(titanic_df_Kaggle_clustered)

In [487]:
#final result 
submit = pd.DataFrame(raw_Kaggle.loc[:,'PassengerId'])
submit['Transported'] = titanic_df_Kaggle_submit
print(submit.head(15))
submit.to_csv('./data/submit_{}.csv'.format(datetime.today().strftime("%d_%m_%Y")),index=False)

   PassengerId  Transported
0      0013_01         True
1      0018_01        False
2      0019_01         True
3      0021_01        False
4      0023_01         True
5      0027_01         True
6      0029_01         True
7      0032_01         True
8      0032_02         True
9      0033_01         True
10     0037_01        False
11     0040_01        False
12     0040_02        False
13     0042_01         True
14     0046_01        False


In [None]:
# last score 0.77 