### Installing required modules

In [1]:
!pip3 install catboost



## Importing required modules

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

## Importing the train and test dataset and indexing it with PassengerId to sort the data.

In [3]:
train = pd.read_csv('../input/spaceship_titanic/spaceship_train.csv', index_col='PassengerId')
test = pd.read_csv('../input/spaceship_titanic/spaceship_test.csv', index_col='PassengerId')

In [4]:
# Previewing the Train Data and Test Data
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

Train set shape: (8693, 13)
Test set shape: (4277, 12)


In [5]:
train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### Dropping Name column from the train and test datasets

In [6]:
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

### Replacing 'Transported' column values with 0 for False and 1 for True

In [7]:
train['Transported'].replace(True, 1, inplace=True)
train['Transported'].replace(False, 0, inplace=True)

### Separating 'Cabin' column into 3 columns namely, deck, num, side

In [8]:
train[['deck','num', 'side']] = train['Cabin'].str.split('/', expand=True)
test[['deck','num', 'side']] = test['Cabin'].str.split('/', expand=True)

# Dropping the Cabin column
train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

### Creating a column 'Total_Spends' by adding the values of the columns 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'

In [9]:
columns_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train['Total_Spends'] = train[columns_to_sum].sum(axis=1)
test['Total_Spends'] = test[columns_to_sum].sum(axis=1)

In [10]:
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,Total_Spends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0.0
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,F,0,S,736.0
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,10383.0
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S,5176.0
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,F,1,S,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0,A,98,P,8536.0
9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0,G,1499,S,0.0
9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1,G,1500,S,1873.0
9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0,E,608,S,4637.0


In [11]:
test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,Total_Spends
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S,0.0
0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S,2832.0
0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S,0.0
0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S,7418.0
0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S,645.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,1496,S,0.0
9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,,,,1018.0
9271_01,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,D,296,P,0.0
9273_01,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,D,297,P,3203.0


In [12]:
train['Age_Group'] = pd.cut(train.Age, bins=[-1, 5, 13, 18, 60, 100], labels = ['Baby', 'Child', 'Teen', 'Adult', 'Elderly'])
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side,Total_Spends,Age_Group
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0.0,Adult
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,F,0,S,736.0,Adult
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,10383.0,Adult
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S,5176.0,Adult
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,F,1,S,1091.0,Teen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0,A,98,P,8536.0,Adult
9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0,G,1499,S,0.0,Teen
9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1,G,1500,S,1873.0,Adult
9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0,E,608,S,4637.0,Adult


In [13]:
test['Age_Group'] = pd.cut(test.Age, bins=[-1, 5, 13, 18, 60, 100], labels = ['Baby', 'Child', 'Teen', 'Adult', 'Elderly'])
test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,Total_Spends,Age_Group
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S,0.0,Adult
0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S,2832.0,Adult
0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S,0.0,Adult
0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S,7418.0,Adult
0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S,645.0,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,G,1496,S,0.0,Adult
9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,,,,1018.0,Adult
9271_01,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,D,296,P,0.0,
9273_01,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,D,297,P,3203.0,


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   HomePlanet    8492 non-null   object  
 1   CryoSleep     8476 non-null   object  
 2   Destination   8511 non-null   object  
 3   Age           8514 non-null   float64 
 4   VIP           8490 non-null   object  
 5   RoomService   8512 non-null   float64 
 6   FoodCourt     8510 non-null   float64 
 7   ShoppingMall  8485 non-null   float64 
 8   Spa           8510 non-null   float64 
 9   VRDeck        8505 non-null   float64 
 10  Transported   8693 non-null   int64   
 11  deck          8494 non-null   object  
 12  num           8494 non-null   object  
 13  side          8494 non-null   object  
 14  Total_Spends  8693 non-null   float64 
 15  Age_Group     8514 non-null   category
dtypes: category(1), float64(7), int64(1), object(7)
memory usage: 1.1+ MB


In [15]:
# Checking if Train dataset has null values
train.isna().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
deck            199
num             199
side            199
Total_Spends      0
Age_Group       179
dtype: int64

In [16]:
# Checking if Test dataset has null values
test.isna().sum()

HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
deck            100
num             100
side            100
Total_Spends      0
Age_Group        91
dtype: int64

In [17]:
# Assigning all the null values to variable 'null_columns' 
null_columns = train.isnull().sum().sort_values(ascending=False)
null_columns = list(null_columns[null_columns>1].index)
null_columns

['CryoSleep',
 'ShoppingMall',
 'VIP',
 'HomePlanet',
 'deck',
 'num',
 'side',
 'VRDeck',
 'FoodCourt',
 'Spa',
 'Destination',
 'RoomService',
 'Age',
 'Age_Group']

## Encoding and Cleaning the Data

In [18]:
# Assigning Object type data to variable object_columns and float64 type data to numercic_columns
object_columns = [column for column in train.columns if train[column].dtype == 'object' or train[column].dtype == 'category']
numeric_columns = [column for column in train.columns if train[column].dtype == 'float64']

print(f'Object Columns : {object_columns}')
print(f'Numeric Columns : {object_columns}')

Object Columns : ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'num', 'side', 'Age_Group']
Numeric Columns : ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'num', 'side', 'Age_Group']


In [19]:
# Replacing object_columns of train and test data to be of type category 
train[object_columns] = train[object_columns].astype('category')
test[object_columns] = test[object_columns].astype('category')

In [20]:
# Encoding categorical values from objects or strings to float or numeric values for use in Classification
encoder = OrdinalEncoder()

encoded_data = pd.concat([train, test])

encoded_data[object_columns] = encoded_data[object_columns].astype('category')

encoded_data[object_columns] = encoder.fit_transform(encoded_data[object_columns])

del train, test

train = encoded_data.iloc[:8693, :]
test = encoded_data.iloc[8693: , :]

del encoded_data

In [21]:
# Dropping 'Transported' column
test.drop('Transported', inplace = True, axis = 1)

In [22]:
test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,Total_Spends,Age_Group
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0013_01,0.0,1.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1117.0,1.0,0.0,0.0
0018_01,0.0,0.0,2.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,5.0,1228.0,1.0,2832.0,0.0
0019_01,1.0,1.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0
0021_01,1.0,0.0,2.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,2.0,1.0,1.0,7418.0,0.0
0023_01,0.0,0.0,2.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,5.0,1339.0,1.0,645.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,0.0,1.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,553.0,1.0,0.0,0.0
9269_01,0.0,0.0,2.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,,,,1018.0,0.0
9271_01,2.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1113.0,0.0,0.0,
9273_01,1.0,0.0,,,0.0,0.0,2680.0,0.0,0.0,523.0,3.0,1114.0,0.0,3203.0,


In [23]:
# Replacing missing values with mean value
col_transfer = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), null_columns)])
    
train[null_columns] = col_transfer.fit_transform(train[null_columns])
test[null_columns] = col_transfer.fit_transform(test[null_columns])

## Training the models

In [24]:
X = train.copy()
y = X.pop('Transported')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)

In [25]:
# Method to predict accuracy by applying 4-Fold Cross Validation
model_list= {}
def predict_accuracy(model, verbose=None):
    
    if verbose == None:
        model = model()
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        cvs = cross_val_score(model, X, y, cv=4)
        print(f'The accuracy of {str(model)} is {float(accuracy_score(y_test, predict))}')
        print(f'The cross validation of {str(model)} is:{cvs} with mean of {cvs.mean()}')
    else:
        model = model(verbose=verbose)
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        cvs = cross_val_score(model, X, y, cv=4)
        print(f'The accuracy of {str(model)} is {float(accuracy_score(y_test, predict))}')
        print(f'The cross validation of {str(model)} is:{cvs} with mean of {cvs.mean()}')

In [26]:
predict_accuracy(RandomForestClassifier, None)

The accuracy of RandomForestClassifier() is 0.7934682612695492
The cross validation of RandomForestClassifier() is:[0.76770929 0.75885872 0.80533824 0.79521399] with mean of 0.7817800610571068


In [27]:
predict_accuracy(AdaBoostClassifier)

The accuracy of AdaBoostClassifier() is 0.7888684452621895
The cross validation of AdaBoostClassifier() is:[0.74931003 0.78416935 0.79337322 0.80901979] with mean of 0.7839680959471239


In [28]:
predict_accuracy(XGBClassifier)

The accuracy of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...) is 0.8035878564857406
The cross validation of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eva

In [29]:
predict_accuracy(CatBoostClassifier, verbose=False)

The accuracy of <catboost.core.CatBoostClassifier object at 0x7fc7912c11f0> is 0.8104875804967802
The cross validation of <catboost.core.CatBoostClassifier object at 0x7fc7912c11f0> is:[0.7649494  0.78232858 0.81914404 0.79981592] with mean of 0.7915594858028043


##### Backward feature selection with CatBoostClassifier

In [30]:
model_feature_selection = CatBoostClassifier(verbose=False)
seq_feature = SequentialFeatureSelector(model_feature_selection, scoring='accuracy', direction = 'backward', n_features_to_select='auto', tol=None)
seq_feature.fit(X,y)

In [31]:
best_features = list(seq_feature.get_feature_names_out())
best_features

['CryoSleep',
 'Destination',
 'RoomService',
 'Spa',
 'VRDeck',
 'deck',
 'side',
 'Total_Spends']

##### Retraining the model with best_features of X (train set) to obtain the accuracy of the final model

In [32]:
model = CatBoostClassifier(verbose=False, eval_metric='Accuracy')
model.fit(X[best_features], y)
prediction = model.predict(test[best_features])

## Submitting the predictions 

In [33]:
final_predictions = pd.DataFrame()
final_predictions.index = test.index
final_predictions['Transported'] = prediction
final_predictions['Transported'].replace(1, True, inplace=True)
final_predictions['Transported'].replace(0, False, inplace=True)
final_predictions

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [35]:
final_predictions.to_csv('submission-spaceship-titanic-final-1.csv')