In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.preprocessing import (OneHotEncoder, StandardScaler)
from sklearn.model_selection import (GridSearchCV, cross_val_score)
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.cluster import KMeans
from category_encoders import MEstimateEncoder
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier

In [None]:
# Load the training and test sets from the Kaggle input directory.
# full_df holds the training data, test_df the rows to predict.
full_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Analyze data

In [None]:
# Preview the first rows of the training data
full_df.head()

In [None]:
# PassengerId of the test rows is kept aside: the submission file needs it
test_pass_id = test_df['PassengerId'].copy()

# Number of training rows; later used to split the combined frame back
# into its training and test parts
X_max_index = len(full_df)

# Target as 0/1 integers
y = full_df['Transported'].astype(int)

# Training features (target removed) stacked on top of the test features,
# with a fresh 0..n-1 index
df = pd.concat([full_df.drop(columns=['Transported']), test_df],
               axis=0, ignore_index=True)

df.info()

# Unprocessed data correlation

In [None]:
# Correlation of every numeric/boolean column with the target.
# Text columns are excluded explicitly: since pandas 2.0 DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
(full_df.select_dtypes(include=['number', 'bool'])
        .corr()['Transported']
        .sort_values(ascending=False))

# Features' instances

In [None]:
# One histogram per numeric column of the combined train + test frame
df.hist(figsize=(12, 8), bins=30)
plt.show()

# Process data

In [None]:
# The part of PassengerId before '_' becomes the numeric passenger group
df['Pass_group'] = df['PassengerId'].str.split('_').str[0].astype(float)

# The second space-separated token of Name is kept as the last name
df['Lastname'] = df['Name'].str.split(' ').str.get(1)

# Cabin is split on '/' into deck, cabin number and deck side
df[['Deck', 'Cab_num', 'Deck_side']] = df['Cabin'].str.split('/', expand=True)
df['Cab_num'] = df['Cab_num'].astype(float)

# Dealing with the outliers

In [None]:
# Scatter each spending column against Age to spot outliers.
# The 2x3 grid is filled row by row; the sixth panel stays empty.
fig, axes = plt.subplots(2, 3, sharey=True, figsize=(15, 7))
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for ax, col in zip(axes.flat, spend_cols):
    df.plot.scatter(x=col, y='Age', ax=ax)
plt.show()

In [None]:
# Upper cap applied to each spending column (values above it are set to the cap;
# missing values are left untouched)
spend_caps = {
    'RoomService': 9000,
    'FoodCourt': 22000,
    'ShoppingMall': 11000,
    'Spa': 17000,
    'VRDeck': 21000,
}
for col, cap in spend_caps.items():
    df[col] = df[col].clip(upper=cap)

# Run previous cell again to visualize the result

# Impute CryoSleep

In [None]:
# Spending columns, reused by the following cells
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Total spending per amenity for passengers in CryoSleep
# (all zeros means sleepers never spend)
df.loc[df['CryoSleep'].eq(True), amenities].sum()

In [None]:
# Rows whose amenity spending sums to something other than 0
pos_amenities = df[amenities].sum(axis=1).ne(0)

# A passenger with spending and unknown CryoSleep is marked as not in CryoSleep
df.loc[df['CryoSleep'].isna() & pos_amenities, 'CryoSleep'] = False

# Impute amenities

In [None]:
# For passengers in CryoSleep every amenity is set to 0
# (this also fills their missing amenity values)
df.loc[df.CryoSleep.eq(True), amenities] = 0

# Where the sum of the known amenities is 0 the missing ones are set to 0 too
zero_amenities = df[amenities].sum(axis=1).eq(0)
df.loc[zero_amenities, amenities] = 0

# For passengers not in CryoSleep, fill each missing amenity with the median
# of that column's positive values.
# The median must be a scalar taken from the column itself: assigning the
# Series returned by DataFrame.median() aligns its column-name index against
# the row labels and writes NaN, i.e. imputes nothing.
for col in amenities:
    positive_median = df.loc[df[col].gt(0), col].median()
    df.loc[df[col].isna() & df.CryoSleep.eq(False), col] = positive_median

# Impute VIP

In [None]:
# Youngest passenger in each VIP class
df.groupby('VIP')['Age'].min()

Underage passengers don't have VIP status

In [None]:
# VIP counts for every HomePlanet / Destination combination
df.groupby(['HomePlanet', 'Destination'])['VIP'].value_counts()

- There are a lot of passengers from Earth, but none of them are VIPs
- There are no VIPs among the passengers from Mars who are going to 55 Cancri e
- VIPs going to 55 Cancri e are only from Europa

In [None]:
# Passengers younger than 18 with unknown VIP status are set to non-VIP
df.loc[df['VIP'].isna() & df['Age'].lt(18), 'VIP'] = False

In [None]:
# Unknown VIP status is set to False for two groups:
#   - every passenger from Earth
#   - passengers from Mars travelling to 55 Cancri e
from_earth = df['HomePlanet'].eq('Earth')
mars_to_cancri = df['HomePlanet'].eq('Mars') & df['Destination'].eq('55 Cancri e')
df.loc[df['VIP'].isna() & (from_earth | mars_to_cancri), 'VIP'] = False

In [None]:
# Deck counts for every VIP / CryoSleep combination
df.groupby(['VIP', 'CryoSleep'])['Deck'].value_counts()

- There are no VIPs on decks G and T
- VIPs in CryoSleep are found only on decks A to D

In [None]:
# Impute VIP = False for passengers on decks G and T
df.loc[df.VIP.isna() & df.Deck.isin(['G', 'T']), 'VIP'] = False

# The check above shows that VIPs in CryoSleep are found only on decks A to D.
# What follows from it is that a CryoSleep passenger on any other known deck
# is not a VIP, so those rows get VIP = False. Nothing can be concluded for
# passengers who are awake or whose deck is unknown, so they are left missing
# instead of being set to VIP = True.
outside_a_to_d = df.Deck.notna() & ~df.Deck.isin(['A', 'B', 'C', 'D'])
df.loc[df.VIP.isna() & df.CryoSleep.eq(True) & outside_a_to_d, 'VIP'] = False

# Impute HomePlanet

In [None]:
# VIPs travelling to 55 Cancri e were only seen coming from Europa,
# so that planet is used when their HomePlanet is missing
vip_to_cancri = df['VIP'].eq(True) & df['Destination'].eq('55 Cancri e')
df.loc[df['HomePlanet'].isna() & vip_to_cancri, 'HomePlanet'] = 'Europa'

In [None]:
# Number of passenger groups whose members come from more than one HomePlanet
df.groupby('Pass_group')['HomePlanet'].nunique().gt(1).sum()

In [None]:
# Pass_group -> HomePlanet lookup built from rows where both are known
# (when a group appears several times, its last row wins)
present_values = df.Pass_group.notna() & df.HomePlanet.notna()
group_home_map = dict(zip(df.loc[present_values, 'Pass_group'],
                          df.loc[present_values, 'HomePlanet']))

# Missing HomePlanet values are filled from the lookup
df.loc[df.HomePlanet.isna(), 'HomePlanet'] = df.Pass_group.map(group_home_map)

In [None]:
# Deck counts for each HomePlanet
df.groupby('HomePlanet')['Deck'].value_counts()

- Passengers on decks T, A, B and C are only from Europa
- Passengers on deck G are only from Earth


In [None]:
# Decks that were observed with a single HomePlanet in the distribution above
deck_home = {
    'Europa': ['T', 'A', 'B', 'C'],
    'Earth': ['G'],
}

# Fill the missing HomePlanet of passengers on those decks
for planet, decks in deck_home.items():
    df.loc[df.HomePlanet.isna() & df.Deck.isin(decks), 'HomePlanet'] = planet

In [None]:
# Number of last names that appear with more than one HomePlanet
df.groupby('Lastname')['HomePlanet'].nunique().gt(1).sum()

In [None]:
# Lastname -> HomePlanet lookup built from rows where both are known
# (when a last name appears several times, its last row wins)
present_values = df.Lastname.notna() & df.HomePlanet.notna()
lastname_home_map = dict(zip(df.loc[present_values, 'Lastname'],
                             df.loc[present_values, 'HomePlanet']))

# Missing HomePlanet values are filled from the lookup
df.loc[df.HomePlanet.isna(), 'HomePlanet'] = df.Lastname.map(lastname_home_map)

# Impute Age

In [None]:
# VIP passengers with unknown Age receive the median Age of all VIPs
is_vip = df['VIP'].eq(True)
vip_median_age = df.loc[is_vip, 'Age'].median()
df.loc[is_vip & df['Age'].isna(), 'Age'] = vip_median_age

In [None]:
# Youngest passenger among those who spent on amenities
df['Age'][pos_amenities].min()

In [None]:
# Passengers with amenity expenses: impute the median Age of that group
df.loc[df.Age.isna() & pos_amenities, 'Age'] = df.loc[pos_amenities, 'Age'].median()

# Passengers with zero expenses who are not in CryoSleep: impute the median Age
# of that same group. zero_amenities alone also contains every CryoSleep
# passenger (their amenities were set to 0), so it is restricted to awake
# passengers before taking the median.
awake_zero_spenders = zero_amenities & df.CryoSleep.eq(False)
df.loc[df.Age.isna() & df.CryoSleep.eq(False), 'Age'] = (
    df.loc[awake_zero_spenders, 'Age'].median())

# Passengers in CryoSleep: impute the overall median Age
df.loc[df.Age.isna() & df.CryoSleep.eq(True), 'Age'] = df.Age.median()

# Create Total_expenses

In [None]:
# Total spent by each passenger across all amenities
df['Total_expenses'] = df[amenities].sum(axis='columns')

# Create Group_members

In [None]:
# Size of every passenger group, attached to each of its members
Group_members = dict(df['Pass_group'].value_counts())
df['Group_members'] = df['Pass_group'].map(Group_members)

# Create Cabin_members

In [None]:
# Count the number of passengers sharing each cabin
Cabin_members = df.Cabin.value_counts().to_dict()
df['Cabin_members'] = df.Cabin.map(Cabin_members)

# Passengers with an unknown cabin receive the mean cabin occupancy.
# The result is assigned back explicitly: fillna(inplace=True) on a column
# accessed from the frame is chained assignment and does not reliably
# update df (deprecated, and a no-op under Copy-on-Write).
df['Cabin_members'] = df['Cabin_members'].fillna(df['Cabin_members'].mean())

# Create Deck_transp_ratio

In [None]:
# Split the combined frame back and rebuild a labelled training frame for analysis
X = df[:X_max_index]
test_df = df[X_max_index:]
full_df = pd.concat([X, y], axis=1).copy()

# Total passengers by Deck
deck_total_pass = full_df.groupby('Deck').Deck.count()

# Total Transported by Deck
deck_total_transported = full_df.groupby('Deck').Transported.sum()

# Share of transported passengers for each Deck.
# NOTE(review): the ratio is computed from the target of all training rows and
# then used as a feature for those same rows; consider an out-of-fold encoding.
Deck_transp_ratio = (deck_total_transported / deck_total_pass).to_dict()

# Map the ratio onto every row; unknown decks receive the mean ratio.
# Assigned back explicitly instead of fillna(inplace=True) on a column
# accessor, which is chained assignment and a no-op under Copy-on-Write.
df['Deck_transp_ratio'] = df.Deck.map(Deck_transp_ratio)
df['Deck_transp_ratio'] = df['Deck_transp_ratio'].fillna(df['Deck_transp_ratio'].mean())

# Create Deck_side_transp_ratio

In [None]:
# Total passengers by Deck_side
deck_side_total = full_df.groupby('Deck_side').Deck.count()

# Total Transported by Deck_side
deck_side_transported = full_df.groupby('Deck_side').Transported.sum()

# Share of transported passengers for each Deck_side
Deck_side_transp_ratio = (deck_side_transported / deck_side_total).to_dict()

# Map the ratio onto every row; unknown deck sides receive the mean ratio.
# Assigned back explicitly instead of fillna(inplace=True) on a column
# accessor, which is chained assignment and a no-op under Copy-on-Write.
df['Deck_side_transp_ratio'] = df.Deck_side.map(Deck_side_transp_ratio)
df['Deck_side_transp_ratio'] = (
    df['Deck_side_transp_ratio'].fillna(df['Deck_side_transp_ratio'].mean()))

In [None]:
# Identifier and free-text columns are no longer needed once the derived
# features exist
col_drop = ['PassengerId', 'Cabin', 'Name', 'Lastname']
df = df.drop(columns=col_drop)

# Impute and encode categorical features

In [None]:
# Names of the categorical (object / category) columns
categ_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Missing categorical values are replaced by the constant label 'Missing'
cat_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
df_cat = pd.DataFrame(cat_imputer.fit_transform(df[categ_cols]),
                      columns=categ_cols)

# One-hot encode the categoricals
df_cat = pd.get_dummies(df_cat)

# Impute numericals

In [None]:
# Names of the numerical (int64 / float64) columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Encoded categoricals first, numerical columns after them
df = pd.concat([df_cat, df[num_cols]], axis=1)

# Remaining missing values are estimated by IterativeImputer from all columns
it_imp = IterativeImputer()
df = pd.DataFrame(it_imp.fit_transform(df), columns=df.columns)

# Feature engineering

In [None]:
# Labelled training frame used by the plots below
X = df.iloc[:X_max_index]
full_df = pd.concat([X, y], axis=1)

# Total_expenses with zero spenders masked out (NaN), so the density plot
# only covers passengers who actually spent something
full_df['Non_zero_expenses'] = full_df['Total_expenses'].where(
    full_df['Total_expenses'].gt(0))

In [None]:
# Function for kde plotting
def transported_chance_by(feature, xticks=None, xlim=None):
    """Plot the KDE of `feature` for transported and not transported passengers.

    Reads the notebook-level `full_df` frame and splits it on its
    `Transported` column (1 = transported, 0 = not transported).

    Parameters
    ----------
    feature : str
        Name of the `full_df` column to plot.
    xticks : optional
        Passed to `plt.xticks`.
    xlim : optional
        Passed to `plt.xlim`.
    """
    plt.figure(figsize=(10, 5))

    # One density curve per target value, drawn on the same axes
    for flag, label in ((1, 'transported'), (0, 'not_transported')):
        subset = full_df[full_df.Transported == flag]
        subset[feature].plot(kind='kde', label=label)

    plt.xlim(xlim)
    plt.xticks(xticks)
    plt.legend()
    plt.grid()
    plt.xlabel(feature)
    plt.show()

# Create Age_group feature

In [None]:
# Density of Age for transported vs. not transported passengers
transported_chance_by('Age', xticks=np.arange(0, 81, 5), xlim=(0, 80))

Using the points where the two curves intersect, we can separate 3 age groups

In [None]:
# Age_group: three Age ranges (up to 17, 18-43, above 43) labelled 1, 3 and 2
age_bins = [-1, 17, 43, df['Age'].max()]
df['Age_group'] = pd.cut(df['Age'], bins=age_bins, labels=[1, 3, 2]).astype(float)

# Create Total_expenses_group feature

In [None]:
# Density of the non-zero total expenses for transported vs. not transported
transported_chance_by('Non_zero_expenses',
                      xticks=np.arange(0, 9000, 500),
                      xlim=(0, 9000))

Using the points where the two curves intersect, we can separate 2 Total_expenses groups

In [None]:
# Total_expenses_group: the first bin (-1, 1] isolates passengers with no
# expenses (label 3); the others split at 2200 (labels 1 and 2)
expenses_bins = [-1, 1, 2200, df['Total_expenses'].max()]
df['Total_expenses_group'] = pd.cut(df['Total_expenses'],
                                    bins=expenses_bins,
                                    labels=[3, 1, 2]).astype(float)

# Create Cab_group feature

In [None]:
# Density of Cab_num for transported vs. not transported passengers
transported_chance_by('Cab_num', xticks=np.arange(0, 2000, 100), xlim=(0, 2000))

Using the points where the two curves intersect, we can separate 4 cabin groups

In [None]:
# Cab_group: four Cab_num ranges split at 300, 700 and 1170,
# labelled 3, 2, 4 and 1
cab_bins = [-1, 300, 700, 1170, df['Cab_num'].max()]
df['Cab_group'] = pd.cut(df['Cab_num'], bins=cab_bins,
                         labels=[3, 2, 4, 1]).astype(float)

# Create Pass_group_type feature

In [None]:
# Density of Pass_group for transported vs. not transported passengers
transported_chance_by('Pass_group', xticks=np.arange(0, 10000, 500), xlim=(0, 10000))

Using the points where the two curves intersect, we can separate 3 passenger group types

In [None]:
# Pass_group_type: three Pass_group ranges split at 3400 and 7300,
# labelled 2, 3 and 1
pass_group_bins = [-1, 3400, 7300, df['Pass_group'].max()]
df['Pass_group_type'] = pd.cut(df['Pass_group'], bins=pass_group_bins,
                               labels=[2, 3, 1]).astype(float)

# Standardization

In [None]:
# log(1 + x) reduces the right skew of the spending columns
skewed_features = ['RoomService', 'FoodCourt', 'ShoppingMall',
                   'Spa', 'VRDeck', 'Total_expenses']
df[skewed_features] = np.log1p(df[skewed_features])

# Every column is then standardized (fitted on the combined train + test frame)
std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns=df.columns)

In [None]:
# Cab_num and Pass_group were only needed to derive their group features
col_drop = ['Cab_num', 'Pass_group']
df = df.drop(columns=col_drop)

# Split train and test data

In [None]:
# The first X_max_index rows are the training data, the rest is the test set
X = df.iloc[:X_max_index]
test_df = df.iloc[X_max_index:]

# Processed data correlation

In [None]:
# Training features together with the target
full_df = pd.concat([X, y], axis=1)

# Correlation of every column with the target, strongest positive first
correlation = full_df.corr()['Transported'].sort_values(ascending=False)

# Bar chart of the correlations; the first entry is skipped because it is
# the target's correlation with itself
correlation.iloc[1:].plot(kind='bar', figsize=(10, 5), title='Transported dependency')
plt.show()

On the Spaceship Titanic it is better not to be a sleeping passenger from Europa or to have an all-inclusive package with no extra expenses.

# Find best features
The next cell is commented out because it takes about 8 hours to run; the resulting final_features are listed in the cell after it.

In [None]:
# # Define model
# cat_model = CatBoostClassifier(thread_count=-1, verbose=False)

# # Define and fit feature selector
# sfs = SequentialFeatureSelector(cat_model, 
#                                 scoring='accuracy', 
#                                 direction = 'backward')
# sfs.fit(X,y)

# # List of the final features to be used for submission modeling
# final_features = list(sfs.get_feature_names_out())

In [None]:
# Features kept by the sequential feature selector (result of the
# commented-out cell above, copied here to avoid re-running it)
final_features = [
    # one-hot encoded categoricals
    'HomePlanet_Earth', 'HomePlanet_Europa', 'CryoSleep_False',
    'Destination_TRAPPIST-1e',
    'Deck_A', 'Deck_C', 'Deck_G', 'Deck_T',
    # numerical features
    'Age', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck',
    'Total_expenses', 'Group_members',
    # engineered features
    'Deck_transp_ratio', 'Deck_side_transp_ratio',
    'Age_group', 'Pass_group_type', 'Cab_group', 'Total_expenses_group',
]

# CatBoost grid search parameter tuning
The next cell is commented out because it takes about 3 hours to run; the resulting parameters are listed in the cell after it.

In [None]:
# # Define model
# cat_model = CatBoostClassifier()

# # Define parameters' grid
# grid = {'verbose': [False],
#         'thread_count': [-1],
#         'depth': [4, 5, 6, 7, 8],
#         'iterations': [1000, 2000, 3000, 5000, 10000],
#         'learning_rate': [0.001, 0.005, 0.01, 0.02, 0.03]}

# # Define GridSearchCV
# grid_cat = GridSearchCV(estimator=cat_model, param_grid=grid, cv=3, n_jobs=-1)
# grid_cat.fit(X[final_features], y)

# print('Results from Grid Search')
# print('\n Best Score:\n', grid_cat.best_score_)
# print('\n Best parameters:\n', grid_cat.best_params_)

In [None]:
# CatBoost parameters selected by the grid search (result of the
# commented-out cell above, copied here to avoid re-running it)
params = dict(
    depth=5,
    iterations=2000,
    learning_rate=0.01,
    thread_count=-1,
    verbose=False,
)

# Final model

In [None]:
# Build the final model with the tuned parameters and fit it on the training data.
# NOTE(review): the model is trained on every column of X, while final_features
# is never applied although the grid search was run on X[final_features].
# Confirm which feature set is intended.
cat_model = CatBoostClassifier(**params)
cat_model.fit(X, y)

# 3-fold cross-validation with the estimator's default scoring.
# NOTE(review): for a classifier that default is presumably accuracy, so the
# name cat_rmses looks misleading.
cat_rmses = cross_val_score(estimator=cat_model, X=X, y=y, cv=3)

print(pd.Series(cat_rmses).describe())
print('\n', cat_model.get_feature_importance(prettified=True))

# Submission


In [None]:
# Predict the target for the test rows
test_preds = cat_model.predict(test_df)

# Submission frame: the saved PassengerId column next to the boolean prediction
output = pd.DataFrame({
    'PassengerId': test_pass_id,
    'Transported': test_preds.astype(bool),
})

# Written without the index column
output.to_csv('submission.csv', index=False)