In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from copy import copy


In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's **damaged** computer system.

`PassengerId` - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. **People in a group are often family members, but not always**.


`HomePlanet` - The planet the passenger departed from, typically their planet of permanent residence.


`CryoSleep` - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.


`Cabin` - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.


`Destination` - The planet the passenger will be debarking to.


`Age` - The age of the passenger.


`VIP` - Whether the passenger has paid for special VIP service during the voyage.


`RoomService`, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.


`Name` - The first and last names of the passenger.


`Transported` - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

# Loading data + preprocessing

In [2]:
df_orig = pd.read_csv('train.csv')
#df_orig = pd.read_csv('data/train.csv')


def preprocess_df(df_orig, dropna = True):
    df_orig = df_orig.copy()
    df_orig.replace({False: 0, True: 1}, inplace=True)
    
    if dropna:
        print('DROPPING NANs')
        df_orig = df_orig.dropna()
    else:    
        print('FILLING NANs with median')
        df_orig = df_orig.fillna(df_orig.median())
        
    df_orig.head()

    #grouping data and making new columns
    df = df_orig.copy()

    #get deck, number and side
    df[['deck','num', 'side']] = df_orig['Cabin'].str.split('/', expand=True)

    #get passenger group, and put it into integer format
    df['Passenger_group'] = df_orig['PassengerId'].str.split('_', expand=True).loc[:, 0]
    df['Passenger_group'] = df['Passenger_group'].apply(lambda gr_str: int(gr_str))

    #group by passenger group and count number of people in a group
    gr_tmp = df.groupby('Passenger_group')['HomePlanet'].agg(['count'])

    #assign each passenger the number of people in his group
    df['Passenger_group_size'] = df['Passenger_group'].apply(lambda x: gr_tmp.loc[x])   

    #drop stuff
    df.drop(['Cabin'], axis = 1, inplace = True)
    df.drop(['PassengerId'], axis = 1, inplace = True)
    df.drop(['Passenger_group'], axis = 1, inplace = True)
    df.drop(['num'], axis = 1, inplace = True) #I assume the number of the room does not matter. It has ~1700 unique values out of the 6600 entries
    df.drop(['Name'], axis = 1, inplace = True) # and name also!
    df['side']=df['side'].apply(lambda x: int(x=='P')) #side would be 1 if it is P (portside)    
     
    
    df_numeric = pd.get_dummies(df, columns = ['HomePlanet', 'Destination', 'deck'])
        
        
    return df, df_numeric

df, df_numeric = preprocess_df(df_orig)

df_numeric.head(5)

DROPPING NANs


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,side,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,1,0,1,0,0,0,0,0,0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,0,...,0,1,0,0,0,0,0,1,0,0
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,0,1,1,0,0,0,0,0,0,0
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,0,...,0,1,0,0,0,0,0,1,0,0


# Exploratory data analysis

Note that the `CryoSleep` has quite impact on the teleportation rate.

In [3]:
df.groupby(['CryoSleep', 'HomePlanet'])['Transported'].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
CryoSleep,HomePlanet,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,Earth,0.323102,2476
0.0,Europa,0.408851,949
0.0,Mars,0.275618,849
1.0,Earth,0.658716,1090
1.0,Europa,0.98895,724
1.0,Mars,0.909266,518


Print top correlations with `transported`:

In [4]:
corr = df_numeric.corr()
#fig, ax = plt.subplots(figsize=(18, 18))
#sns.heatmap(corr, ax=ax, annot=True)
#plt.show()

corr['Transported'].apply(lambda x: np.abs(x)).sort_values(ascending = False)

Transported                  1.000000
CryoSleep                    0.462803
RoomService                  0.247291
Spa                          0.219854
VRDeck                       0.207950
HomePlanet_Europa            0.182004
HomePlanet_Earth             0.168845
deck_B                       0.146288
Destination_55 Cancri e      0.123783
Destination_TRAPPIST-1e      0.110655
deck_C                       0.109988
side                         0.106186
deck_E                       0.098427
deck_F                       0.094847
Age                          0.082553
Passenger_group_size         0.082088
FoodCourt                    0.055025
VIP                          0.042260
deck_D                       0.039772
deck_G                       0.022711
HomePlanet_Mars              0.012357
ShoppingMall                 0.011602
deck_A                       0.005651
Destination_PSO J318.5-22    0.001281
deck_T                       0.000126
Name: Transported, dtype: float64

In [5]:
#import phik
#fig, ax = plt.subplots(figsize=(18, 18))
#sns.heatmap(df.phik_matrix(),  ax=ax, annot=True)
#plt.show()
#df.phik_matrix()['Transported'].apply(lambda x: np.abs(x)).sort_values(ascending = False)
#from pandas_profiling import ProfileReport
#train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
#profile = ProfileReport(train_df, title="Pandas Profiling Report")
#profile

***

# Data preprocessing for models

In [6]:
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score



In [7]:
df_numeric.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'side', 'Passenger_group_size',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'deck_A', 'deck_B', 'deck_C', 'deck_D',
       'deck_E', 'deck_F', 'deck_G', 'deck_T'],
      dtype='object')

Do NOT scale categorical columns. Scale only numerical:

In [8]:

def scale_data(df_numeric):
    cols_to_scale = ['Age','RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck', 'Passenger_group_size']

    scaler = sklearn.preprocessing.StandardScaler()
    df_numeric_scaled = copy(df_numeric)
    scaler.fit(df_numeric[cols_to_scale])
    df_numeric_scaled[cols_to_scale] = scaler.transform(df_numeric[cols_to_scale])
    return df_numeric_scaled, scaler, cols_to_scale

df_numeric_scaled, scaler, cols_to_scale = scale_data(df_numeric)

In [9]:
# test/train(/cv) split
X = df_numeric_scaled.loc[:, df_numeric_scaled.columns != 'Transported']
y = df_numeric_scaled['Transported']


X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y, test_size = 0.3, random_state = 42)
print(X_train.shape, y_train.shape,X_test.shape,y_test.shape)

(4624, 24) (4624,) (1982, 24) (1982,)


# Model 1: k-nearest neighbors classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier
param_grid = {'n_neighbors': np.arange(1,50,2)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv = 3, verbose = 1)
knn_cv.fit(X_train,y_train)

print('best par knn:', knn_cv.best_params_)
print('best score knn:', knn_cv.best_score_)

y_pred_knn = knn_cv.predict(X_test)
score_knn = knn_cv.score(X_test,y_test)
print(f'knn  score: {score_knn}')

Fitting 3 folds for each of 25 candidates, totalling 75 fits
best par knn: {'n_neighbors': 21}
best score knn: 0.7720583346168834
knn  score: 0.781029263370333


# Model 2: logistic regression

In [11]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(penalty = 'l2',  max_iter = 500)
logreg_cv = GridSearchCV(logreg, {'C': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}, cv = 5, verbose = 1)
logreg_cv.fit(X_train, y_train)

print('best par logreg:', logreg_cv.best_params_)
print('best score logreg:', logreg_cv.best_score_)

score_logreg = logreg_cv.score(X_test,y_test)
print(f"logreg score: {score_logreg}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
best par logreg: {'C': 1.0}
best score logreg: 0.7908737568737567
logreg score: 0.8012108980827447


# Model 3: Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = [250,  500] # number of trees in the random forest
max_features = ['auto'] # number of features in consideration at every split
max_depth = [5, 10, 20] # maximum number of levels allowed in each decision tree
min_samples_split = [5] # minimum sample number to split a node
min_samples_leaf = [3, 6] # minimum sample number that can be stored in a leaf node
bootstrap = [True] # method used to sample data points

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier(random_state=42)

rf_cv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=120, cv=3, verbose=1, random_state=35, n_jobs=-1)

rf_cv.fit(X_train, y_train)


print('best par rf:', rf_cv.best_params_)
print('best score rf:', rf_cv.best_score_)

score_rf = rf_cv.score(X_test,y_test)
print(f"rf score: {score_rf}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits




best par rf: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}
best score rf: 0.7917398570784492
rf score: 0.8193743693239153


# Model 4: SVM

In [15]:
from sklearn.svm import SVC
param_grid = {'C': [1e0,1e1,1e2, 1e3],
              #'gamma': [0.1, 0.01, 0.001, 0.0001]}
              'gamma': ['auto', 'scale']}
 
svm_cv = GridSearchCV(SVC(), param_grid, refit=True, verbose=1)
 
svm_cv.fit(X_train, y_train)



print('best par svm:', svm_cv.best_params_)
print('best score svm:', svm_cv.best_score_)


score_svm = svm_cv.score(X_test,y_test)
print(f"svm score: {score_svm}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
best par svm: {'C': 10.0, 'gamma': 'auto'}
best score svm: 0.7986601146601146
svm score: 0.8188698284561049


# Model 5: Multi-layer perceptrons (artificial neural network)

In [17]:
from sklearn.neural_network import MLPClassifier

param_grid = {'alpha': [1e-2, 1e-1, 1e0,1e1,1e2,],
             'hidden_layer_sizes': [(10,3), (20,3), (10,5), (20,5)],
             'learning_rate': ['adaptive', 'constant']}
 

nn = MLPClassifier(max_iter = 500)

nn_cv = RandomizedSearchCV(estimator=nn, param_distributions=param_grid, cv=3, verbose=1, random_state=35, n_jobs=-1)

nn_cv.fit(X_train, y_train)



print('best par nn:', nn_cv.best_params_)
print('best score nn:', nn_cv.best_score_)

score_nn = nn_cv.score(X_test,y_test)
print(f"nn score: {score_nn}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits




best par nn: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (20, 3), 'alpha': 0.1}
best score nn: 0.7964975494713878
nn score: 0.8118062563067608


# Compile results

In [18]:
print(f"score_knn={score_knn:.3f}")
print(f"score_logreg={score_logreg:.3f}")
print(f"score_rf={score_rf:.3f}")
print(f"score_svm={score_svm:.3f}")
print(f"score_nn={score_nn:.3f}")

score_knn=0.781
score_logreg=0.801
score_rf=0.819
score_svm=0.819
score_nn=0.812


# Submit the best model

I opt here the Random Forest classifier we trained above.

In [19]:
!head sample_submission.csv

PassengerId,Transported
0013_01,False
0018_01,False
0019_01,False
0021_01,False
0023_01,False
0027_01,False
0029_01,False
0032_01,False
0032_02,False


In [20]:
df_test_orig = pd.read_csv('test.csv')

df_test,df_test_numeric = preprocess_df(df_test_orig, dropna = False)

df_test_numeric_scaled = copy(df_test_numeric)
df_test_numeric_scaled[cols_to_scale] = scaler.transform(df_test_numeric_scaled[cols_to_scale])
df_test_numeric_scaled.head()


test_prediction = rf_cv.predict(df_test_numeric_scaled)
df_test_numeric_scaled['Transported'] = test_prediction
my_submission = pd.DataFrame({'PassengerId': df_test_orig['PassengerId'], 'Transported': test_prediction.astype(bool)})
my_submission.to_csv('submission.csv', index=False)

FILLING NANs with median


  df_orig = df_orig.fillna(df_orig.median())
