## Packages

In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer, StandardScaler

## Visualization

This section aims to provide insight into the data and the problem we are dealing with.

In [None]:
# Read the competition's training split into a DataFrame
train_csv_path = "/kaggle/input/spaceship-titanic/train.csv"
spaceship_df = pd.read_csv(train_csv_path)
# Show column dtypes and non-null counts to see how the data is organized
spaceship_df.info()

The columns have mixed types. We will check general information about the data and count the missing values. Initially, we will simply drop the rows that contain missing values.

In [None]:
# Summary statistics of the training frame (pandas reports the numeric columns by default)
spaceship_df.describe()

In [None]:
# Correlation between numerical columns.
# The frame also holds text columns, and pandas >= 2.0 raises if they are passed
# to corr(), so the numeric and boolean columns are selected explicitly
# (this form also works on older pandas versions).
spaceship_df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Peek at the first five rows of the raw training frame
spaceship_df.head(5)

In [None]:
# Keep only the rows without any missing value, then report how many rows that discards
non_nan_spaceship_df = spaceship_df.dropna(axis=0, how="any")
n_rows_dropped = len(spaceship_df) - len(non_nan_spaceship_df)
print(n_rows_dropped)

Although the number of missing values is small, they are spread across many observations, so dropping them removes a lot of data we could train on. We will make some improvements later.

In [None]:
# Percentage of rows lost by dropping incomplete observations
total_rows = len(spaceship_df)
removed_rows = total_rows - len(non_nan_spaceship_df)
print(100*removed_rows/total_rows)

Some numerical columns turned out to be correlated with each other; the categorical columns deserve a closer look.

Home Planet

In [None]:
# Passenger counts per HomePlanet, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="HomePlanet", col="Transported", kind="count", data=non_nan_spaceship_df);

CryoSleep

In [None]:
# Passenger counts per CryoSleep value, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="CryoSleep", col="Transported", kind="count", data=non_nan_spaceship_df);

Destination

In [None]:
# Passenger counts per Destination, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="Destination", col="Transported", kind="count", data=non_nan_spaceship_df);

VIP

In [None]:
# Passenger counts per VIP value, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="VIP", col="Transported", kind="count", data=non_nan_spaceship_df);

Relationship with the Home Planet

In [None]:
# Passenger counts per VIP value, one panel per HomePlanet.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="VIP", col="HomePlanet", kind="count", data=non_nan_spaceship_df);

The Cabin column is composed of deck, num and side. We will explore these attributes individually.

In [None]:
def get_cabin_deck(cabin):
    """Return the text before the first '/' of a cabin string.

    Any value whose type is not exactly ``str`` (e.g. a missing value)
    yields ``np.nan``.
    """
    if type(cabin) is not str:
        return np.nan
    deck, _, _ = cabin.partition('/')
    return deck
    
def get_cabin_num(cabin):
    """Return the second '/'-separated field of a cabin string (still a string).

    Any value whose type is not exactly ``str`` yields ``np.nan``. A string
    with fewer than two fields raises ``IndexError``.
    """
    if type(cabin) is not str:
        return np.nan
    fields = cabin.split('/')
    return fields[1]
    
def get_cabin_side(cabin):
    """Return the third '/'-separated field of a cabin string.

    Any value whose type is not exactly ``str`` yields ``np.nan``. A string
    with fewer than three fields raises ``IndexError``.
    """
    if type(cabin) is not str:
        return np.nan
    fields = cabin.split('/')
    return fields[2]

# Replace Cabin with its three components (CabinDeck, CabinNum, CabinSide).
# The guard makes the cell safe to re-run: once Cabin has been removed, running
# it again used to fail on the `del` and on inserting duplicate column names.
if 'Cabin' in non_nan_spaceship_df.columns:
    cabin_deck = non_nan_spaceship_df.Cabin.map(get_cabin_deck)
    cabin_num = non_nan_spaceship_df.Cabin.map(get_cabin_num)
    cabin_side = non_nan_spaceship_df.Cabin.map(get_cabin_side)
    del non_nan_spaceship_df['Cabin']
    # Index len(columns) - 1 is the slot of the current last column, so every
    # insert lands immediately before it (presumably the Transported target).
    non_nan_spaceship_df.insert(len(non_nan_spaceship_df.columns) - 1, 'CabinDeck', cabin_deck)
    non_nan_spaceship_df.insert(len(non_nan_spaceship_df.columns) - 1, 'CabinNum', cabin_num)
    non_nan_spaceship_df.insert(len(non_nan_spaceship_df.columns) - 1, 'CabinSide', cabin_side)
non_nan_spaceship_df.head()

There are too many distinct Cabin numbers to plot them.

In [None]:
# Distinct cabin numbers with their frequencies, plus how many distinct values exist
cabin_num_column = non_nan_spaceship_df['CabinNum'].values
unique_cabin_num = np.unique(cabin_num_column, return_counts=True)
n_unique_cabin_num = len(unique_cabin_num[0])
print(unique_cabin_num, n_unique_cabin_num)

In [None]:
# Passenger counts per CabinDeck, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="CabinDeck", col="Transported", kind="count", data=non_nan_spaceship_df);

In [None]:
# Passenger counts per CabinSide, one panel per Transported value.
# `ci` is omitted: count plots draw no error bars and seaborn >= 0.12 deprecates the argument.
sns.catplot(x="CabinSide", col="Transported", kind="count", data=non_nan_spaceship_df);

Another candidate feature is a person's last name, but the number of unique values is too large to preprocess for the machine learning algorithms. We will ignore this attribute for now.

In [None]:
def get_last_name(name):
    """Return the last whitespace-separated token of a full name.

    Returns ``np.nan`` when ``name`` is not exactly a ``str`` or when it has
    fewer than two tokens (no surname to extract). Previously a one-word name
    raised ``IndexError`` and a name with three or more words returned its
    second word instead of its last; two-word names behave as before.
    """
    if type(name) is not str:
        return np.nan
    tokens = name.split()
    if len(tokens) < 2:
        return np.nan
    return tokens[-1]
    
def get_first_name(name):
    """Return the first whitespace-separated token of a full name.

    Any value whose type is not exactly ``str`` yields ``np.nan``. An empty or
    whitespace-only string raises ``IndexError``.
    """
    if type(name) is not str:
        return np.nan
    tokens = name.split(None, 1)
    return tokens[0]

# Replace Name with FirstName / LastName columns.
# The guard makes the cell safe to re-run: once Name has been removed, running
# it again used to fail on the `del` and on inserting duplicate column names.
if 'Name' in non_nan_spaceship_df.columns:
    first_name = non_nan_spaceship_df.Name.map(get_first_name)
    last_name = non_nan_spaceship_df.Name.map(get_last_name)
    del non_nan_spaceship_df['Name']
    # Index len(columns) - 1 is the slot of the current last column, so both
    # inserts land immediately before it.
    non_nan_spaceship_df.insert(len(non_nan_spaceship_df.columns) - 1, 'FirstName', first_name)
    non_nan_spaceship_df.insert(len(non_nan_spaceship_df.columns) - 1, 'LastName', last_name)
non_nan_spaceship_df.head()

In [None]:
# sns.catplot(x="LastName", col="Transported", kind="count", ci=None, data=non_nan_spaceship_df);

In [None]:
# Distinct last names with their frequencies, plus how many distinct values exist
last_name_column = non_nan_spaceship_df['LastName'].values
unique_last_name = np.unique(last_name_column, return_counts=True)
n_unique_last_name = len(unique_last_name[0])
print(unique_last_name, n_unique_last_name)

The PassengerId contains the group number of each person. It may be a significant attribute too.

In [None]:
def get_group(id):
    """Return the part of a passenger id that precedes the first '_'.

    Any value whose type is not exactly ``str`` yields ``np.nan``. The result
    is returned as a string, not converted to a number.
    """
    if type(id) is not str:
        return np.nan
    group_part, _, _ = id.partition('_')
    return group_part

# Replace PassengerId with IdGroup (the id's group prefix) as the first column.
# The guard makes the cell safe to re-run: once PassengerId has been removed,
# running it again used to fail on the `del` and on the duplicate insert.
if 'PassengerId' in non_nan_spaceship_df.columns:
    group = non_nan_spaceship_df.PassengerId.map(get_group)
    del non_nan_spaceship_df['PassengerId']
    non_nan_spaceship_df.insert(0, 'IdGroup', group)
non_nan_spaceship_df.head()

Same situation as LastName

In [None]:
# Distinct group ids with their frequencies, plus how many distinct values exist
group_column = non_nan_spaceship_df['IdGroup'].values
unique_group = np.unique(group_column, return_counts=True)
n_unique_group = len(unique_group[0])
print(unique_group, n_unique_group)

## Preprocessing

This section is divided according to the ideas used for each model: Random Forest (RF) and Multi-Layer Perceptron (MLP).

### RF

3 largest correlations with Transported

In [None]:
# Feature matrix made of three spending columns, in the order Spa, VRDeck, RoomService
rf_top3_columns = ["Spa", "VRDeck", "RoomService"]
y = non_nan_spaceship_df["Transported"].values
x = np.column_stack([non_nan_spaceship_df[column].values for column in rf_top3_columns])

# 80/20 split with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)

All numeric columns

In [None]:
# Features stay a DataFrame here (all six numeric columns); the target is a Series
numeric_feature_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x = non_nan_spaceship_df.loc[:, numeric_feature_columns]
y = non_nan_spaceship_df.loc[:, "Transported"]

# 80/20 split with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)

All numeric columns plus the categorical columns CryoSleep and HomePlanet

In [None]:
# get numeric columns (order: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)
numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_numeric = non_nan_spaceship_df[numeric_columns].to_numpy(dtype=np.float32)

y = non_nan_spaceship_df["Transported"].values

# Cryosleep and home planet, integer-coded:
#   CryoSleep : True -> 2, anything else -> 0
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, anything else -> 1
cryo_sleep = non_nan_spaceship_df["CryoSleep"].values
home_planet = non_nan_spaceship_df["HomePlanet"].values
x_categorical = np.column_stack((
    np.where(cryo_sleep == True, 2, 0),
    np.where(home_planet == 'Earth', 3, np.where(home_planet == 'Europa', 2, 1)),
))

# float32 instead of float16: half precision has an 11-bit significand, so any
# value above 2048 would be rounded before the model ever sees it.
x = np.column_stack((x_numeric, x_categorical)).astype(np.float32)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)
print(x)

Including IdGroup

In [None]:
# get numeric columns
# (order: IdGroup, Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)
# IdGroup is stored as strings, so it is converted to numbers explicitly
# (assumes every group id is numeric text; the conversion raises otherwise).
group_id = non_nan_spaceship_df["IdGroup"].astype(np.float64).to_numpy()
numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_numeric = non_nan_spaceship_df[numeric_columns].to_numpy(dtype=np.float64)

y = non_nan_spaceship_df["Transported"].values

# Cryosleep and home planet, integer-coded:
#   CryoSleep : True -> 2, anything else -> 0
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, anything else -> 1
cryo_sleep = non_nan_spaceship_df["CryoSleep"].values
home_planet = non_nan_spaceship_df["HomePlanet"].values
x_categorical = np.column_stack((
    np.where(cryo_sleep == True, 2, 0),
    np.where(home_planet == 'Earth', 3, np.where(home_planet == 'Europa', 2, 1)),
))

# float32 instead of float16: half precision cannot represent every integer
# above 2048, so distinct group ids would collapse onto the same value.
x = np.column_stack((group_id, x_numeric, x_categorical)).astype(np.float32)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)
print(x)

In [None]:
# Sanity check: shapes of the current train/test split (whichever preprocessing cell ran last)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

### MLP

3 largest correlations with Transported

In [None]:
# Feature order: RoomService, Spa, VRDeck
x_raw = non_nan_spaceship_df[["RoomService", "Spa", "VRDeck"]].to_numpy(dtype=np.float64)

y = non_nan_spaceship_df["Transported"].values

# One scaler per column, kept under the names the submission cells use.
# StandardScaler rescales a *column* to zero mean / unit variance. The previous
# Normalizer rescales each *row* to unit norm, which on a single-column slice
# turns every non-zero value into +/-1 and discards the magnitude.
room_normalizer = StandardScaler().fit(x_raw[:, [0]])
spa_normalizer = StandardScaler().fit(x_raw[:, [1]])
vr_normalizer = StandardScaler().fit(x_raw[:, [2]])

x = np.column_stack((
    room_normalizer.transform(x_raw[:, [0]]),
    spa_normalizer.transform(x_raw[:, [1]]),
    vr_normalizer.transform(x_raw[:, [2]]),
))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)

All numeric columns

In [None]:
# Feature order: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_raw = non_nan_spaceship_df[numeric_columns].to_numpy(dtype=np.float64)

y = non_nan_spaceship_df["Transported"].values

# One scaler per column, kept under the names the submission cells use.
# StandardScaler rescales a *column* to zero mean / unit variance. The previous
# Normalizer rescales each *row* to unit norm, which on a single-column slice
# turns every non-zero value into +/-1 and discards the magnitude.
age_normalizer = StandardScaler().fit(x_raw[:, [0]])
room_normalizer = StandardScaler().fit(x_raw[:, [1]])
food_normalizer = StandardScaler().fit(x_raw[:, [2]])
shopping_normalizer = StandardScaler().fit(x_raw[:, [3]])
spa_normalizer = StandardScaler().fit(x_raw[:, [4]])
vr_normalizer = StandardScaler().fit(x_raw[:, [5]])

x = np.column_stack((
    age_normalizer.transform(x_raw[:, [0]]),
    room_normalizer.transform(x_raw[:, [1]]),
    food_normalizer.transform(x_raw[:, [2]]),
    shopping_normalizer.transform(x_raw[:, [3]]),
    spa_normalizer.transform(x_raw[:, [4]]),
    vr_normalizer.transform(x_raw[:, [5]]),
))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)

Most significant features and categorical columns

In [None]:
# get numeric columns (order: RoomService, Spa, VRDeck)
x_raw = non_nan_spaceship_df[["RoomService", "Spa", "VRDeck"]].to_numpy(dtype=np.float64)

y = non_nan_spaceship_df["Transported"].values

# One scaler per column, kept under the names the submission cells use.
# StandardScaler rescales a *column* to zero mean / unit variance. The previous
# Normalizer rescales each *row* to unit norm, which on a single-column slice
# turns every non-zero value into +/-1 and discards the magnitude.
room_normalizer = StandardScaler().fit(x_raw[:, [0]])
spa_normalizer = StandardScaler().fit(x_raw[:, [1]])
vr_normalizer = StandardScaler().fit(x_raw[:, [2]])

x_scaled = np.column_stack((
    room_normalizer.transform(x_raw[:, [0]]),
    spa_normalizer.transform(x_raw[:, [1]]),
    vr_normalizer.transform(x_raw[:, [2]]),
))

# Cryosleep and home planet, integer-coded:
#   CryoSleep : True -> 2, anything else -> 0
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, anything else -> 1
cryo_sleep = non_nan_spaceship_df["CryoSleep"].values
home_planet = non_nan_spaceship_df["HomePlanet"].values
x_categorical = np.column_stack((
    np.where(cryo_sleep == True, 2, 0),
    np.where(home_planet == 'Earth', 3, np.where(home_planet == 'Europa', 2, 1)),
))

# float32 instead of float16 to avoid needless rounding of the scaled features
x = np.column_stack((x_scaled, x_categorical)).astype(np.float32)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)
print(x)

Inclusion of IdGroup

In [None]:
# get numeric columns (order: IdGroup, RoomService, Spa, VRDeck)
# IdGroup is stored as strings, so it is converted to numbers explicitly
# (assumes every group id is numeric text; the conversion raises otherwise).
x_raw = np.column_stack((
    non_nan_spaceship_df["IdGroup"].astype(np.float64).to_numpy(),
    non_nan_spaceship_df[["RoomService", "Spa", "VRDeck"]].to_numpy(dtype=np.float64),
))

y = non_nan_spaceship_df["Transported"].values

# One scaler per column, kept under the names the submission cells use.
# StandardScaler rescales a *column* to zero mean / unit variance. The previous
# Normalizer rescales each *row* to unit norm, which on a single-column slice
# turns every non-zero value into +/-1 and discards the magnitude.
group_normalizer = StandardScaler().fit(x_raw[:, [0]])
room_normalizer = StandardScaler().fit(x_raw[:, [1]])
spa_normalizer = StandardScaler().fit(x_raw[:, [2]])
vr_normalizer = StandardScaler().fit(x_raw[:, [3]])

x_scaled = np.column_stack((
    group_normalizer.transform(x_raw[:, [0]]),
    room_normalizer.transform(x_raw[:, [1]]),
    spa_normalizer.transform(x_raw[:, [2]]),
    vr_normalizer.transform(x_raw[:, [3]]),
))

# Cryosleep and home planet, integer-coded:
#   CryoSleep : True -> 2, anything else -> 0
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, anything else -> 1
cryo_sleep = non_nan_spaceship_df["CryoSleep"].values
home_planet = non_nan_spaceship_df["HomePlanet"].values
x_categorical = np.column_stack((
    np.where(cryo_sleep == True, 2, 0),
    np.where(home_planet == 'Earth', 3, np.where(home_planet == 'Europa', 2, 1)),
))

# float32 instead of float16 to avoid needless rounding of the scaled features
x = np.column_stack((x_scaled, x_categorical)).astype(np.float32)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32764)
print(x)

In [None]:
# Sanity check: shapes of the current train/test split (whichever preprocessing cell ran last)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

## Model

This section trains the models and reports the performance of each chosen algorithm on the test set.

RF

In [None]:
# Fixed seed so the forest is identical from run to run
rf = RandomForestClassifier(random_state=32764)

# Trains on whichever x_train / y_train the last-run preprocessing cell produced
rf.fit(X=x_train, y=y_train)

In [None]:
# Per-class precision / recall / F1 of the forest on the held-out split
rf_test_predictions = rf.predict(x_test)
print(classification_report(y_test, rf_test_predictions))

MLP

In [None]:
# Fixed seed so the network's initial weights are identical from run to run
mlp = MLPClassifier(random_state=32764)

# Trains on whichever x_train / y_train the last-run preprocessing cell produced
mlp.fit(X=x_train, y=y_train)

In [None]:
# Per-class precision / recall / F1 of the MLP on the held-out split
mlp_test_predictions = mlp.predict(x_test)
print(classification_report(y_test, mlp_test_predictions))

## Submission

This section demonstrates how the submission data was preprocessed for each proposal.

### RF

3 largest correlations

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")
# Same column order as the matching RF training cell: Spa, VRDeck, RoomService
feature_columns = ["Spa", "VRDeck", "RoomService"]
x = test_df[feature_columns]

# Fill gaps with each column's mean (mean() skips NaN by default). The result is
# re-assigned because `x[col].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write, leaving the NaNs in place.
x = x.fillna(x.mean())
x = x.to_numpy()

y = rf.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

All numeric columns, replacing missing values for the mean

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")
# Kept as a DataFrame, like the features of the matching RF training cell
feature_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x = test_df[feature_columns]

# Fill gaps with each column's mean (mean() skips NaN by default). The result is
# re-assigned because `x[col].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write, leaving the NaNs in place.
x = x.fillna(x.mean())

y = rf.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

All numeric columns and categorical columns

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")

# Numeric features (order: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck).
# Gaps are filled with each column's mean; the result is re-assigned because
# `x[col].fillna(..., inplace=True)` acts on a temporary under pandas
# Copy-on-Write, leaving the NaNs in place.
numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_numeric = test_df[numeric_columns]
x_numeric = x_numeric.fillna(x_numeric.mean()).to_numpy(dtype=np.float64)

# Categorical codes:
#   CryoSleep : True -> 2, False -> 0, anything else (missing) -> 1
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, 'Mars' -> 1, anything else -> 0
cryo_sleep = test_df["CryoSleep"].values
home_planet = test_df["HomePlanet"].values
cryo_code = np.where(cryo_sleep == True, 2, np.where(cryo_sleep == False, 0, 1))
planet_code = np.where(home_planet == 'Earth', 3,
                       np.where(home_planet == 'Europa', 2,
                                np.where(home_planet == 'Mars', 1, 0)))

# float32 instead of float16: half precision would round every value above 2048
x = np.column_stack((x_numeric, cryo_code, planet_code)).astype(np.float32)

y = rf.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

Inclusion of IdGroup

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")

# Group prefix of PassengerId as a number
# (assumes the prefix is numeric text; the conversion raises otherwise)
group = test_df["PassengerId"].map(get_group).astype(np.float64).to_numpy()

# Numeric features (order: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck).
# Gaps are filled with each column's mean; the result is re-assigned because
# `x[col].fillna(..., inplace=True)` acts on a temporary under pandas
# Copy-on-Write, leaving the NaNs in place.
numeric_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_numeric = test_df[numeric_columns]
x_numeric = x_numeric.fillna(x_numeric.mean()).to_numpy(dtype=np.float64)

# Categorical codes:
#   CryoSleep : True -> 2, False -> 0, anything else (missing) -> 1
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, 'Mars' -> 1, anything else -> 0
cryo_sleep = test_df["CryoSleep"].values
home_planet = test_df["HomePlanet"].values
cryo_code = np.where(cryo_sleep == True, 2, np.where(cryo_sleep == False, 0, 1))
planet_code = np.where(home_planet == 'Earth', 3,
                       np.where(home_planet == 'Europa', 2,
                                np.where(home_planet == 'Mars', 1, 0)))

# Column order: IdGroup, the six numeric features, CryoSleep, HomePlanet.
# float32 instead of float16: half precision cannot represent every integer
# above 2048, so distinct group ids would collapse onto the same value.
x = np.column_stack((group, x_numeric, cryo_code, planet_code)).astype(np.float32)

y = rf.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

### MLP

3 largest correlations

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")
# Feature order: RoomService, Spa, VRDeck
feature_columns = ["RoomService", "Spa", "VRDeck"]
x_filled = test_df[feature_columns]

# Fill gaps with each column's mean (mean() skips NaN by default). The result is
# re-assigned because `x[col].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write, leaving the NaNs in place.
x_filled = x_filled.fillna(x_filled.mean()).to_numpy(dtype=np.float64)

# Normalize the features with the transformers fitted in the matching training
# cell; each one receives its column as an (n, 1) slice.
x = np.column_stack((
    room_normalizer.transform(x_filled[:, [0]]),
    spa_normalizer.transform(x_filled[:, [1]]),
    vr_normalizer.transform(x_filled[:, [2]]),
))

y = mlp.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

All numeric columns

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")
# Feature order: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
feature_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
x_filled = test_df[feature_columns]

# Fill gaps with each column's mean (mean() skips NaN by default). The result is
# re-assigned because `x[col].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write, leaving the NaNs in place.
x_filled = x_filled.fillna(x_filled.mean()).to_numpy(dtype=np.float64)

# Normalize the features with the transformers fitted in the matching training
# cell; each one receives its column as an (n, 1) slice.
x = np.column_stack((
    age_normalizer.transform(x_filled[:, [0]]),
    room_normalizer.transform(x_filled[:, [1]]),
    food_normalizer.transform(x_filled[:, [2]]),
    shopping_normalizer.transform(x_filled[:, [3]]),
    spa_normalizer.transform(x_filled[:, [4]]),
    vr_normalizer.transform(x_filled[:, [5]]),
))

y = mlp.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

3 largest correlations and categorical columns

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")

# Numeric features (order: RoomService, Spa, VRDeck). Gaps are filled with each
# column's mean; the result is re-assigned because
# `x[col].fillna(..., inplace=True)` acts on a temporary under pandas
# Copy-on-Write, leaving the NaNs in place.
numeric_columns = ["RoomService", "Spa", "VRDeck"]
x_numeric = test_df[numeric_columns]
x_numeric = x_numeric.fillna(x_numeric.mean()).to_numpy(dtype=np.float64)

# Categorical codes:
#   CryoSleep : True -> 2, False -> 0, anything else (missing) -> 1
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, 'Mars' -> 1, anything else -> 0
cryo_sleep = test_df["CryoSleep"].values
home_planet = test_df["HomePlanet"].values
cryo_code = np.where(cryo_sleep == True, 2, np.where(cryo_sleep == False, 0, 1))
planet_code = np.where(home_planet == 'Earth', 3,
                       np.where(home_planet == 'Europa', 2,
                                np.where(home_planet == 'Mars', 1, 0)))

# Normalize the numeric features with the transformers fitted in the matching
# training cell, then append the categorical codes.
# float32 instead of float16 to avoid needless rounding of the inputs.
x = np.column_stack((
    room_normalizer.transform(x_numeric[:, [0]]),
    spa_normalizer.transform(x_numeric[:, [1]]),
    vr_normalizer.transform(x_numeric[:, [2]]),
    cryo_code,
    planet_code,
)).astype(np.float32)

y = mlp.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)

Including the IdGroup

In [None]:
gc.collect()

test_df = pd.read_csv("../input/spaceship-titanic/test.csv")

# Group prefix of PassengerId as an (n, 1) numeric column
# (assumes the prefix is numeric text; the conversion raises otherwise)
group = test_df["PassengerId"].map(get_group).astype(np.float64).to_numpy().reshape(-1, 1)

# Numeric features (order: RoomService, Spa, VRDeck). Gaps are filled with each
# column's mean; the result is re-assigned because
# `x[col].fillna(..., inplace=True)` acts on a temporary under pandas
# Copy-on-Write, leaving the NaNs in place.
numeric_columns = ["RoomService", "Spa", "VRDeck"]
x_numeric = test_df[numeric_columns]
x_numeric = x_numeric.fillna(x_numeric.mean()).to_numpy(dtype=np.float64)

# Categorical codes:
#   CryoSleep : True -> 2, False -> 0, anything else (missing) -> 1
#   HomePlanet: 'Earth' -> 3, 'Europa' -> 2, 'Mars' -> 1, anything else -> 0
cryo_sleep = test_df["CryoSleep"].values
home_planet = test_df["HomePlanet"].values
cryo_code = np.where(cryo_sleep == True, 2, np.where(cryo_sleep == False, 0, 1))
planet_code = np.where(home_planet == 'Earth', 3,
                       np.where(home_planet == 'Europa', 2,
                                np.where(home_planet == 'Mars', 1, 0)))

# Normalize the numeric features with the transformers fitted in the matching
# training cell, then append the categorical codes.
# float32 instead of float16 to avoid needless rounding of the inputs.
x = np.column_stack((
    group_normalizer.transform(group),
    room_normalizer.transform(x_numeric[:, [0]]),
    spa_normalizer.transform(x_numeric[:, [1]]),
    vr_normalizer.transform(x_numeric[:, [2]]),
    cryo_code,
    planet_code,
)).astype(np.float32)

y = mlp.predict(x)
submission_df = pd.DataFrame({"PassengerId": test_df.PassengerId, "Transported": y})
submission_df.to_csv('submission.csv', index=False)