In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score
warnings.filterwarnings('ignore')
%matplotlib inline

Get a high-level overview of the data we have here...

In [2]:
df = pd.read_csv("../../Datasets/spaceship-titanic/train.csv")

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


As we can see from the tables above, some of the features contain several types of values (e.g the feature "Cabin" contains three different values and doesn't follow atomic structure)...Let's do some feature engineering first

In [3]:
# Parse the "PassengerId" column from the dataset
df["PassengerGroup"] = df["PassengerId"].str.split("_").str[0]
df["PassengerNumber"] = df["PassengerId"].str.split("_").str[1]
df = df.drop("PassengerId", axis=1)

# Parse the "Cabin" feature that contains several values
df['CabinDeck'] = df['Cabin'].str.split("/").str[0]
df["CabinNum"] = df['Cabin'].str.split("/").str[1]
df['CabinSide'] = df['Cabin'].str.split("/").str[2]
df = df.drop("Cabin", axis=1)

# Get passenger's last name
df['LastName'] = df['Name'].str.split(" ").str[1]

# Drop the "Name" column as it is not needed anymore
df = df.drop(["Name"], axis=1)

print(df.head())

print(len(df['LastName'].unique()))

df.isnull().sum()

  HomePlanet CryoSleep  Destination  ...  CabinNum CabinSide     LastName
0     Europa     False  TRAPPIST-1e  ...         0         P    Ofracculy
1      Earth     False  TRAPPIST-1e  ...         0         S        Vines
2     Europa     False  TRAPPIST-1e  ...         0         S       Susent
3     Europa     False  TRAPPIST-1e  ...         0         S       Susent
4      Earth     False  TRAPPIST-1e  ...         1         S  Santantines

[5 rows x 17 columns]
2218


HomePlanet         201
CryoSleep          217
Destination        182
Age                179
VIP                203
RoomService        181
FoodCourt          183
ShoppingMall       208
Spa                183
VRDeck             188
Transported          0
PassengerGroup       0
PassengerNumber      0
CabinDeck          199
CabinNum           199
CabinSide          199
LastName           200
dtype: int64

Let's check the missing values first. As we can see, almost each column in the data set contains missing values. The following step will be training a new model which will impute missing values

In [4]:
# Get all the missing values
df.isnull().sum()

HomePlanet         201
CryoSleep          217
Destination        182
Age                179
VIP                203
RoomService        181
FoodCourt          183
ShoppingMall       208
Spa                183
VRDeck             188
Transported          0
PassengerGroup       0
PassengerNumber      0
CabinDeck          199
CabinNum           199
CabinSide          199
LastName           200
dtype: int64

In [5]:
# Impute the missing values using the SimpleImputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")

# Get both categorical and numerical columns
categorical_columns = [col for col in df.columns if df[col].dtype == "object"]
numerical_columns = [col for col in df.columns if df[col].dtype == "int64" or df[col].dtype == "float64"]

print("All features", df.columns)
print("Categorical features", categorical_columns)
print("Numerical features", numerical_columns)

df[categorical_columns] = imputer.fit_transform(df[categorical_columns])

All features Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'PassengerGroup', 'PassengerNumber', 'CabinDeck', 'CabinNum',
       'CabinSide', 'LastName'],
      dtype='object')
Categorical features ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGroup', 'PassengerNumber', 'CabinDeck', 'CabinNum', 'CabinSide', 'LastName']
Numerical features ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [6]:
# Encode the categorical features

ordinal_columns = ['CabinDeck']
non_ordinal_columns = [col for col in categorical_columns if col not in ordinal_columns]

le = LabelEncoder()
ohe = OneHotEncoder()

print(ordinal_columns)
print(non_ordinal_columns)

# Before we start encoding the features, we need to convert the NaN values to string
for col in categorical_columns:

    # Encode the features
    if col in ordinal_columns:
        df[col] = le.fit_transform(df[col].astype(str))
    else:
        # Encode the non-ordinal features using the CountEncoder
        count_map = df[col].value_counts().to_dict()
        df[col+"_count"] = df[col].map(count_map)
        
        # Drop the original column
        df = df.drop(col, axis=1)


print(df.head())
print(df.isnull().sum())

['CabinDeck']
['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGroup', 'PassengerNumber', 'CabinNum', 'CabinSide', 'LastName']
    Age  RoomService  ...  CabinSide_count  LastName_count
0  39.0          0.0  ...             4206               1
1  24.0        109.0  ...             4487               4
2  58.0         43.0  ...             4487               6
3  33.0          0.0  ...             4487               6
4  16.0        303.0  ...             4487               6

[5 rows x 17 columns]
Age                      179
RoomService              181
FoodCourt                183
ShoppingMall             208
Spa                      183
VRDeck                   188
Transported                0
CabinDeck                  0
HomePlanet_count           0
CryoSleep_count            0
Destination_count          0
VIP_count                  0
PassengerGroup_count       0
PassengerNumber_count      0
CabinNum_count             0
CabinSide_count            0
LastName_count       

In [7]:
# Build a pipline that we will use to impute the missing values
param_grid = {'imputer__knn_imputer__n_neighbors': [3, 5, 7, 9, 11]}

imputer_pipeline = Pipeline([
    ('imputer', ColumnTransformer([
    ('knn_imputer', KNNImputer(), numerical_columns)
    ], remainder='passthrough'))
])

grid_search = GridSearchCV(imputer_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(df, df['Transported'])


In [8]:
best_params = grid_search.best_params_
print("Best parametes are: {}".format(best_params))

imputer = KNNImputer(n_neighbors=best_params['imputer__knn_imputer__n_neighbors'])

df_imputed = pd.DataFrame(imputer.fit_transform(df[numerical_columns]), columns=numerical_columns)

Best parametes are: {'imputer__knn_imputer__n_neighbors': 3}


In [9]:
df_imputed = pd.concat([df_imputed, df.drop(numerical_columns, axis=1)], axis=1)
df = df_imputed.copy()

print(df_imputed.head())
print(df_imputed.isnull().sum())
print(df_imputed['CryoSleep_count'].unique())

    Age  RoomService  ...  CabinSide_count  LastName_count
0  39.0          0.0  ...             4206               1
1  24.0        109.0  ...             4487               4
2  58.0         43.0  ...             4487               6
3  33.0          0.0  ...             4487               6
4  16.0        303.0  ...             4487               6

[5 rows x 17 columns]
Age                      0
RoomService              0
FoodCourt                0
ShoppingMall             0
Spa                      0
VRDeck                   0
Transported              0
CabinDeck                0
HomePlanet_count         0
CryoSleep_count          0
Destination_count        0
VIP_count                0
PassengerGroup_count     0
PassengerNumber_count    0
CabinNum_count           0
CabinSide_count          0
LastName_count           0
dtype: int64
[5656 3037]


After we have imputed all the features that can be predicted with the model, I will replace the "Name" column with the new category - "Unknown"

In [10]:
# new_category = 'Unknown'

# missing_names = df['Name'].isnull()

# df.loc[missing_names, 'Name'] = new_category

Before scaling the parameters we have to make sure that the dataset contain only numerical values

In [11]:
numeric_features = [x for x in df.columns if df[x].dtype != 'object' and df[x].dtype != 'bool']

In [12]:
# Sum up all the amenities costs
df["TotalAmenities"] = df['RoomService'] + df['FoodCourt'] + \
    df['ShoppingMall'] + df['Spa'] + df['VRDeck']


print(df.iloc[16])

print(df.iloc[8667])

Age                        27.0
RoomService              1286.0
FoodCourt                 122.0
ShoppingMall              162.0
Spa                         0.0
VRDeck                      0.0
Transported               False
CabinDeck                     5
HomePlanet_count           1759
CryoSleep_count            5656
Destination_count          1800
VIP_count                  8494
PassengerGroup_count          1
PassengerNumber_count      6217
CabinNum_count               16
CabinSide_count            4206
LastName_count                1
TotalAmenities           1570.0
Name: 16, dtype: object
Age                        29.0
RoomService                 0.0
FoodCourt                2972.0
ShoppingMall              256.0
Spa                        28.0
VRDeck                    188.0
Transported                True
CabinDeck                     4
HomePlanet_count           2131
CryoSleep_count            5656
Destination_count          6097
VIP_count                  8494
PassengerGroup_c

In [13]:
# Replace the dependent variable with 0 and 1
df['Transported'] = df['Transported'].replace({False: 0, True: 1})

df[numeric_features].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,HomePlanet_count,CryoSleep_count,Destination_count,VIP_count,PassengerGroup_count,PassengerNumber_count,CabinNum_count,CabinSide_count,LastName_count
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,28.811381,223.724721,453.421067,172.401472,308.013996,302.005905,4.321293,3532.043138,4741.022087,4721.848039,8304.111009,2.035546,4722.021051,13.690671,4351.041643,10.732889
std,14.368746,662.927512,1599.741507,598.647138,1126.808591,1135.601511,1.760826,1417.754407,1248.728133,2122.69795,1240.664507,1.596347,2385.868889,35.290842,140.434655,33.361785
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1759.0,3037.0,796.0,199.0,1.0,13.0,1.0,4206.0,1.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,3.0,2131.0,3037.0,1800.0,8494.0,1.0,1412.0,4.0,4206.0,3.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,5.0,4803.0,5656.0,6097.0,8494.0,1.0,6217.0,7.0,4487.0,5.0
75%,38.0,48.0,79.0,29.0,60.0,46.0,6.0,4803.0,5656.0,6097.0,8494.0,3.0,6217.0,12.0,4487.0,7.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,7.0,4803.0,5656.0,6097.0,8494.0,8.0,6217.0,227.0,4487.0,218.0


In [14]:
from sklearn.linear_model import LogisticRegression

# Instantiate a copy of the dataframe for the Logistic Regression model
df_lr = df.copy()

# Standardize the numerical features
scaler = StandardScaler()
for col in [col for col in df_lr.columns if col not in ['Transported']]:
    df_lr[col] = scaler.fit_transform(df_lr[col].values.reshape(-1, 1))

# Split the dataset into train and test subsets
y = df_lr['Transported']
X = df_lr.drop("Transported", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=128)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")


F1 score: 0.8062940857297883
Accuracy score: 0.7947096032202415


In [15]:

from sklearn.svm import SVC

# Instantiate a copy of the dataframe for the SVM model
df_svm = df.copy()

# Split the dataset before standardizing it's features
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=128)

# Standardize the numerical features
scaler = StandardScaler()

for col in [col for col in df_svm.columns if col not in ['Transported']]:
    X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

# Training the model
svm_model = SVC(kernel='linear', C=1)

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")


F1 score: 0.7997799779977998
Accuracy score: 0.7906843013225991


In [16]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

df_dl = df.copy()

# Split the dataset before standardizing it's features
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=128)

# Standardize the numerical features
scaler = StandardScaler()

for col in [col for col in df_dl.columns if col not in ['Transported']]:
    X_train[col] = scaler.fit_transform(X_train[col].values.reshape(-1, 1))
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

# Training the deep learning model
dl_model = Sequential()
dl_model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
dl_model.add(Dense(16, activation='relu'))
dl_model.add(Dense(1, activation='sigmoid'))

dl_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

dl_model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

y_pred = dl_model.predict(X_test)

y_pred = (y_pred > 0.5).astype('int32')

print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

TypeError: 'int' object is not iterable