# Spaceship Titanic Kaggle Competition


1. imports
2. modules
3. load data
4. explore data
5. preprocess data
6. train model
7. evaluate model
8. write submission.csv

## Imports

In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os

## Custom Modules

### Custom Modules

## Load Training and Test Data

In [37]:

# Locate the dataset: prefer the Kaggle input mount, otherwise fall back to
# the local copy. The environment is chosen by checking that the files exist
# (rather than by catching a failed read), so each CSV is read exactly once.
kaggle_train_path = '/kaggle/input/spaceship-titanic/train.csv'
kaggle_test_path = '/kaggle/input/spaceship-titanic/test.csv'

if os.path.exists(kaggle_train_path) and os.path.exists(kaggle_test_path):
    train_data_path = kaggle_train_path
    test_data_path = kaggle_test_path
    print("Running notebook in Kaggle environment")
else:
    # Fall back to local dataset
    train_data_path = "./data/spaceship-titanic/train.csv"
    test_data_path = "./data/spaceship-titanic/test.csv"
    print("Running notebook in local environment")

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

Running notebook in local environment


In [38]:
# Dimensions of the training frame as (rows, columns).
train.shape

(8693, 14)

In [39]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [40]:
# Per-column non-null counts and dtypes, plus overall memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [42]:
# Number of distinct values in each column.
train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

In [43]:
# Count the missing entries in every column.
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [44]:
# Columns stored with the generic object dtype are treated as categorical.
train_categorical_cols = list(train.select_dtypes(include=['object']).columns)
print(train_categorical_cols)

['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']


In [45]:
# Columns with a 64-bit integer or float dtype are treated as numerical.
train_numerical_cols = list(train.select_dtypes(include=['int64', 'float64']).columns)
print(train_numerical_cols)

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [46]:
# Show the passengers whose Name is missing.
train.loc[train['Name'].isna()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
27,0022_01,Mars,False,D/0/P,TRAPPIST-1e,21.0,False,980.0,2.0,69.0,0.0,0.0,,False
58,0064_01,Mars,True,F/14/S,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True
65,0069_01,Earth,False,F/16/S,TRAPPIST-1e,42.0,False,887.0,0.0,9.0,6.0,0.0,,True
77,0082_03,Mars,False,F/16/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,,True
101,0108_02,Earth,False,G/19/S,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8629,9205_02,Europa,True,B/300/P,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True
8631,9208_01,Earth,True,G/1485/S,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,,True
8636,9218_01,Europa,True,B/353/S,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,,True
8652,9230_01,Europa,False,C/342/S,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,,True


## Preprocess Data

In [47]:
# Label-encode the categorical columns while keeping missing values missing.
#
# Feeding NaN to LabelEncoder turns "missing" into an ordinary integer class,
# which hides those gaps from every later isna()/dropna() check. Only the
# observed values are encoded here; missing entries stay NaN (the encoded
# columns are therefore float64).
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, so the same mapping can be reused later
# (e.g. for inverse_transform or for encoding other data consistently).
label_encoders = {}

for col in train_categorical_cols:
    present = train[col].notna()

    le = LabelEncoder()
    codes = pd.Series(np.nan, index=train.index, dtype='float64')
    codes.loc[present] = le.fit_transform(train.loc[present, col])

    train[col] = codes
    label_encoders[col] = le

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0,1,0,149,2,39.0,0,0.0,0.0,0.0,0.0,0.0,5252,False
1,1,0,0,2184,2,24.0,0,109.0,9.0,25.0,549.0,44.0,4502,True
2,2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,457,False
3,3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,7149,False
4,4,0,0,2186,2,16.0,0,303.0,70.0,151.0,565.0,2.0,8319,True


In [49]:


# Flag every row that has at least one missing value, then split on that flag
rows_with_nan = train.isna().any(axis=1)
df_no_missing = train.loc[~rows_with_nan]
df_with_missing = train.loc[rows_with_nan]

# Standardize the complete rows (important for autoencoders)
scaler = StandardScaler()
scaler.fit(df_no_missing)
df_no_missing_scaled = scaler.transform(df_no_missing)


In [50]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# The autoencoder reconstructs its own input, so input and output widths
# both equal the number of scaled feature columns.
n_features = df_no_missing_scaled.shape[1]

input_layer = Input(shape=(n_features,))

# Encoder: 128 -> 64 units, ReLU
encoder = input_layer
for units in (128, 64):
    encoder = Dense(units, activation='relu')(encoder)

# Bottleneck: 32 units
latent_space = Dense(32, activation='relu')(encoder)

# Decoder mirrors the encoder: 64 -> 128 units, ReLU
decoder = latent_space
for units in (64, 128):
    decoder = Dense(units, activation='relu')(decoder)

# Linear output so the reconstruction can take any real value
output_layer = Dense(n_features, activation='linear')(decoder)

# Build and compile the model, then print its layer summary
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.summary()

ModuleNotFoundError: No module named 'tensorflow'

## Train Model

## Evaluate Model

## Create Submission