# Space Ship Titanic Kaggle Competition


1. imports
2. modules
3. load data
4. explore data
5. preprocess data
6. train model
7. evaluate model
8. write submission.csv

In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [2]:
# Clear any logs from previous runs
!rm -rf ./logs/

## Imports

In [3]:
# Standard Libraries
import numpy as np
import pandas as pd
from itertools import product
from datetime import datetime

# Scikit-Learn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# TensorFlow and Keras
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers, layers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import keras_tuner
from kerastuner.tuners import RandomSearch


# Plotting
import matplotlib.pyplot as plt

# Typing
from typing import Tuple, Dict,List
from numpy import ndarray
from pandas import DataFrame


2023-09-15 14:00:27.535739: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from kerastuner.tuners import RandomSearch


## Custom Modules

### Fill String Cols

In [4]:
def fill_string_col(df: DataFrame, cols: List[str], default: str = 'Unknown') -> DataFrame:
    for col in cols:
        df[col] = df[col].fillna(default)
    return df

### Fill Most Common

In [5]:
def fill_most_common(df: DataFrame, cols: list) -> DataFrame:
    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df

### Fill

### Fill Average Round

In [6]:
def fill_average_round(df: DataFrame, cols: list) -> DataFrame:
    for col in cols:
        df[col] = df[col].fillna(round(df[col].mean()))
    return df

### Split Data on Char

In [7]:
def split_data_on_char(df: DataFrame, col: str, split_on_char: str) -> DataFrame:
    """
    Splits a DataFrame column into multiple columns based on a character.
    Leaves NaN values unchanged.

    Parameters:
        df (DataFrame): Input DataFrame
        col (str): Column name to split
        split_on_char (str): Character to split on

    Returns:
        DataFrame: DataFrame with new columns
    """
    # Split the column
    split_cols = df[col].str.split(split_on_char, expand=True)
    split_cols.columns = [f"{col}_{i+1}" for i in range(split_cols.shape[1])]

    # Drop the original column and concatenate the new columns
    df = pd.concat([df.drop(columns=[col]), split_cols], axis=1)

    return df


### Set Transported to binary

In [8]:
def set_transported_to_binary(df: DataFrame) -> DataFrame:
    if "Transported" in df.columns:
        df["Transported"] = df["Transported"].apply(lambda x: 1 if x == "True" else 0)
    return df


### Downcast DataFrame

In [9]:
def downcast_dataframe(df):
    """
    Downcast the columns of a Pandas DataFrame to the most efficient data types.
    
    Parameters:
    - df: Pandas DataFrame
    
    Returns:
    - DataFrame with downcasted columns
    """
    df_downcasted = df.copy()
    
    # Downcast int and float types
    for col in df_downcasted.select_dtypes(include=['int64', 'float64']).columns:
        if pd.api.types.is_integer_dtype(df_downcasted[col]):
            df_downcasted[col] = pd.to_numeric(df_downcasted[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df_downcasted[col]):
            df_downcasted[col] = pd.to_numeric(df_downcasted[col], downcast='float')
    
    # Downcast object types to category if unique values are less than 50% of total values
    for col in df_downcasted.select_dtypes(include=['object']).columns:
        num_unique_values = len(df_downcasted[col].unique())
        num_total_values = len(df_downcasted[col])
        if num_unique_values / num_total_values < 0.5:
            df_downcasted[col] = df_downcasted[col].astype('category')
            
    return df_downcasted


### Label Encode

In [10]:
def label_encode(df: DataFrame) -> Tuple[DataFrame, Dict[str, LabelEncoder]]:
    label_encoders = {}
    le = LabelEncoder()
    categorical_cols = df.select_dtypes(include=["object"]).columns
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    return df, label_encoders

def label_encode_2(df: DataFrame, cols: List[str]) -> Tuple[DataFrame, Dict[str, LabelEncoder]]:
    label_encoders = {}
    le = LabelEncoder()
    for col in cols:
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    return df, label_encoders

### One Hot Encode

In [11]:
def one_hot_encode(df: DataFrame, cols: List[str]) -> DataFrame:
    """
    Perform one-hot encoding on the specified columns of a DataFrame.
    
    Parameters:
    - df: Pandas DataFrame
    - columns_to_encode: list of column names to one-hot encode
    
    Returns:
    - DataFrame with one-hot encoded columns
    """
    # Perform one-hot encoding
    df_encoded = pd.get_dummies(df, columns=cols, drop_first=False)
    
    return df_encoded


### Separate Data

In [12]:
def separate_data(df: DataFrame) -> (DataFrame, DataFrame):
    df_no_missing = df.dropna()
    df_with_missing = df.loc[df.isna().any(axis=1)]
    return df_no_missing, df_with_missing


### Scale Features

In [13]:
def scale_features(
    df: DataFrame, target_col: str = None
) -> (DataFrame, StandardScaler):
    """
    Scales the features of a DataFrame.

    Parameters:
        df (DataFrame): The DataFrame to scale.
        target_col (str): The target column to exclude from scaling.

    Returns:
        DataFrame: The scaled DataFrame.
        StandardScaler: The scaler used for scaling.
    """
    scaler = StandardScaler()

    # Separate target if it exists
    if target_col and target_col in df.columns:
        target = df[target_col]
        features = df.drop(columns=[target_col])
    else:
        features = df

    # Scale features
    features_scaled = scaler.fit_transform(features)

    # Combine features and target back into a single DataFrame
    if target_col and target_col in df.columns:
        df_scaled = DataFrame(features_scaled, columns=features.columns, index=df.index)
        df_scaled[target_col] = target
    else:
        df_scaled = DataFrame(features_scaled, columns=features.columns, index=df.index)

    return df_scaled, scaler


### Train Autoencoder

In [14]:
def train_autoencoder(df_no_missing_scaled: DataFrame) -> Model:
    input_layer = Input(shape=(df_no_missing_scaled.shape[1],))
    encoder = Dense(128, activation="relu")(input_layer)
    encoder = Dense(64, activation="relu")(encoder)
    latent_space = Dense(32, activation="relu")(encoder)
    decoder = Dense(64, activation="relu")(latent_space)
    decoder = Dense(128, activation="relu")(decoder)
    output_layer = Dense(df_no_missing_scaled.shape[1], activation="linear")(decoder)
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer="adam", loss="mean_squared_error")
    autoencoder.fit(
        df_no_missing_scaled, df_no_missing_scaled, epochs=50, batch_size=128, verbose=0
    )
    return autoencoder


### Impute Missing Values

In [15]:
def impute_missing_values(
    df_with_missing: DataFrame, autoencoder: Model, scaler: StandardScaler
) -> DataFrame:
    # Scale the feature set with missing values
    df_with_missing_scaled = scaler.transform(df_with_missing.fillna(0))

    # Use the autoencoder to predict the missing values
    imputed_values_scaled = autoencoder.predict(df_with_missing_scaled)

    # Inverse transform to get the original feature values
    imputed_values = scaler.inverse_transform(imputed_values_scaled)

    # Replace the missing values in the original DataFrame
    df_imputed = df_with_missing.copy()
    is_null = df_with_missing.isna()
    for i in range(df_with_missing.shape[0]):
        for j in range(df_with_missing.shape[1]):
            if is_null.iloc[i, j]:
                df_imputed.iloc[i, j] = imputed_values[i, j]

    return df_imputed


### PreProcess

In [16]:
from typing import Tuple


def preprocess(
    df: DataFrame, target_col: str = "Transported"
) -> Tuple[DataFrame, Dict[str, LabelEncoder]]:
    
    # Handle missing names
    df = fill_string_col(df, ['Name'], 'Unknown')


    # Split cabin column
    df = split_data_on_char(df, "Cabin", "/")

    # fill mode
    mode_fill_cols = ['HomePlanet', 'CryoSleep', 'Destination','VIP', 'Cabin_1','Cabin_2','Cabin_3']
    df = fill_most_common(df, mode_fill_cols)

    # fill average
    fill_average_cols = ['Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    df = fill_average_round(df, fill_average_cols)

    # Label Encode
    df, label_encoders = label_encode_2(df, ['Destination','HomePlanet','Name','Cabin_1','Cabin_3'])

    # One-Hot Encode
    df = one_hot_encode(df, ['Destination','HomePlanet','Cabin_3'])
    # data type mapping
    mapping = {
        "PassengerId": "category",
        "CryoSleep": "int8",
        "Age": "int16",
        "VIP": "int8",
        "RoomService": "int16",
        "FoodCourt": "int16",
        "ShoppingMall": "int16",
        "Spa": "int16",
        "VRDeck": "int16",
        "Name": "int16",
        "Cabin_1": "int8",
        "Cabin_2": "int16",
        "Destination_0": "int8",
        "Destination_1": "int8",
        "Destination_2": "int8",
        "HomePlanet_0": "int8",
        "HomePlanet_1": "int8",
        "HomePlanet_2": "int8",
        "Cabin_3_0": "int8",
        "Cabin_3_1": "int8",
    }

    df = df.astype(mapping)

    # Downcast
    df = downcast_dataframe(df)

    # Separate out the target column
    if target_col in df.columns:
        target = df[target_col]
        features = df.drop(columns=[target_col])
    else:
        features = df

    # # If the target exists, add it back
    if target_col in df.columns:
        features[target_col] = target

    return features, label_encoders



### Plot Training History

In [17]:
def plot_training_history_with_metrics(history, y_true, y_pred_proba):
    """
    Plots the training and validation loss, accuracy, and provides interpretations notes.
    Also calculates F1 Score and ROC AUC Score.

    Parameters:
        history (History): History object returned from model training.
        y_true (array-like): True labels for the validation set.
        y_pred_proba (array-like): Predicted probabilities for the positive class.

    """

    # Extract training and validation loss
    loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    # Extract training and validation accuracy if it exists
    accuracy = history.history.get("accuracy")
    val_accuracy = history.history.get("val_accuracy")

    # Create figure and axis
    fig, ax1 = plt.subplots()

    # Plot loss
    color = "tab:red"
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss", color=color)
    ax1.plot(loss, label="Train Loss", color=color)
    ax1.tick_params(axis="y", labelcolor=color)

    # Create another y-axis for the accuracies
    ax2 = ax1.twinx()

    color = "tab:blue"
    ax2.set_ylabel("Accuracy", color=color)

    # Plot accuracy if it exists
    if accuracy and val_accuracy:
        ax2.plot(accuracy, label="Train Acc", color=color)
        ax2.tick_params(axis="y", labelcolor=color)

    # Plot validation data if it exists
    if val_loss:
        ax1.plot(val_loss, label="Val Loss", color="tab:orange")

    if val_accuracy:
        ax2.plot(val_accuracy, label="Val Acc", color="tab:cyan")

    # Add legends
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    lines = lines1 + lines2
    labels = labels1 + labels2
    ax1.legend(lines, labels, loc=0)

    plt.show()

    # Additional Metrics
    y_pred = np.round(y_pred_proba)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    print(f"\nAdditional Metrics:")
    print(f"F1 Score: {f1}")
    print(f"ROC AUC Score: {roc_auc}")
    print()

    # Interpretation
    print("Interpreting the Graph:")
    print(
        "- F1 Score: 2 * (precision * recall) / (precision + recall).  The F1 Score is the harmonic mean of precision and recall. Precision measures how many of the items identified as positive are actually positive, while recall measures how many of the actual positive items are identified correctly."
    )
    print(
        "- ROC AUC Score: The Area Under the Receiver Operating Characteristic Curve (ROC AUC) is a measure of how well a model can distinguish between classes. An ROC AUC of 0.5 means the model is randomly guessing, while an ROC AUC of 1.0 means the model is perfectly distinguishing between positive and negative classes."
    )
    print("- A lower 'Loss' means the model is performing better.")
    print(
        "- 'Train Loss' and 'Val Loss' should follow a similar trend. If 'Val Loss' starts increasing while 'Train Loss' continues decreasing, it's likely the model is overfitting."
    )
    print(
        "- Higher 'Accuracy' is generally better, but be cautious if accuracy is very high as it might be a sign of overfitting or a too-simple model."
    )
    print(
        "- 'Train Acc' and 'Val Acc' should also follow similar trends. If 'Val Acc' plateaus or starts decreasing, consider stopping the training or modifying the model."
    )


### Create Submission Dataframe

In [18]:
def create_submission_dataframe(
    y_test_pred: ndarray, passenger_ids: ndarray, threshold: float = 0.5
) -> DataFrame:
    """
    Create a submission DataFrame.

    Parameters:
        y_test_pred (np.ndarray): Model predictions.
        passenger_ids (np.ndarray): IDs corresponding to the test set.
        threshold (float): Threshold for classifying as 1.

    Returns:
        DataFrame: Submission DataFrame.
    """
    # Convert probabilities to binary outputs
    y_test_pred_binary = (y_test_pred > threshold).astype(int).flatten()

    # Create a DataFrame for submission
    submission_df = DataFrame(
        {"PassengerId": passenger_ids, "Transported": y_test_pred_binary}
    )

    return submission_df


## Load Test and training Data

In [19]:
try:
    # Try to load Kaggle dataset
    train_data_path = "/kaggle/input/spaceship-titanic/train.csv"
    test_data_path = "/kaggle/input/spaceship-titanic/test.csv"

    train = pd.read_csv(train_data_path)
    test = pd.read_csv(test_data_path)
    print("Running notebook in Kaggle environment")

except FileNotFoundError:
    # Fall back to local dataset
    train_data_path = "./data/spaceship-titanic/train.csv"
    test_data_path = "./data/spaceship-titanic/test.csv"
    print("Running notebook in local environment")


train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)


Running notebook in local environment


In [20]:
print(f"train shape:{train.shape}")
print(f"test shape:{test.shape}")


train shape:(8693, 14)
test shape:(4277, 13)


In [21]:
print(f"train head:")
train.head()


train head:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [22]:
print(f"test head: {test.head()}")


test head:   PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0013_01      Earth      True  G/3/S  TRAPPIST-1e  27.0  False   
1     0018_01      Earth     False  F/4/S  TRAPPIST-1e  19.0  False   
2     0019_01     Europa      True  C/0/S  55 Cancri e  31.0  False   
3     0021_01     Europa     False  C/1/S  TRAPPIST-1e  38.0  False   
4     0023_01      Earth     False  F/5/S  TRAPPIST-1e  20.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck              Name  
0          0.0        0.0           0.0     0.0     0.0   Nelly Carsoning  
1          0.0        9.0           0.0  2823.0     0.0    Lerome Peckers  
2          0.0        0.0           0.0     0.0     0.0   Sabih Unhearfus  
3          0.0     6652.0           0.0   181.0   585.0  Meratz Caltilter  
4         10.0        0.0         635.0     0.0     0.0   Brence Harperez  


In [23]:
print(f"train info: {train.info()}")
print(f"test info: {test.info()}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
train info: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column       

In [24]:
print(f"train describe: {train.describe()}")
print(f"test describe: {test.describe()}")


train describe:                Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   
mean     28.827930    224.687617    458.077203    173.729169    311.138778   
std      14.489021    666.717663   1611.489240    604.696458   1136.705535   
min       0.000000      0.000000      0.000000      0.000000      0.000000   
25%      19.000000      0.000000      0.000000      0.000000      0.000000   
50%      27.000000      0.000000      0.000000      0.000000      0.000000   
75%      38.000000     47.000000     76.000000     27.000000     59.000000   
max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000   

             VRDeck  
count   8505.000000  
mean     304.854791  
std     1145.717189  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%       46.000000  
max    24133.000000  
test describe:                Age   RoomService     FoodCourt  ShoppingMall        

In [25]:
print(f"train nunique: {train.nunique()}")
print(f"test nunique: {test.nunique()}")


train nunique: PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64
test nunique: PassengerId     4277
HomePlanet         3
CryoSleep          2
Cabin           3265
Destination        3
Age               79
VIP                2
RoomService      842
FoodCourt        902
ShoppingMall     715
Spa              833
VRDeck           796
Name            4176
dtype: int64


In [26]:
print(f"train isnull: {train.isnull().sum()}")
print(f"test isnull: {test.isnull().sum()}")


train isnull: PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
test isnull: PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [27]:
print(f"train nulls: {train.isnull().sum()}")
print(f"test nulls: {test.isnull().sum()}")

train nulls: PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
test nulls: PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [28]:
print(f"train nulls: {train.isnull().sum() / train.shape[0]}")
print(f"test nulls: {test.isnull().sum() / test.shape[0]}")

train nulls: PassengerId     0.000000
HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Name            0.023007
Transported     0.000000
dtype: float64
test nulls: PassengerId     0.000000
HomePlanet      0.020341
CryoSleep       0.021744
Cabin           0.023381
Destination     0.021510
Age             0.021277
VIP             0.021744
RoomService     0.019172
FoodCourt       0.024784
ShoppingMall    0.022913
Spa             0.023615
VRDeck          0.018705
Name            0.021978
dtype: float64


In [29]:
# get all categorical columns
train_categorical_cols = train.select_dtypes(include=["object"]).columns.tolist()
print(train_categorical_cols)


['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']


## Categorical Columns Strategy

### PassengerId

- Unique Identifier
- No encoding

### HomePlanet

- fill missing with mode
- One-Hot Encode

### CryoSleep

- Fill missing with mode
- Binary

### Cabin

Cabin seems like an interesting feature.  There is three fields in Cabin 'B/0/P' 'char'/'int8'/'char'

- Split data on '/'
  - Cabin_1 = 'char'
  - Cabin_2 = 'int8'
  - Cabin_3 = 'char'
- Fill NaN
  - Cabin_1 = Most Frequent
  - Cabin_2 = Median
  - Cabin_3 = Most Frequent
- Encode
  - Need to see the distributions of of the newly created features.

### Destination

- Fill missing with mode
- Label Encode

### VIP

- Fill missing with median
- Binary

### Name
- fill missing with 'Unknown'
- Label Encode
- Suspect column will not be relevant

In [30]:



mode_fill_cols = ['HomePlanet', 'CryoSleep', 'Destination','VIP']
train = fill_most_common(train, mode_fill_cols)
test = fill_most_common(test, mode_fill_cols)

train = fill_string_col(train, ['Name'])
test = fill_string_col(test, ['Name'])

In [31]:
train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name              0
Transported       0
dtype: int64

In [32]:
# # Split Cabin
# train = split_data_on_char(train, "Cabin", "/")
# test = split_data_on_char(test, "Cabin", "/")

In [33]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [34]:
train.tail()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False
8692,9280_02,Europa,False,E/608/S,TRAPPIST-1e,44.0,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True


In [35]:
train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8474
Transported        2
dtype: int64

In [36]:
print(f"train nulls: {train.isnull().sum() / train.shape[0]}")
print(f"test nulls: {test.isnull().sum() / test.shape[0]}")

train nulls: PassengerId     0.000000
HomePlanet      0.000000
CryoSleep       0.000000
Cabin           0.022892
Destination     0.000000
Age             0.020591
VIP             0.000000
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Name            0.000000
Transported     0.000000
dtype: float64
test nulls: PassengerId     0.000000
HomePlanet      0.000000
CryoSleep       0.000000
Cabin           0.023381
Destination     0.000000
Age             0.021277
VIP             0.000000
RoomService     0.019172
FoodCourt       0.024784
ShoppingMall    0.022913
Spa             0.023615
VRDeck          0.018705
Name            0.000000
dtype: float64


### Cabin_1

- fill with mode
- One-Hot Encode

### Cabin_2

- fill with mode
- int64 maybe smaller depending on width

### Cabin_3

- fill with mode
- label encode
- int8

In [37]:
# Get all numerical columns
# train_numerical_cols = train.select_dtypes(
#     include=["int64", "float64"]
# ).columns.tolist()
# print(train_numerical_cols)


## Numerical Columns Strategy

### Age

- Fill with average whole number
- float32

### RoomService

- Fill with average whole number
- float64


### FoodCourt

- Fill with average whole number
- float64


### ShoppingMall

- Fill with average whole number
- float64

### Spa

- Fill with average whole number
- float64

### VRDeck

- Fill with average whole number
- float64

In [38]:
# fill_average_cols = ['Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# train = fill_average_round(train, fill_average_cols)
# test = fill_average_round(test, fill_average_cols)

In [39]:
# train.dtypes

## PreProcess Test and Training Datasets

In [40]:
# Step 1: Preprocess the data to fill missing values
train_processed, train_label_encoder = preprocess(train)
test_processed, test_label_encoder = preprocess(test)


In [41]:
# display all columns
pd.set_option("display.max_columns", None)
train_processed.head()


Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Cabin_1,Cabin_2,Destination_0,Destination_1,Destination_2,HomePlanet_0,HomePlanet_1,HomePlanet_2,Cabin_3_0,Cabin_3_1,Transported
0,0001_01,0,39,0,0,0,0,0,0,5252,1,0,0,0,1,0,1,0,1,0,False
1,0002_01,0,24,0,109,9,25,549,44,4502,5,0,0,0,1,1,0,0,0,1,True
2,0003_01,0,58,1,43,3576,0,6715,49,457,0,0,0,0,1,0,1,0,0,1,False
3,0003_02,0,33,0,0,1283,371,3329,193,7149,0,0,0,0,1,0,1,0,0,1,False
4,0004_01,0,16,0,303,70,151,565,2,8320,5,1,0,0,1,1,0,0,0,1,True


In [42]:
test_processed.shape
test_processed.dtypes

PassengerId      category
CryoSleep            int8
Age                 int16
VIP                  int8
RoomService         int16
FoodCourt           int16
ShoppingMall        int16
Spa                 int16
VRDeck              int16
Name                int16
Cabin_1              int8
Cabin_2             int16
Destination_0        int8
Destination_1        int8
Destination_2        int8
HomePlanet_0         int8
HomePlanet_1         int8
HomePlanet_2         int8
Cabin_3_0            int8
Cabin_3_1            int8
dtype: object

## Call Backs

##### Early Stopping

In [43]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=6,
    verbose=1,
    mode='auto'
)

##### Tensorboard

## Features

In [44]:
# List of features to include
selected_features = [
    "CryoSleep",
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "VIP",
    "Spa",
    "VRDeck",
    "Cabin_1",
    "Cabin_2",
    "Cabin_3_0",
    "Cabin_3_1",
    "Destination_0",
    "Destination_1",
    "Destination_2",
    "HomePlanet_0",
    "HomePlanet_1",
    "HomePlanet_2",
]


## Data Splitting

In [45]:

# Split features and target variable from training data
x_train_all = train_processed.drop("Transported", axis=1)
y_train = train_processed["Transported"]

# Filter the training data to keep only selected features
x_train = x_train_all[selected_features]

# Assuming test_processed contains all potential features
x_test_all = test_processed

# Keep only columns also present in the training set
x_test = x_test_all[x_train.columns]


## Data Scaling

In [59]:
# Scale features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train_scaled = x_train
x_test_scaled = x_test

## Model Architecture

In [60]:
class MyHyperModel(keras_tuner.HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape
    
    def build(self, hp: Dict[str, float]) -> Model:
        model = Sequential()
        for i in range(hp.Int("num_layers", 1, 3)):
            kwargs = {
                "units": hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                "activation": hp.Choice("activation", ["relu", "tanh"]),
            }
            if i == 0:
                kwargs["input_shape"] = (self.input_shape,)
            model.add(layers.Dense(**kwargs))

        if hp.Boolean("dropout"):
            model.add(layers.Dropout(rate=0.25))

        learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")

        model.add(layers.Dense(1, activation="sigmoid"))

        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    def fit(self, hp, model, x, y, **kwargs):
        if hp.Boolean("normalize"):
            x = layers.Normalization()(x)
        return model.fit(
            x,
            y,
            # Tune whether to shuffle the data in each epoch.
            shuffle=hp.Boolean("shuffle"),
            **kwargs,
        )



## Model Tuning

In [61]:

input_shape = x_train_scaled.shape[1]  # Number of features in the dataset

hp = keras_tuner.HyperParameters()
hypermodel = MyHyperModel(input_shape=input_shape)

tuner = RandomSearch(
    hypermodel=hypermodel,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=3,
    overwrite=True,
    directory="models",
    project_name="sst",
)


In [62]:
tuner.search_space_summary()

Search space summary
Default search space size: 5
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 3, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
dropout (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}


In [63]:
log_dir = "logs/tune/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

tuner.search(x_train_scaled, y_train, epochs=2, validation_split=0.2, callbacks=[tensorboard_callback, early_stopping])

# Get the top 2 models.
models = tuner.get_best_models(num_models=2)
best_model = models[0]

# Build the model.
best_model.build(input_shape=input_shape)
best_model.summary()

tuner.results_summary()

# Get the top 2 hyperparameters.
best_hps = tuner.get_best_hyperparameters(2)

# Build the model with the best hp
input_shape = x_train_scaled.shape[1]
model = hypermodel.build(best_hps[0])


# Fit with the entire dataset.
# Update these lines to match your refactored variable names
x_all = np.concatenate((x_train_scaled, x_train_scaled))  # Assuming x_val_scaled exists
y_all = np.concatenate((y_train, y_train))  # Assuming y_val exists

log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
model.fit(x=x_all, y=y_all, epochs=10, callbacks=[tensorboard_callback])


Trial 10 Complete [00h 00m 05s]
val_accuracy: 0.7341383894284567

Best val_accuracy So Far: 0.7906842827796936
Total elapsed time: 00h 00m 58s
INFO:tensorflow:Oracle triggered exit
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 192)               3648      
                                                                 
 dense_1 (Dense)             (None, 288)               55584     
                                                                 
 dropout (Dropout)           (None, 288)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 289       
                                                                 
Total params: 59521 (232.50 KB)
Trainable params: 59521 (232.50 KB)
Non-trainable params: 0 (0.00 Byte)
___________________________________

<keras.src.callbacks.History at 0x14a50b790>

## Evaluate Model

### Evaluate model tuning

In [67]:
%tensorboard --logdir logs/

Reusing TensorBoard on port 6006 (pid 93337), started 0:04:02 ago. (Use '!kill 93337' to kill it.)

## Make Predictions

In [65]:
# Make predictions on the test set (scaled)
y_test_pred = model.predict(x_test_scaled)
# Make predictions on the train set (scaled)
y_train_pred = model.predict(x_train_scaled)






## Create Submission

In [66]:
# Create the submission DataFrame
submission_df = create_submission_dataframe(y_test_pred, test["PassengerId"])

# Set Transported to boolean
submission_df["Transported"] = submission_df["Transported"].apply(
    lambda x: True if x == 1 else False
)
# To save the DataFrame as a CSV file
submission_df.to_csv("submission.csv", index=False)
