In [1]:
import pandas as pd
import numpy as np

# Import Dataset 

In [2]:
path = './spaceship-titanic/'
data = pd.read_csv(path + 'train.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Data-Preprocess

**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

    We are going to use hot encoding here
    Nan value is replace with the mode

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

    We are going to use binary encoding
    Nan value is replace with the mode

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

    We are going to use hot encoding
    Nan value is replace with the mode
    
**Destination** - The planet the passenger will be debarking to.

    We are going to use hot encoding
    Nan value is replace with the mode

**Age** - The age of the passenger.

    Normalization MinMax
    Nan value is replace with the average

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

    Binary Encoding
    Nan value is replace with the mode

**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

    Normalization MinMax
    Nan values are replaced with the average

**Name** - The first and last names of the passenger.

    This feature is not necessary

**Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

    What we want to predict

### Pre-process (Nan Values)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# fix NaN Value using the mode
imputer = SimpleImputer(strategy='most_frequent')
column_to_impute = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
data[column_to_impute] = imputer.fit_transform(data[column_to_impute])

# remove Name
data.drop(columns = ['Name', 'PassengerId', 'Cabin'], inplace=True)

# fix NaN Value using the mode
imputer = SimpleImputer(strategy='most_frequent')
column_to_impute = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
data[column_to_impute] = imputer.fit_transform(data[column_to_impute])

# fix numerical NaN value using the mean
imputer = SimpleImputer(strategy='mean')
column_to_impute = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data[column_to_impute] = imputer.fit_transform(data[column_to_impute])

### Pre-process (Encoding)

In [5]:
# econding categorical data
hot_encoder_category = OneHotEncoder()
# columns_to_encode = ['HomePlanet', 'Cabin', 'Destination'] # probably the Cabin is too much for HotEncoding
columns_to_encode = ['HomePlanet', 'Destination']
encoded_data = hot_encoder_category.fit_transform(data[columns_to_encode])
encoded_df = pd.DataFrame(encoded_data.toarray(),
                          columns=hot_encoder_category.get_feature_names_out(columns_to_encode),
                          index = data.index)
data.drop(columns = columns_to_encode, inplace=True)
data = pd.concat([data, encoded_df], axis=1)

# encoding binary data (True = 1, False = 0)
columns_to_encode = ['CryoSleep', 'VIP']
data[columns_to_encode] = data[columns_to_encode].astype(float)

# normalized numerical data
scaler = MinMaxScaler()
columns_to_scale = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
scaled_data = scaler.fit_transform(data[columns_to_scale])

scaled_data = pd.DataFrame(scaled_data,
                            columns=columns_to_scale,
                            index=data.index)

for column in columns_to_scale:
    data[column] = scaled_data[column]

### Create dataset for pytorch

In [6]:
import torch
from torch.utils.data import Dataset

# define the Dataset class to be used for pytorch training

class Data(Dataset):

    def __init__(self, 
                 df: pd.DataFrame,
                 device: torch.device = torch.device('cpu')):
        self.device = device
    
        self.X = df.iloc[:, :-1]
        self.Y = df.iloc[:, [-1]].astype(int) # True = 1, False = 0
    
    def __getitem__(self, i:int) -> (torch.tensor, torch.tensor):
        X = self.X.iloc[i].values.astype(np.float32)
        X = torch.tensor(X, device=self.device)
        Y = self.Y.iloc[i].values.astype(np.float32)
        Y = torch.tensor(Y, device=self.device)
        # change dtype to torch.float32 (same for the labels)
        return X, Y
    
    def __len__(self):
        return len(self.X)

## Split the data into train and test set

In [7]:
from torch.utils.data import DataLoader

seed = 42
np.random.seed(seed)
# shuffle the index of the dataset
idx = np.random.permutation(data.index)
# split the dataset into train and test set
train_idx = idx[:int(len(idx) * 0.8)]
test_idx = idx[int(len(idx) * 0.8):]
# create the train and test set
train_data = data.iloc[train_idx].reset_index(drop=True)
test_data = data.iloc[test_idx].reset_index(drop=True)
# create the dataset for pytorch
# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_dataset = Data(train_data, device=device)
test_dataset = Data(test_data, device=device)
# create the dataloader for pytorch
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

# Define the neural network

We create a simple neural network with 3 layers, the first one with 100 neurons, the second one with 50 neurons and the last one with 1 neuron. The first two layers use the LeakyReLU activation function, while the last one uses the Sigmoid activation function.

In [8]:
import torch
import torch.nn as nn

class BinaryClassificationNN(nn.Module):
    def __init__(self, input_size, hidden_size=100):
        super(BinaryClassificationNN, self).__init__()

        # Define the layers
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.LeakyReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu2 = nn.LeakyReLU()
        self.fc3 = nn.Linear(hidden_size // 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        
        # Forward pass through the network
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.sigmoid(out)

        return out

# Train the neural network

### Initialize the neural network

In [9]:
dim_input = train_dataset[0][0].shape[0]
# Create an instance of the binary classification neural network
model = BinaryClassificationNN(input_size=dim_input)
model.to(device)

# Define the loss function (usually Binary Cross-Entropy for binary classification)
criterion = nn.BCELoss()

# Define the optimizer (usually Adam or SGD)
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Define the train function

In [10]:
def train(model, dataloader, criterion, optimizer, num_epochs, seed=42):
    """
    Train a PyTorch model for binary classification.

    Args:
        model (nn.Module): The PyTorch model to train.
        dataloader (DataLoader): DataLoader for your training dataset.
        criterion (nn.Module): The loss function (e.g., BCELoss for binary classification).
        optimizer (torch.optim.Optimizer): The optimizer (e.g., Adam, SGD).
        num_epochs (int): The number of training epochs.
        seed (int): The random seed for reproducibility.

    Returns:
        None (modifies the model in-place).
    """
    model.train()  # Set the model to training mode

    for epoch in range(num_epochs):
        running_loss = 0.0
        
        for inputs, labels in dataloader:
            # to get the same learning result
            torch.manual_seed(seed + epoch) 
            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            
            # Compute the loss
            loss = criterion(outputs, labels)

            # Backpropagation and optimization
            loss.backward()
            optimizer.step()

            # Update the running loss
            running_loss += loss.item()

        # Print the average loss for this epoch
        epoch_loss = running_loss / len(dataloader)
        print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {epoch_loss:.4f}")

    print("Training complete!")

# Usage example:
# train(model, train_dataloader, criterion, optimizer, device, num_epochs)

### Training

In [11]:
num_epochs = 10
train(model, train_loader, criterion, optimizer, num_epochs, seed=seed)

Epoch [1/10] Loss: 0.2078
Epoch [2/10] Loss: 0.0012
Epoch [3/10] Loss: 0.0003
Epoch [4/10] Loss: 0.0001
Epoch [5/10] Loss: 0.0001
Epoch [6/10] Loss: 0.0000
Epoch [7/10] Loss: 0.0000
Epoch [8/10] Loss: 0.0000
Epoch [9/10] Loss: 0.0000
Epoch [10/10] Loss: 0.0000
Training complete!


# Evaluate the model

In [12]:
def evaluate(model, dataloader, criterion):
    """
    Evaluate a PyTorch model for binary classification.

    Args:
        model (nn.Module): The PyTorch model to evaluate.
        dataloader (DataLoader): DataLoader for your evaluation dataset.
        criterion (nn.Module): The loss function (e.g., BCELoss for binary classification).
        device (torch.device): The device on which to run the evaluation.

    Returns:
        float: The average loss over the evaluation dataset.
        float: The accuracy over the evaluation dataset.
    """
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # Deactivate gradients for the following code
        for inputs, labels in dataloader:

            # Forward pass and loss computation
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Convert the probabilities to integers (0 or 1)
            predicted_classes = (outputs >= 0.5).int().float()

            # Count the correct predictions
            correct_predictions += (predicted_classes == labels).sum().item()
            total_predictions += len(labels)

    # Compute the loss and accuracy
    avg_loss = running_loss / len(dataloader)
    acc = correct_predictions / total_predictions

    print(f"Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

    return avg_loss, acc

In [13]:
evaluate(model, test_loader, criterion)

Loss: 0.0000, Accuracy: 1.0000


(1.0121378434989997e-05, 1.0)

# Evaluate on the submition test dataset

In [14]:
test_data = pd.read_csv(path + 'test.csv')
passenger_id = test_data['PassengerId']

### Pre-process (Nan Values)

In [15]:
from sklearn.impute import SimpleImputer

# fix NaN Value using the mode
imputer = SimpleImputer(strategy='most_frequent')
column_to_impute = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
test_data[column_to_impute] = imputer.fit_transform(test_data[column_to_impute])

# remove Name
test_data.drop(columns = ['Name', 'PassengerId', 'Cabin'], inplace=True)

# fix NaN Value using the mode
imputer = SimpleImputer(strategy='most_frequent')
column_to_impute = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
test_data[column_to_impute] = imputer.fit_transform(test_data[column_to_impute])

# fix numerical NaN value using the mean
imputer = SimpleImputer(strategy='mean')
column_to_impute = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_data[column_to_impute] = imputer.fit_transform(test_data[column_to_impute])

### Pre-process (Encoding)

Use the same encoded used for the train dataset

In [16]:
# econding categorical data
columns_to_encode = ['HomePlanet', 'Destination'] # probably the Cabin is too much for HotEncoding
encoded_data = hot_encoder_category.transform(test_data[columns_to_encode])
encoded_df = pd.DataFrame(encoded_data.toarray(),
                          columns=hot_encoder_category.get_feature_names_out(columns_to_encode),
                          index = test_data.index)

test_data.drop(columns = columns_to_encode, inplace=True)
test_data = pd.concat([test_data, encoded_df], axis=1)

# encoding binary data (True = 1, False = 0)
columns_to_encode = ['CryoSleep', 'VIP']
test_data[columns_to_encode] = test_data[columns_to_encode].astype(float)

# normalized numerical data
columns_to_scale = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
scaled_data = scaler.transform(test_data[columns_to_scale])

scaled_data = pd.DataFrame(scaled_data,
                           columns=columns_to_scale,
                           index=test_data.index)

for column in columns_to_scale:
    test_data[column] = scaled_data[column]

### Create dataset for pytorch

In [17]:
class PredictData(Dataset):

    def __init__(self,
                 df: pd.DataFrame,
                 device: torch.device = torch.device('cpu')):
        self.device = device

        self.X = df.iloc[:, :]

    def __getitem__(self, i:int) -> (torch.tensor, torch.tensor):
        X = self.X.iloc[i].values.astype(np.float32)
        X = torch.tensor(X, device=self.device)
        # change dtype to torch.float32 (same for the labels)
        return X

    def __len__(self):
        return len(self.X)

In [18]:
test_dataset = PredictData(test_data, device=device)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

### Create dataset to send

In [19]:
def create_dataset_prediction(model, dataloader, indices):
    
    model.eval()  # Set the model to evaluation mode
    predictions = {}
    with torch.no_grad():  # Deactivate gradients for the following code
        for inputs, id in zip(dataloader, indices):
            inputs = inputs[0]
            # Forward pass and loss computation
            outputs = model(inputs)
            
            # Convert the probabilities to integers (0 or 1)
            predicted_classes = (outputs >= 0.5).int()
            
            predictions[id] = bool(predicted_classes)

    return predictions

In [20]:
predictions = create_dataset_prediction(model, test_loader, passenger_id)

### Create the submission file

In [21]:
predictions_df = pd.DataFrame(predictions.items(), columns=['PassengerId', 'Transported'])
predictions_df.set_index('PassengerId', inplace=True)

In [22]:
predictions_df

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,False
0018_01,False
0019_01,False
0021_01,True
0023_01,True
...,...
9266_02,False
9269_01,True
9271_01,False
9273_01,False


### Save the submission file

In [23]:
predictions_df.to_csv('submission.csv')