In [4]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)

from torch import nn
from torchinfo import summary
from torch.optim import (Optimizer, Adam)
from torch.nn.functional import cross_entropy
from torchmetrics import Accuracy

import matplotlib.pyplot as plt

In [5]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset.
filepath = "Titanic-Dataset.csv"
# Handle of the Kaggle dataset that contains the file.
dataset_handle = "yasserh/titanic-dataset"

# Read the CSV through kagglehub's pandas adapter.
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, filepath)

In [6]:
"""

We keep only the columns (factors) we want to model,
and then clean them up into workable datatypes.


Columns:
    Survived: Whether or not the passenger survived.
    Pclass: The passenger's class.
    Sex: The passenger's sex.
    Age: The passenger's age.
    SibSp: # of siblings / spouses aboard.
    Parch: # of parents / children aboard.
    
"""

# Keep only the modelling columns, with the target (Survived) last.
# Take an explicit copy so that later column assignments modify an independent
# frame instead of a slice of the raw data (avoids chained-assignment warnings).
df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']].copy()

In [7]:
# Encode Sex numerically: male -> 1, female -> 2.
sex_codes = {"male": 1, "female": 2}

# Build a new frame rather than mutating `df` column by column:
#   * Only the Sex column is recoded. A frame-wide replace would touch any
#     column containing these strings and depends on pandas silently
#     downcasting object columns, which is deprecated.
#   * Values that are already encoded pass through unchanged, so re-running
#     this cell is harmless.
#   * The explicit int64 cast fails loudly if an unexpected or missing value
#     is present in Sex.
#   * Missing ages are imputed with the mean age.
df = df.assign(
    Sex=df['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64'),
    Age=df['Age'].fillna(df['Age'].mean()),
)

df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Survived
0,3,1,22.000000,1,0,0
1,1,2,38.000000,1,0,1
2,3,2,26.000000,0,0,1
3,1,2,35.000000,1,0,1
4,3,1,35.000000,0,0,0
...,...,...,...,...,...,...
886,2,1,27.000000,0,0,0
887,1,2,19.000000,0,0,1
888,3,2,29.699118,1,2,0
889,1,1,26.000000,0,0,1


In [8]:
def make_dataset(df: pd.DataFrame) -> Dataset:
    """Wrap a DataFrame as a ``TensorDataset`` of (features, target) pairs.

    Every column except the last is treated as a feature and converted to a
    ``float32`` tensor; the last column is the target and is converted to an
    ``int64`` tensor.

    Args:
        df: Frame whose last column holds the target values.

    Returns:
        A ``TensorDataset`` yielding ``(feature_row, target)`` tuples.
    """
    feature_block, target_column = df.iloc[:, :-1], df.iloc[:, -1]
    return TensorDataset(
        torch.tensor(feature_block.values, dtype=torch.float32),
        torch.tensor(target_column.values, dtype=torch.int64),
    )

In [9]:
# Build the dataset from the cleaned frame and look at its first sample to
# confirm the feature/target dtypes.
train_dataset = make_dataset(df)
first_sample = train_dataset[0]
x, y = first_sample
print(f"First input {x}. Its dtype must be {x.dtype}.")
print(f"First output {y}. Its dtype must be {y.dtype}.")

First input tensor([ 3.,  1., 22.,  1.,  0.]). Its dtype must be torch.float32.
First output 0. Its dtype must be torch.int64.


In [10]:
def make_dataloader(dataset: Dataset, batch_size: int, shuffle: bool) -> DataLoader:
    """Create a ``DataLoader`` over ``dataset``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples.

    Returns:
        The configured ``DataLoader``.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader

In [11]:
# Unshuffled loader with batches of five, so the first batch is simply the
# first five rows of the dataset.
train_dataloader = make_dataloader(dataset=train_dataset, batch_size=5, shuffle=False)
batch_iterator = iter(train_dataloader)
first_batch = next(batch_iterator)
first_batch

[tensor([[ 3.,  1., 22.,  1.,  0.],
         [ 1.,  2., 38.,  1.,  0.],
         [ 3.,  2., 26.,  0.,  0.],
         [ 1.,  2., 35.,  1.,  0.],
         [ 3.,  1., 35.,  0.,  0.]]),
 tensor([0, 1, 1, 1, 0])]

In [12]:
categorical_features = ['Pclass', 'Sex']
continuous_features = ['Age', 'SibSp', 'Parch']

# One-hot encode the categorical columns (one indicator column per category).
df_cat = pd.get_dummies(df[categorical_features], columns=categorical_features)
# Mean-impute any missing values in the continuous columns.
df_cont = df[continuous_features].fillna(df[continuous_features].mean())

# Column layout: one-hot block first, continuous block after it.
# Kept under its own name so `features` only ever refers to the tensor.
feature_frame = pd.concat([df_cat, df_cont], axis=1).astype(float)

features = torch.tensor(feature_frame.values, dtype=torch.float32)

ae_dataset = TensorDataset(features)
# Full-batch training: derive the batch size from the dataset instead of
# hard-coding the row count. `max(..., 1)` keeps DataLoader construction valid
# for an empty dataset.
ae_dataloader = DataLoader(ae_dataset, batch_size=max(len(ae_dataset), 1), shuffle=True)
input_dim = features.shape[1]

In [13]:
class TabularAutoencoder(nn.Module):
    """Fully connected autoencoder for fixed-width feature vectors.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back (``latent_dim -> hidden_dim -> input_dim``). A ReLU follows
    each hidden layer; the latent and output layers are purely linear.

    Args:
        input_dim: Number of features per input row.
        latent_dim: Size of the latent code produced by the encoder.
        hidden_dim: Width of the single hidden layer in both encoder and
            decoder. Defaults to 16, the previously hard-coded width.
    """

    def __init__(self, input_dim, latent_dim=8, hidden_dim=16):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

In [14]:
# Column positions inside the feature tensor: the one-hot (categorical)
# columns come first, followed by the continuous columns.
n_categorical = df_cat.shape[1]
cat_indices = list(range(n_categorical))
cont_indices = list(range(n_categorical, input_dim))


def loss_fn(x_hat, x):
    """Reconstruction loss: categorical-column MSE plus continuous-column MSE.

    Each term is the mean squared error over its own column group, and the two
    terms are added with equal weight.

    Args:
        x_hat: Reconstructed batch, indexed by column along dim 1.
        x: Original batch with the same column layout as ``x_hat``.

    Returns:
        Scalar tensor holding the summed loss.
    """
    mse = nn.functional.mse_loss
    categorical_term = mse(x_hat[:, cat_indices], x[:, cat_indices])
    continuous_term = mse(x_hat[:, cont_indices], x[:, cont_indices])
    return categorical_term + continuous_term

In [15]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the input layer from the feature tensor (`input_dim`) instead of a
# literal 8, so the model stays in sync if the feature columns change.
model = TabularAutoencoder(input_dim=input_dim, latent_dim=8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Show each parameter tensor's name and shape.
print("Model's state_dict:")
for param_name, param_value in model.state_dict().items():
    print(param_name, "\t", param_value.size())

# Show the optimizer's state and hyper-parameters.
print("Optimizer's state_dict:")
for entry_name, entry_value in optimizer.state_dict().items():
    print(entry_name, "\t", entry_value)

Model's state_dict:
encoder.0.weight 	 torch.Size([16, 8])
encoder.0.bias 	 torch.Size([16])
encoder.2.weight 	 torch.Size([8, 16])
encoder.2.bias 	 torch.Size([8])
decoder.0.weight 	 torch.Size([16, 8])
decoder.0.bias 	 torch.Size([16])
decoder.2.weight 	 torch.Size([8, 16])
decoder.2.bias 	 torch.Size([8])
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'decoupled_weight_decay': False, 'params': [0, 1, 2, 3, 4, 5, 6, 7]}]


In [16]:
num_epochs = 1000

# Number of samples used to turn the summed loss into a per-sample average.
dataset_size = len(ae_dataloader.dataset)

for epoch in range(num_epochs):
    total_loss = 0
    for batch in ae_dataloader:
        # Each batch is a sequence whose first element is the feature tensor.
        x = batch[0].to(device)

        # Clear stale gradients, then run forward, backward and the update.
        optimizer.zero_grad()
        x_hat = model(x)
        loss = loss_fn(x_hat, x)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size before accumulating.
        total_loss += x.size(0) * loss.item()

    avg_loss = total_loss / dataset_size
    # Report every tenth epoch.
    is_report_epoch = (epoch + 1) % 10 == 0
    if is_report_epoch:
        print(f"Epoch [{epoch + 1} / {num_epochs}], Loss: {avg_loss:.4f}")

Epoch [10 / 1000], Loss: 336.9433
Epoch [20 / 1000], Loss: 325.0435
Epoch [30 / 1000], Loss: 313.2594
Epoch [40 / 1000], Loss: 298.0249
Epoch [50 / 1000], Loss: 275.3821
Epoch [60 / 1000], Loss: 243.9776
Epoch [70 / 1000], Loss: 202.6965
Epoch [80 / 1000], Loss: 149.2210
Epoch [90 / 1000], Loss: 88.4358
Epoch [100 / 1000], Loss: 32.9031
Epoch [110 / 1000], Loss: 3.0105
Epoch [120 / 1000], Loss: 3.1990
Epoch [130 / 1000], Loss: 2.3420
Epoch [140 / 1000], Loss: 1.1653
Epoch [150 / 1000], Loss: 1.2757
Epoch [160 / 1000], Loss: 1.1464
Epoch [170 / 1000], Loss: 1.0822
Epoch [180 / 1000], Loss: 1.0522
Epoch [190 / 1000], Loss: 1.0041
Epoch [200 / 1000], Loss: 0.9393
Epoch [210 / 1000], Loss: 0.8782
Epoch [220 / 1000], Loss: 0.8198
Epoch [230 / 1000], Loss: 0.7724
Epoch [240 / 1000], Loss: 0.7351
Epoch [250 / 1000], Loss: 0.7016
Epoch [260 / 1000], Loss: 0.6725
Epoch [270 / 1000], Loss: 0.6473
Epoch [280 / 1000], Loss: 0.6234
Epoch [290 / 1000], Loss: 0.6013
Epoch [300 / 1000], Loss: 0.5813
E

In [17]:
# Reconstruct the first five rows and compare them with the originals.
x_sample = features[:5].to(device)
# Inference only: disable autograd so no graph is built for the forward pass
# (this replaces the forward-then-detach pattern and yields the same values).
with torch.no_grad():
    x_recon = model(x_sample).cpu()
print("Original:\n", x_sample.cpu().numpy())
print("Reconstructed:\n", x_recon.numpy())

Original:
 [[ 0.  0.  1.  1.  0. 22.  1.  0.]
 [ 1.  0.  0.  0.  1. 38.  1.  0.]
 [ 0.  0.  1.  0.  1. 26.  0.  0.]
 [ 1.  0.  0.  0.  1. 35.  1.  0.]
 [ 0.  0.  1.  1.  0. 35.  0.  0.]]
Reconstructed:
 [[ 1.9535229e-02  1.0690689e-01  7.6592076e-01  8.7417877e-01
   7.7752918e-02  2.2007210e+01  1.1243448e+00 -1.5469214e-01]
 [ 5.5185682e-01  1.9552022e-01  2.5875437e-01  4.1782889e-01
   5.0919271e-01  3.7988873e+01  6.9205892e-01  3.7391475e-01]
 [ 2.4093412e-01  2.2877170e-01  4.3820727e-01  5.7206380e-01
   3.2854709e-01  2.6040157e+01  1.0732593e-01  1.7964843e-01]
 [ 5.2107781e-01  1.9795808e-01  2.6734412e-01  4.1597924e-01
   5.0113964e-01  3.4989140e+01  6.8918097e-01  3.8202527e-01]
 [ 1.1334042e-01  1.7132020e-01  7.0810759e-01  9.3041790e-01
   1.0158250e-01  3.4996452e+01  1.5701617e-01 -1.1570833e-01]]


In [18]:
# Split the reconstruction into its one-hot groups: columns 0-2 are taken as
# the Pclass indicators and columns 3-4 as the Sex indicators.
# NOTE(review): these slice bounds assume three Pclass and two Sex dummy
# columns in that order -- confirm against the one-hot frame's columns.
pclass_scores = x_recon[:, :3]
sex_scores = x_recon[:, 3:5]
# argmax returns the 0-based position of the largest score within each group.
pred_pclass = torch.argmax(pclass_scores, dim=1)
pred_sex = torch.argmax(sex_scores, dim=1)