In [1]:
# Titanic ML task by Vasyl Storchak

In [1]:
import platform
print(platform.python_version())

3.11.0rc1


In [107]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import pytorch_lightning as pl

from torch import nn
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader, random_split, Dataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold

In [103]:
device = 'cuda'
# wtf ?
batch_size = 512
precision_model = '64' #'bf16-mixed'

data_precision_torch = torch.float32
match precision_model:
    case 'bf16-mixed': 
        data_precision_torch = torch.float16
    case '16': 
        data_precision_torch = torch.float16
    case '32-mixed': 
        data_precision_torch = torch.float32
    case '32': 
        data_precision_torch = torch.float32
    case '64': 
        data_precision_torch = torch.float64
    
num_workers = 16
epochs = 100
# name_batch_precision_epochs
model_name = f'titanic_{str(batch_size)}_{str(precision_model)}_{str(epochs)}.pt'
data_path = "titanic/train.csv"
test_path = "titanic/test.csv"
test_path_surv = "titanic/gender_submission.csv"

In [50]:
model_name

'titanic_1_64_15.pt'

In [63]:
class IsTitanicPassengerDead(pl.LightningModule):
    def __init__(self):
        super().__init__()
        # TODO rewrite to binary answers
        self.layer_1 = nn.Linear(6, 128)
        self.layer_2 = nn.Linear(128, 256)
        self.layer_3 = nn.Linear(256, 512)
        self.layer_4 = nn.Linear(512, 1)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        # check - relu or leakyrelu
        x = F.leaky_relu(self.layer_1(x))
        x = F.leaky_relu(self.layer_2(x))
        x = F.leaky_relu(self.layer_3(x))
        return self.layer_4(x).squeeze(1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.binary_cross_entropy_with_logits(logits, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        preds = torch.sigmoid(logits) > 0.5
        acc = (preds.float() == y).float().mean()
        self.log("val_acc", acc, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [52]:
class TitanicDataset(Dataset):
    def __init__(self, data, targets, data_precision=torch.float64):
        self.data = torch.tensor(data.values, dtype=data_precision)
        self.targets = torch.tensor(targets.values, dtype=data_precision)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [53]:
class IsTitanicPassengerDeadDataModule(pl.LightningDataModule):
    def __init__(self, 
                 data_path="titanic/train.csv", 
                 test_path="titanic/test.csv",
                 test_path_surv="titanic/gender_submission.csv",
                 batch_size=64, 
                 num_workers=8, 
                 n_splits=5, 
                 fold_idx=0,
                 shuffle_train=True,
                 data_precision=torch.float64):
        super().__init__()
        self.data_path = data_path
        self.test_path = test_path
        self.test_path_surv = test_path_surv
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.n_splits = n_splits
        self.fold_idx = fold_idx
        self.shuffle_train = shuffle_train
        self.data_precision = data_precision

    def prepare_data(self):
        pd.read_csv(self.data_path)
        pd.read_csv(self.test_path)

    def setup(self, stage=None):
        # setup train data
        df = pd.read_csv(self.data_path)
        # TITLE => PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
        df = df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
        df["Age"] = df["Age"].fillna(df["Age"].median())
        df["Fare"] = df["Fare"].fillna(df["Fare"].median())
        # df["Pclass"] = (df["Pclass"] - df["Pclass"].min()) / (df["Pclass"].max() - df["Pclass"].min())

        # setup target
        X = df.drop(columns=["Survived"])
        y = df["Survived"]

        # FIXME crossvalidation K-Fold ?
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        splits = list(kf.split(X))
        train_idx, val_idx = splits[self.fold_idx]      
        self.train_dataset = TitanicDataset(X.iloc[train_idx], y.iloc[train_idx], data_precision=self.data_precision)
        self.val_dataset = TitanicDataset(X.iloc[val_idx], y.iloc[val_idx], data_precision=self.data_precision)
        
        # setup test data
        test_df = pd.read_csv(self.test_path)
        
        test_df = test_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
        test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
        test_df["Age"] = test_df["Age"].fillna(df["Age"].median())
        test_df["Fare"] = test_df["Fare"].fillna(df["Fare"].median())

        self.test_dataset = torch.tensor(test_df.values, dtype=self.data_precision)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=self.shuffle_train)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

In [108]:
model = IsTitanicPassengerDead()
model.load_state_dict(torch.load(model_name)) #'titanic_256_32_50.pt'))

<All keys matched successfully>

In [110]:
model = IsTitanicPassengerDead()
logger = TensorBoardLogger("tb_logs", name="my_model")

checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",         
    dirpath="checkpoints/",     
    filename="best-checkpoint",  
    save_top_k=3,              
    mode="min"                    
)

dm = IsTitanicPassengerDeadDataModule(
    batch_size=batch_size, 
    num_workers=num_workers,
    data_path=data_path, 
    test_path=test_path,
    data_precision=data_precision_torch
)

trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=epochs, 
    accelerator='gpu' if torch.cuda.is_available() and device == 'cuda' else 'cpu',
    precision=precision_model,
    detect_anomaly=True,
    accumulate_grad_batches=batch_size
)
trainer.fit(model, datamodule=dm)

torch.save(model.state_dict(), model_name)
print(f"Model saved to {model_name}")

You have turned on `Trainer(detect_anomaly=True)`. This will significantly slow down compute speed and is recommended only for model debugging.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type   | Params | Mode 
-------------------------------------------
0 | layer_1 | Linear | 896    | train
1 | layer_2 | Linear | 33.0 K | train
2 | layer_3 | Linear | 131 K  | train
3 | layer_4 | Linear | 513    | train
-------------------------------------------
166 K     Trainable params
0         Non-trainable params
166 K     Total params
0.664     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


                                                                                                                  

/home/forever/.local/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.58it/s, v_num=8]
Validation: |                                                                               | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                           | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 189.68it/s][A
Epoch 1: 100%|██████████████████████████████████████████████| 2/2 [00:00<00:00,  4.11it/s, v_num=8, val_acc=0.765][A
Validation: |                                                                               | 0/? [00:00<?, ?it/s][A
Validation:   0%|                                                                           | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                             

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  2.21it/s, v_num=8, val_acc=0.810]
Model saved to titanic_512_64_100.pt


In [112]:
model = IsTitanicPassengerDead()
model.load_state_dict(torch.load(model_name))
model.to(device)
model.eval()

IsTitanicPassengerDead(
  (layer_1): Linear(in_features=6, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=256, bias=True)
  (layer_3): Linear(in_features=256, out_features=512, bias=True)
  (layer_4): Linear(in_features=512, out_features=1, bias=True)
)

In [70]:
df = pd.read_csv(data_path)
df = df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# setup target
X = df.drop(columns=["Survived"])
y = df["Survived"]

# crossvalidation K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
splits = list(kf.split(X))
train_idx, val_idx = splits[0]
train_dataset = TitanicDataset(X.iloc[train_idx], y.iloc[train_idx], data_precision=data_precision_torch)

test_df = pd.read_csv(test_path)
test_df_surv = pd.read_csv('titanic/gender_submission.csv')
test_df = test_df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
test_df["Sex"] = test_df["Sex"].map({"male": 0, "female": 1})
test_df["Age"] = test_df["Age"].fillna(df["Age"].median())
test_df["Fare"] = test_df["Fare"].fillna(df["Fare"].median())
test_dataset = torch.tensor(test_df.values, dtype=data_precision_torch)

test_loader = DataLoader(
    test_dataset, 
    batch_size=1, 
    num_workers=num_workers
)

In [44]:
# simple test
# 0 -> [3, 0, 22, 1, 0, 7.25]
# 1 -> [1, 1, 38, 1, 0, 71.2833]
with torch.no_grad():
    out = model(torch.tensor([[3, 0, 22, 1, 0, 7.25]], device=device))
    preds = 0 if out < 0 else 1 #torch.argmax(out, dim=0)
    print(out)
    print(preds)

tensor([-3.6630], device='cuda:0')
0


In [113]:
# make metrics
y_true = []
y_pred = []
idx = 0
theshold = 0.0

with torch.no_grad():
    for x in test_loader:
        x = x.to(device)
        x = x.type(torch.float32)
        outputs = model(x)
        preds = 0 if outputs < theshold else 1
        y_true.append(test_df_surv['Survived'][idx])
        y_pred.append(preds)
        idx = idx + 1

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 0.9234449760765551

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       266
           1       0.88      0.91      0.90       152

    accuracy                           0.92       418
   macro avg       0.92      0.92      0.92       418
weighted avg       0.92      0.92      0.92       418


Confusion Matrix:
 [[248  18]
 [ 14 138]]
