# SSNE Miniproject 5
### 318703 Tomasz Owienko
### 318718 Anna Schäfer
### Grupa piątek

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import pickle
from torch.utils.data import DataLoader, Dataset, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, RichProgressBar
import matplotlib.pyplot as plt

In [2]:
# Fix the global seed through Lightning so runs are repeatable; the call echoes
# and returns the seed value (see the cell output).
RANDOM_SEED = 123
pl.seed_everything(RANDOM_SEED)

Global seed set to 123


123

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Fraction (not percent) of the training set held out for validation: 0.10 = 10%.
VALIDATION_PERCENTAGE = 0.10
# Mini-batch size used by the training and validation DataLoaders.
batch_size = 32
# Pickled input files: labelled training samples and test samples without targets.
TRAIN_PATH = "data/train.pkl"
TEST_PATH = "data/test_no_target.pkl"

In [5]:
class VariableLenDataset(Dataset):
    """Map-style dataset pairing each input sequence with its target.

    The inputs are stored as-is, so sequences of different lengths can live in
    the same dataset; batching/padding is left to the DataLoader's collate
    function.

    Args:
        in_data: iterable of input sequences.
        target: iterable of targets, aligned with ``in_data``. If the two
            iterables differ in length, the extra items of the longer one are
            dropped (``zip`` semantics).
    """

    def __init__(self, in_data, target):
        # Materialise the (input, target) pairs once so indexing is O(1).
        self.data = list(zip(in_data, target))

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tuple stored at position ``idx``."""
        sequence, label = self.data[idx]
        return sequence, label

In [6]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence


def pad_collate(batch):
    """Collate variable-length samples into one zero-padded batch.

    Args:
        batch: sequence of ``(sequence_tensor, label)`` pairs.

    Returns:
        A 3-tuple ``(padded, labels, lengths)``:
          * ``padded``  - sequences with a trailing feature dimension added,
            zero-padded to the longest sequence, batch dimension first;
          * ``labels``  - ``torch.long`` tensor with one label per sample;
          * ``lengths`` - CPU ``torch.int`` tensor with the original
            (un-padded) length of every sequence.
    """
    sequences, labels = zip(*batch)
    # Record the true lengths before padding hides them.
    lengths = [len(seq) for seq in sequences]
    # Copy each sequence and append a feature axis: (L,) -> (L, 1).
    with_feature_dim = [seq.clone().detach().unsqueeze(-1) for seq in sequences]
    padded = pad_sequence(with_feature_dim, batch_first=True, padding_value=0)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    length_tensor = torch.tensor(lengths, dtype=torch.int).cpu()
    return padded, label_tensor, length_tensor

In [7]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(TRAIN_PATH, "rb") as f:
    train = pickle.load(f)

# Each loaded item is indexed as (sequence, label): the sequence is cast to int
# and then converted to a float tensor, the label to a plain Python int.
# NOTE(review): presumably t[0] is a 1-D NumPy array - confirm against the data file.
train_data = [[torch.from_numpy(t[0].astype(int)).float(), int(t[1])] for t in train]

In [8]:
dataset_length = len(train_data)
val_size = int(dataset_length * VALIDATION_PERCENTAGE)
train_size = dataset_length - val_size
# Split with a dedicated, explicitly seeded generator so the train/validation
# partition is identical every time this cell runs, independent of how much
# global RNG state earlier cells have consumed.
train_subset, val_subset = random_split(
    train_data,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(RANDOM_SEED),
)

# Re-wrap the subsets as (sequence, label) datasets for the loaders below.
train_dataset = VariableLenDataset(
    [x[0] for x in train_subset], [x[1] for x in train_subset]
)
val_dataset = VariableLenDataset([x[0] for x in val_subset], [x[1] for x in val_subset])

# Only the training loader shuffles; both pad each batch via pad_collate.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_collate,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=pad_collate,
    pin_memory=True,
)

In [9]:
# Sanity check: shape of the first training sequence (a 1-D tensor before collation).
print(train_subset[0][0].shape)

torch.Size([228])


In [10]:
# Throwaway loader used only to sanity-check the shape of one collated batch.
# It uses the shared `batch_size` constant so the check mirrors the real loaders.
_train_loader = DataLoader(
    train_subset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate
)
next(iter(_train_loader))[0].shape

torch.Size([32, 2515, 1])

In [11]:
class LSTMClassifier(pl.LightningModule):
    """LSTM sequence classifier trained with cross-entropy loss.

    Padded batches are packed before entering the LSTM so padding steps are
    skipped; the last entry of the LSTM's final hidden state (``hn[-1]``) goes
    through dropout and a linear layer to produce one logit per class.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of output classes (logits).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied before the linear head.
        lr: learning rate for the Adam optimizer.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        num_layers=1,
        dropout=0.4,
        lr=0.001,
    ):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.lstm.flatten_parameters()
        self.fc = nn.Linear(hidden_size, output_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lr = lr
        self.criterion = nn.CrossEntropyLoss()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, x_lens):
        """Return class logits for a padded batch.

        Args:
            x: padded input batch, batch dimension first.
            x_lens: un-padded length of every sequence in ``x``.
        """
        # enforce_sorted=False lets the batch arrive in any length order.
        x_packed = pack_padded_sequence(
            x, x_lens, batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(x_packed)
        # hn[-1] is the final hidden state of the top LSTM layer.
        return self.fc(self.dropout(hn[-1]))

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one batch and log them as ``{stage}_*``."""
        x, y, x_lens = batch
        # The lengths are moved to the CPU before packing.
        outputs = self(x, x_lens.cpu())
        loss = self.criterion(outputs, y)
        acc = (outputs.argmax(dim=1) == y).float().mean()
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Run one training batch; logs ``train_loss``/``train_acc`` and returns the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Run one validation batch; logs ``val_loss``/``val_acc`` and returns the loss."""
        return self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [12]:
# NOTE(review): these three values are computed but NOT passed to the model
# below, which uses hard-coded sizes (and hidden_size disagrees: 512 vs 128).
# Confirm which values are intended before wiring them in.
input_size = train_data[0][0].shape[1] if len(train_data[0][0].shape) > 1 else 1
hidden_size = 512
output_size = len(set(x[1] for x in train_data))

model = LSTMClassifier(input_size=1, hidden_size=128, output_size=5, num_layers=3)

# Keep the three checkpoints with the HIGHEST validation accuracy.
# mode must be "max" for an accuracy metric; "min" would keep the worst models.
best_val_acc_callback = ModelCheckpoint(
    monitor="val_acc",
    filename="checkpoint_best_acc-{epoch:03d}-{val_acc:.5f}",
    save_top_k=3,
    mode="max",
)

# Always keep the most recent epoch by "maximising" the epoch counter.
last_epoch_callback = ModelCheckpoint(
    save_top_k=1,
    monitor="epoch",
    mode="max",
    filename="checkpoint_last-{epoch:03d}-{train_loss:.5f}",
)

trainer = pl.Trainer(
    max_epochs=20,
    callbacks=[best_val_acc_callback, last_epoch_callback, RichProgressBar()],
)
trainer.fit(model, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(TEST_PATH, "rb") as f:
    test_data = pickle.load(f)

# Keep every sequence 1-D here: pad_collate already appends the trailing
# feature dimension. Unsqueezing here as well would give 4-D batches
# (B, L, 1, 1) instead of the (B, L, 1) layout the model was trained on.
test_data = [torch.from_numpy(t.astype(int)).float() for t in test_data]
# The test set has no targets; zeros are placeholders so the dataset and
# collate function can be reused unchanged.
test_dataset = VariableLenDataset(test_data, [0 for _ in test_data])
test_loader = DataLoader(
    test_dataset, batch_size=50, shuffle=False, collate_fn=pad_collate
)