# Import

In [1]:
import torch
import numpy as np
import pytorch_lightning as pl
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split

from models.baseline import Seq2Seq, Encoder, Decoder
from data_utils.dataset import TranslationDataset
from data_utils.lang import read_langs, PAD
from pl_utils.pl_model import ModelWrapper
from pl_utils.pl_dataset import PlTranslationDataset

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TEST_SHARE = 0.2

%load_ext autoreload
%autoreload 2

# Data Prep

In [2]:
with open("data.txt", 'r') as flines:
    all_lines = np.array(flines.readlines())

test_size = int(TEST_SHARE * len(all_lines))
train_size = len(all_lines) - test_size

train_lines, val_lines = train_test_split(all_lines, test_size=TEST_SHARE, random_state=42)


In [3]:
EN_LANG, RU_LANG, _ = read_langs("en", "ru", list(train_lines))

train_dataset = TranslationDataset(list(train_lines), EN_LANG, RU_LANG)
val_dataset = TranslationDataset(list(val_lines), EN_LANG, RU_LANG)
test_dataset = TranslationDataset(list(all_lines), EN_LANG, RU_LANG)


# Model Train

In [4]:
INPUT_DIM = len(EN_LANG.vocab)
OUTPUT_DIM = len(RU_LANG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

In [11]:
model = Seq2Seq(enc, dec, DEVICE)

In [12]:
def optimizer_fn(model: nn.Module):
    return optim.Adam(model.parameters(), lr=5e-3)

criterion = nn.CrossEntropyLoss(ignore_index=RU_LANG.vocab.get_stoi()[PAD])
def criterion_fn(translation, target):
    out = translation.view(-1, translation.shape[-1])
    exp = target.view(-1)
    return criterion(out, exp)

# criterion_fn = nn.CrossEntropyLoss(ignore_index=RU_LANG.vocab.get_stoi()[PAD])

In [7]:
%load_ext tensorboard

In [13]:
pl_model = ModelWrapper(model, criterion_fn, optimizer_fn)
pl_dataset = PlTranslationDataset(train_dataset, val_dataset, test_dataset, 128, 128)
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")

trainer = pl.Trainer(
    max_epochs=30,
    accelerator="gpu",
    devices=1,
    callbacks=[
        pl.callbacks.early_stopping.EarlyStopping(monitor="val_loss", patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        checkpoint_callback
    ]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
torch.set_float32_matmul_precision('medium')
trainer.fit(
    pl_model, 
    pl_dataset,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type    | Params
----------------------------------
0 | model | Seq2Seq | 20.8 M
----------------------------------
20.8 M    Trainable params
0         Non-trainable params
20.8 M    Total params
83.140    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00019: reducing learning rate of group 0 to 5.0000e-04.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00029: reducing learning rate of group 0 to 5.0000e-05.


Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


# BLEU

In [18]:
best = ModelWrapper.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    model=model,
    criterion_fn=criterion_fn,
    optimizer_fn=optimizer_fn,
)

# best = ModelWrapper.load_from_checkpoint(
#     "/workspaces/nlp_made/src/lightning_logs/version_0/checkpoints/epoch=13-step=4382.ckpt",
#     model=model,
#     criterion_fn=criterion_fn,
#     optimizer_fn=optimizer_fn,
# )

dataloader = pl_dataset.test_dataloader()

In [47]:
from tqdm.notebook import tqdm
best.eval()
generated_corpa = []
target_corpa = []
batch_first = False

with torch.no_grad():
    for (source, target) in tqdm(dataloader):
        translation = best.forward((source.to(DEVICE), target.to(DEVICE)), 0, teacher_forcing_ratio=0)
        translation = translation.argmax(dim=-1).cpu().numpy()

        if not batch_first:
            translation = translation.T
            target = target.T

        for gen, orig in zip(translation, target):
            dec_gen = pl_dataset.target_lang.decode(gen)
            dec_orig = pl_dataset.target_lang.decode(orig)
            generated_corpa.append(dec_gen)
            target_corpa.append([dec_orig])
        break


  0%|          | 0/391 [00:00<?, ?it/s]

In [48]:
target_corpa[0]

[['отель',
  'расположен',
  'в',
  'тбилиси',
  'в',
  '3',
  'минутах',
  'ходьбы',
  'от',
  'свято',
  'троицкого',
  'собора']]

In [50]:
from nltk.translate.bleu_score import corpus_bleu

corpus_bleu(target_corpa, generated_corpa) * 100

9.40433747146796

In [46]:
generated_corpa[0]


[['отель',
  'расположен',
  'в',
  'в',
  'в',
  'в',
  'в',
  'минутах',
  'ходьбы',
  'от',
  'пляжа',
  'и',
  'в']]