# Imports and configuration

In [1]:
import torch
import numpy as np
import pytorch_lightning as pl
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split

from models.baseline import Seq2Seq, Encoder, Decoder
from data_utils.dataset import TranslationDataset
from data_utils.lang import read_langs, PAD
from pl_utils.pl_model import ModelWrapper
from pl_utils.pl_dataset import PlTranslationDataset

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TEST_SHARE = 0.2

torch.cuda.empty_cache()

%load_ext autoreload
%autoreload 2

In [1]:
# Download the EN-RU parallel corpus into data.txt; -nc (no-clobber) skips the
# download when the file is already present.
!wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/Machine_translation_EN_RU/data.txt -nc

--2023-05-24 01:13:34--  https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/Machine_translation_EN_RU/data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12905334 (12M) [text/plain]
Saving to: ‘data.txt’


2023-05-24 01:13:34 (111 MB/s) - ‘data.txt’ saved [12905334/12905334]



# Data preparation

In [2]:
# Read the raw corpus, one entry per line (trailing newlines are kept).
# The encoding is explicit because the file contains Cyrillic text and the
# platform default encoding is not guaranteed to be UTF-8.
with open("data.txt", "r", encoding="utf-8") as flines:
    all_lines = np.array(flines.readlines())

# Informational sizes for a single TEST_SHARE hold-out; the split below does
# not read these two values, it passes TEST_SHARE to train_test_split directly.
test_size = int(TEST_SHARE * len(all_lines))
train_size = len(all_lines) - test_size

# Two-stage split with a fixed random_state:
#   1) hold out TEST_SHARE of all lines,
#   2) split that hold-out again, keeping TEST_SHARE of it as the test set.
# With TEST_SHARE = 0.2 this gives roughly 80% train / 16% validation / 4% test.
train_lines, val_lines = train_test_split(all_lines, test_size=TEST_SHARE, random_state=42)
val_lines, test_lines = train_test_split(val_lines, test_size=TEST_SHARE, random_state=42)


In [3]:
# The language objects are built from the training lines only; the validation
# and test lines are not passed to read_langs.
EN_LANG, RU_LANG, _ = read_langs("en", "ru", list(train_lines))

# One TranslationDataset per split, all sharing the same two language objects.
# Each split is handed over as its own fresh list, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(list(split_lines), EN_LANG, RU_LANG)
    for split_lines in (train_lines, val_lines, test_lines)
)


# Model training

In [4]:
# Vocabulary sizes: English feeds the encoder, Russian is produced by the decoder.
INPUT_DIM = len(EN_LANG.vocab)
OUTPUT_DIM = len(RU_LANG.vocab)
# Embedding widths handed to the encoder and the decoder respectively.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# HID_DIM and N_LAYERS are passed unchanged to both Encoder and Decoder.
# NOTE(review): presumably the recurrent hidden size and layer count — confirm in models.baseline.
HID_DIM = 512
N_LAYERS = 2
# Dropout values handed to the encoder and the decoder respectively.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

In [5]:
# Combine the encoder and decoder into one sequence-to-sequence model.
# DEVICE is handed to the Seq2Seq constructor; whether the module itself is
# moved to that device here is not visible from this notebook — TODO confirm.
model = Seq2Seq(enc, dec, DEVICE)

In [6]:
def optimizer_fn(model: nn.Module):
    """Build the optimizer for ``model``.

    Args:
        model: module whose parameters should be optimized.

    Returns:
        An Adam optimizer over all parameters of ``model`` with a learning
        rate of 5e-4.
    """
    params_to_optimize = model.parameters()
    adam = optim.Adam(params_to_optimize, lr=5e-4)
    return adam

# Cross-entropy over target tokens; positions whose target is the PAD index of
# the Russian vocabulary are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=RU_LANG.vocab.get_stoi()[PAD])

def criterion_fn(translation, target):
    """Flatten the model output and the target, then apply ``criterion``.

    Args:
        translation: tensor of per-class scores; the last dimension indexes
            the classes, all leading dimensions are collapsed into one.
        target: tensor of class indices; collapsed to one dimension. Its
            number of elements must match the collapsed leading dimensions
            of ``translation``.

    Returns:
        The value returned by ``criterion`` for the flattened tensors.
    """
    # reshape (not view) so that non-contiguous tensors, e.g. the result of a
    # transpose or a slice along an inner dimension, do not raise.
    out = translation.reshape(-1, translation.shape[-1])
    exp = target.reshape(-1)
    return criterion(out, exp)

In [7]:
# Load the TensorBoard notebook extension, which makes the %tensorboard magic available.
%load_ext tensorboard

In [7]:
# Data module wrapping the three splits.
# NOTE(review): the two trailing 128s are presumably batch sizes — confirm
# against PlTranslationDataset.
pl_dataset = PlTranslationDataset(train_dataset, val_dataset, test_dataset, 128, 128)

# Checkpoint to resume from. This is an absolute path on the original
# development machine; adjust it when running elsewhere.
# To train from scratch instead of resuming, replace the load below with:
#     pl_model = ModelWrapper(model, criterion_fn, optimizer_fn)
RESUME_CKPT_PATH = "/workspaces/nlp_made/src/lightning_logs/version_0/checkpoints/epoch=10-step=3443.ckpt"

pl_model = ModelWrapper.load_from_checkpoint(
    RESUME_CKPT_PATH,
    # Map stored tensors onto the device chosen for this session so that a
    # checkpoint written on a GPU can also be opened on a CPU-only machine.
    map_location=DEVICE,
    model=model,
    criterion_fn=criterion_fn,
    optimizer_fn=optimizer_fn,
)

# Keeps the checkpoint selected by the monitored "val_loss" metric; its path
# is read back later through checkpoint_callback.best_model_path.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")

trainer = pl.Trainer(
    max_epochs=50,
    # Follow DEVICE instead of hard-coding "gpu", which fails on machines
    # without CUDA even though DEVICE itself falls back to the CPU.
    accelerator="gpu" if DEVICE.type == "cuda" else "cpu",
    devices=1,
    callbacks=[
        # Stop once "val_loss" has not improved for 5 consecutive validation checks.
        pl.callbacks.early_stopping.EarlyStopping(monitor="val_loss", patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        checkpoint_callback,
    ],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
# Let float32 matrix multiplications use a lower internal precision,
# trading some numerical accuracy for speed on supporting hardware.
torch.set_float32_matmul_precision("medium")

# Train pl_model on the splits provided by pl_dataset, using the callbacks
# configured on the trainer.
trainer.fit(pl_model, pl_dataset)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type    | Params
----------------------------------
0 | model | Seq2Seq | 20.8 M
----------------------------------
20.8 M    Trainable params
0         Non-trainable params
20.8 M    Total params
83.140    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00005: reducing learning rate of group 0 to 5.0000e-05.


Validation: 0it [00:00, ?it/s]

# BLEU

In [10]:
# Reload the checkpoint that checkpoint_callback recorded as best.
# Note that `model` is the same module instance used during training, so its
# weights are overwritten by the checkpoint's weights here.
best = ModelWrapper.load_from_checkpoint(
    checkpoint_callback.best_model_path,
    model=model,
    criterion_fn=criterion_fn,
    optimizer_fn=optimizer_fn,
)
# Switch to evaluation mode so dropout is disabled while scoring; this is a
# no-op if the scoring helper already does it.
best.eval()

dataloader = pl_dataset.test_dataloader()

In [11]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from utils import calc_blue

# Score the reloaded model; the returned value is shown as the cell output.
# NOTE(review): presumably a BLEU score computed on the test split of
# pl_dataset — confirm in utils.calc_blue.
calc_blue(best, pl_dataset, DEVICE)

  0%|          | 0/16 [00:00<?, ?it/s]

Generated sample:  отель в в в от пляжа
Target sample:  апарт отель all suites appart hotel расположен в 5 минутах езды от аэропорта бордо


8.578765058019606