# Data loading and preprocessing

In [None]:
ITERABLE = False # True: stream the training set from disk with an IterableDataset; False: load it into memory as a map-style Dataset

In [1]:
%%capture
!pip install sentencepiece transformers
!pip install gdown
!pip install --upgrade --no-cache-dir gdown
!pip install pytorch-lightning

In [2]:
import re
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

from tqdm import tqdm

import pytorch_lightning as pl

import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, Dataset, DataLoader

import gdown

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from transformers import AutoTokenizer

# Only the tokenizer of this checkpoint is loaded; it supplies the vocabulary size
# and the special-token ids (pad / cls / sep) used throughout the notebook.
model_name = 'cointegrated/LaBSE-en-ru'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Download preprocessed data

In [4]:
# %%capture
# !gdown https://drive.google.com/uc?id=1Imd9w580FKGAY4_SsJbpUvoLh_lsoHrc

In [None]:
en_train_data_path = 'en_data.txt'

if not ITERABLE:
    # Map-style mode: keep the whole corpus in memory, one raw line (trailing
    # newline included) per sample. The encoding is explicit so the result does
    # not depend on the platform's default locale.
    with open(en_train_data_path, 'r', encoding='utf-8') as file:
        en_train_data = list(file)

## Validation data

In [5]:
%%capture
# Fetch the EN-RU parallel file used for validation; `-nc` skips the download when data.txt already exists.
!wget -nc https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/datasets/Machine_translation_EN_RU/data.txt

In [6]:
en_val_data_path = 'data.txt'
en_idx = 0  # column of the tab-separated file that is kept

# The file comes from an EN-RU translation dataset, so it is read as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(en_val_data_path, 'r', encoding='utf-8') as file:
    en_val_data = [line.strip().lower().split('\t')[en_idx] for line in file]

In [7]:
# Peek at the first two validation sentences.
en_val_data[:2]

['cordelia hotel is situated in tbilisi, a 3-minute walk away from saint trinity church.',
 'at tupirmarka lodge you will find a 24-hour front desk, room service, and a snack bar.']

## Dataloader

In [8]:
class DatasetIterable(IterableDataset):
    """Stream a text file one raw line at a time instead of loading it into memory.

    Every call to ``iter()`` re-opens the file and starts from the first line.
    Lines are yielded unchanged, including their trailing newline.

    Note: nothing here shards the file between DataLoader workers, so with
    ``num_workers > 0`` each worker would replay the whole file.

    Args:
        file_path: path of the UTF-8 text file to read.
    """

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path

    def __iter__(self):
        return self.generator()

    def generator(self):
        """Yield the lines of ``file_path``; the file is closed when iteration ends."""
        # Explicit encoding keeps the behaviour independent of the platform locale.
        with open(self.file_path, 'r', encoding='utf-8') as file:
            yield from file

class FastDataset(Dataset):
    """Map-style dataset that serves samples straight from an in-memory sequence."""

    def __init__(self, data):
        # `data` only needs to support len() and indexing.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``, unchanged."""
        return self.data[idx]

In [9]:
class Collator:
    """Collate function: turn a list of raw strings into one tensor of token ids.

    The strings are tokenized together and padded to the longest one, then one
    extra column of ``pad_token_id`` is appended on the right, so every row of
    the result (even the longest) ends with at least one padding id.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Return a ``[len(batch), longest_len + 1]`` tensor of token ids."""
        encoded = self.tokenizer(batch, padding=True, return_tensors='pt')
        pad_column = torch.full(
            size=(len(batch), 1),
            fill_value=self.tokenizer.pad_token_id
        )
        return torch.cat((encoded.input_ids, pad_column), dim=1)

In [33]:
num_workers = 4
b_size = 2

# Training set: streamed from disk when ITERABLE is set, otherwise served from memory.
# The streaming loader stays single-process (num_workers=0 is the DataLoader default).
en_train_dataset = DatasetIterable(en_train_data_path) if ITERABLE else FastDataset(en_train_data)
en_train_loader = DataLoader(
    en_train_dataset,
    batch_size=b_size,
    num_workers=0 if ITERABLE else num_workers,
    collate_fn=Collator(tokenizer),
)

# Validation set: the first `en_val_size` sentences of the validation file.
en_val_size = 1500
en_val_dataset = FastDataset(en_val_data[:en_val_size])
en_val_loader = DataLoader(
    en_val_dataset,
    batch_size=b_size,
    num_workers=num_workers,
    collate_fn=Collator(tokenizer),
)

# English encoder

In [12]:
class Encoder(nn.Module):
    """Token embedding followed by a bidirectional GRU and an optional vocabulary projection.

    Args:
        vocab_size: number of rows of the embedding table / width of the logits.
        emb_size: embedding dimension.
        padding_idx: token id passed to ``nn.Embedding`` as ``padding_idx``.
        hid_size: width of the GRU output; each direction uses ``hid_size // 2`` units.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, hid_size):
        super().__init__()

        self.vocab_size, self.emb_size = vocab_size, emb_size
        self.padding_idx, self.hid_size = padding_idx, hid_size

        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        # Two directions of hid_size // 2 units each are concatenated in the output.
        self.rnn = nn.GRU(emb_size, hid_size // 2, batch_first=True, bidirectional=True)
        self.to_logits = nn.Linear(hid_size, vocab_size)

    def forward(self, batch, h=None, logits_flag=True):
        """Encode ``batch`` of shape ``[b_size, seq_len]``.

        Returns ``(output, h)`` where ``h`` is ``[2, b_size, hid_size // 2]`` and
        ``output`` is ``[b_size, seq_len, vocab_size]`` when ``logits_flag`` is
        true, otherwise the raw GRU states ``[b_size, seq_len, hid_size]``.
        """
        states, h = self.rnn(self.embedding(batch), h)

        if not logits_flag:
            return states, h

        return self.to_logits(states), h

In [168]:
class EncoderTrainer(pl.LightningModule):
    """Lightning module that pre-trains ``encoder`` on next-token prediction.

    The logits at position ``t`` are scored against the token at ``t + 1``.
    When ``truncated_bptt_steps`` is positive, ``tbptt_split_batch`` cuts each
    training batch along the sequence axis and the recurrent state returned
    under ``'hiddens'`` is fed back in through ``training_step``.

    Args:
        encoder: module called as ``encoder(batch, hiddens)`` that returns
            ``(logits, hidden_state)``.
        criterion: loss called as ``criterion(logits, targets)`` with logits of
            shape ``[b_size, vocab_size, seq_len - 1]``.
        optimizer_parameters: keyword arguments forwarded to ``torch.optim.Adam``.
    """

    def __init__(self, encoder, criterion, optimizer_parameters):
        super().__init__()

        self.encoder = encoder
        self.criterion = criterion
        self.optimizer_parameters = optimizer_parameters

        # Own epoch counter, used as the x-axis of the TensorBoard scalars.
        self.model_current_epoch = 0

    def training_step(self, batch, batch_idx, hiddens=None):

        return self._run_step(batch, batch_idx, hiddens, val_flag=False)

    def training_epoch_end(self, outputs):
        """Log the mean training loss over ALL batches (and TBPTT splits) of the epoch."""

        # With truncated BPTT each element of `outputs` is itself a list with one
        # entry per split. Indexing `outputs[0]` would only look at the first batch,
        # so the structure is flattened instead.
        step_outputs = self._flatten_outputs(outputs)

        avg_train_loss = torch.mean(torch.tensor([output['loss'] for output in step_outputs]))
        self.logger.experiment.add_scalar('epoch_loss/training', avg_train_loss, self.model_current_epoch)
        self.model_current_epoch += 1

    def validation_step(self, batch, batch_idx):

        return self._run_step(batch, batch_idx, val_flag=True)

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss; ``val_loss`` is what the checkpoint callback monitors."""

        avg_val_loss = torch.mean(torch.tensor([output['loss'] for output in outputs]))
        self.logger.experiment.add_scalar('epoch_loss/validation', avg_val_loss, self.model_current_epoch)
        self.log('val_loss', avg_val_loss)

    def _flatten_outputs(self, outputs):
        """Flatten arbitrarily nested lists of step-output dicts into one flat list."""

        flat = []
        for item in outputs:
            if isinstance(item, dict):
                flat.append(item)
            else:
                flat.extend(self._flatten_outputs(item))

        return flat

    def _run_step(self, batch, batch_idx, hiddens=None, val_flag=False):
        """Compute the next-token loss for one batch (or one TBPTT chunk)."""

        # batch: [b_size, seq_len] token ids

        output, new_hiddens = self.encoder(batch, hiddens) # [b_size, seq_len, vocab_size]
        loss = self.criterion(
            output[:, :-1, :].permute(0, 2, 1), # [b_size, vocab_size, seq_len - 1]
            batch[:, 1:]                        # targets are the inputs shifted by one
        )

        if self.truncated_bptt_steps == 0 or val_flag:
            return {'loss': loss}

        # The hidden state is carried over to the next chunk of the same batch.
        return {'loss': loss, 'hiddens': new_hiddens}

    def tbptt_split_batch(self, batch, split_size):
        """Split ``batch`` along the sequence axis into chunks of at most ``split_size`` tokens."""

        # range() only produces start offsets inside the sequence, so every chunk
        # is non-empty and no extra emptiness check is required.
        seq_len = batch.shape[1]

        return [batch[:, start:start + split_size] for start in range(0, seq_len, split_size)]

    def configure_optimizers(self):

        return torch.optim.Adam(self.encoder.parameters(), **self.optimizer_parameters)


## Training

In [14]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

In [15]:
vocab_size = tokenizer.vocab_size
emb_size = 512  # token embedding width
hid_size = 512  # width of the encoder output (the Encoder gives each GRU direction hid_size // 2)
padding_idx = tokenizer.pad_token_id  # id used for padding by the tokenizer

In [16]:
# Targets equal to the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimzer_parameters = {'lr': 3e-4}  # keyword arguments for the Adam optimizer built by the trainer module

In [17]:
# Build the encoder and wrap it in its Lightning training module.
en_encoder = Encoder(vocab_size, emb_size, padding_idx, hid_size)
en_encoder_trainer = EncoderTrainer(en_encoder, criterion, optimzer_parameters)

---

In [18]:
# base trainer parameters
accelerator = 'gpu' if torch.cuda.is_available() else None
print(f'accelerator: {accelerator}')

n_epochs = 1
# number of batches whose gradients are accumulated: 256 samples per optimizer step
acc_steps = 256 // b_size

# logger (TensorBoard event files are written under experiments_dir)
experiments_dir = 'en_pretraining/encoder'
logger = TensorBoardLogger(save_dir=experiments_dir)

# callbacks: keep only the single checkpoint with the lowest logged 'val_loss'
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor='val_loss',
    mode='min',
)

accelerator: gpu


In [19]:
# Trainer for the encoder pre-training run, built from the settings defined above.
trainer = pl.Trainer(
    max_epochs=n_epochs,
    accumulate_grad_batches=acc_steps,
    accelerator=accelerator,
    logger=logger,
    callbacks=[checkpoint_callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# !rm -rf en_pretraining/encoder/lightning_logs

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir en_pretraining

In [20]:
# Enable truncated BPTT: training batches are cut along the sequence axis by
# EncoderTrainer.tbptt_split_batch (presumably called by Lightning with split_size=30 — see Lightning 1.x docs).
en_encoder_trainer.truncated_bptt_steps = 30
trainer.fit(en_encoder_trainer, en_train_loader, en_val_loader)

Missing logger folder: en_pretraining/encoder_training/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 57.6 M
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
57.6 M    Trainable params
0         Non-trainable params
57.6 M    Total params
230.571   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# English decoder

In [21]:
class Decoder(nn.Module):
    """Token embedding followed by a unidirectional GRU and a projection onto the vocabulary.

    Args:
        vocab_size: number of rows of the embedding table / width of the logits.
        emb_size: embedding dimension.
        padding_idx: token id passed to ``nn.Embedding`` as ``padding_idx``.
        hid_size: number of GRU hidden units.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, hid_size):
        super().__init__()

        self.vocab_size, self.emb_size = vocab_size, emb_size
        self.padding_idx, self.hid_size = padding_idx, hid_size

        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        self.rnn = nn.GRU(emb_size, hid_size, batch_first=True, bidirectional=False)
        self.to_logits = nn.Linear(hid_size, vocab_size)

    def forward(self, batch, h):
        """Decode ``batch`` ``[b_size, seq_len]`` starting from state ``h`` ``[1, b_size, hid_size]``.

        Returns ``(logits, h)`` with logits of shape ``[b_size, seq_len, vocab_size]``
        and the final GRU state ``[1, b_size, hid_size]``.
        """
        states, h = self.rnn(self.embedding(batch), h) # states: [b_size, seq_len, hid_size]

        return self.to_logits(states), h

In [22]:
class DecoderTrainer(pl.LightningModule):
    """Lightning module that trains ``decoder`` to reproduce a sentence from the encoder state.

    The encoder is only run under ``torch.no_grad()`` and only the decoder's
    parameters are handed to the optimizer. The decoder's initial hidden state
    is the encoder output at the last non-padding position of each sentence.
    When ``truncated_bptt_steps`` is positive, the first chunk of every batch
    carries the full batch along so that the encoder still sees whole sentences.

    Args:
        encoder: module called as ``encoder(batch, logits_flag=False)``; must expose ``padding_idx``.
        decoder: module called as ``decoder(batch, hiddens)`` that returns ``(logits, hidden_state)``.
        criterion: loss called as ``criterion(logits, targets)`` with logits of
            shape ``[b_size, vocab_size, seq_len - 1]``.
        optimizer_parameters: keyword arguments forwarded to ``torch.optim.Adam``.
    """

    def __init__(self, encoder, decoder, criterion, optimizer_parameters):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.criterion = criterion
        self.optimizer_parameters = optimizer_parameters

        # Own epoch counter, used as the x-axis of the TensorBoard scalars.
        self.model_current_epoch = 0

    def training_step(self, batch, batch_idx, hiddens=None):

        return self._run_step(batch, batch_idx, hiddens, val_flag=False)

    def training_epoch_end(self, outputs):
        """Log the mean training loss over ALL batches (and TBPTT splits) of the epoch."""

        # With truncated BPTT each element of `outputs` is itself a list with one
        # entry per split. Indexing `outputs[0]` would only look at the first batch,
        # so the structure is flattened instead.
        step_outputs = self._flatten_outputs(outputs)

        avg_train_loss = torch.mean(torch.tensor([output['loss'] for output in step_outputs]))
        self.logger.experiment.add_scalar('epoch_loss/training', avg_train_loss, self.model_current_epoch)
        self.model_current_epoch += 1

    def validation_step(self, batch, batch_idx):

        return self._run_step(batch, batch_idx, val_flag=True)

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss; ``val_loss`` is what the checkpoint callback monitors."""

        avg_val_loss = torch.mean(torch.tensor([output['loss'] for output in outputs]))
        self.logger.experiment.add_scalar('epoch_loss/validation', avg_val_loss, self.model_current_epoch)
        self.log('val_loss', avg_val_loss)

    def _flatten_outputs(self, outputs):
        """Flatten arbitrarily nested lists of step-output dicts into one flat list."""

        flat = []
        for item in outputs:
            if isinstance(item, dict):
                flat.append(item)
            else:
                flat.extend(self._flatten_outputs(item))

        return flat

    def _run_step(self, batch, batch_idx, hiddens=None, val_flag=False):
        """Compute the next-token loss of the decoder for one batch (or one TBPTT chunk)."""

        # encoder_batch: [b_size, seq_len] -- full sentences, defined below
        # batch: [b_size, chunk_len] -- the tokens decoded in this step

        if hiddens is None: # validation or first chunk of a training batch

            if isinstance(batch, tuple): # first chunk: (full_batch, batch_chunk)
                encoder_batch, batch = batch

            else: # validation: the decoder sees the whole sentence
                encoder_batch = batch.clone()

            # The encoder is not updated here, so no graph is built for it.
            with torch.no_grad():
                encoder_hiddens = self._get_src_hidden_state(encoder_batch, self.encoder.padding_idx) # [b_size, hid_size]

            hiddens = encoder_hiddens.unsqueeze(0) # [1, b_size, hid_size]

        output, new_hiddens = self.decoder(batch, hiddens)
        # output: [b_size, chunk_len, vocab_size]
        # new_hiddens: [1, b_size, hid_size]
        loss = self.criterion(
            output[:, :-1, :].permute(0, 2, 1),
            batch[:, 1:]
        )

        if self.truncated_bptt_steps == 0 or val_flag:
            return {'loss': loss}

        # The hidden state is carried over to the next chunk of the same batch.
        return {'loss': loss, 'hiddens': new_hiddens}

    def _get_src_hidden_state(self, batch, padding_idx): # [b_size, seq_len]
        """Return the encoder output at the last token before the first padding id.

        Rows that contain no padding id use their last position; the result is
        always ``[b_size, hid_size]``, including for a batch of one sentence.
        """

        output, _ = self.encoder(batch, logits_flag=False) # [b_size, seq_len, hid_size]

        dim0, dim1, dim2 = output.shape

        is_pad = batch == padding_idx
        first_pad_idx = is_pad.float().argmax(dim=-1) # 0 when a row has no padding at all
        # A row without padding ends at its last position, not at index -1.
        first_pad_idx = torch.where(
            is_pad.any(dim=-1),
            first_pad_idx,
            torch.full_like(first_pad_idx, dim1)
        )
        last_hidden_state_idx = (first_pad_idx - 1).clamp(min=0)
        idcs = last_hidden_state_idx.reshape(dim0, 1).expand(dim0, dim2)[:, None, :] # [b_size, 1, hid_size]

        # squeeze(1) removes only the gathered sequence axis; a bare squeeze()
        # would also drop the batch axis when b_size == 1.
        h = torch.gather(output, dim=1, index=idcs).squeeze(1) # [b_size, hid_size]

        return h

    def tbptt_split_batch(self, batch, split_size):
        """Split ``batch`` along the sequence axis; the first item is ``(full_batch, first_chunk)``."""

        splits = []

        # range() only produces start offsets inside the sequence, so every chunk
        # is non-empty and no extra emptiness check is required.
        for start in range(0, batch.shape[1], split_size):

            split = batch[:, start:start + split_size]

            # The encoder has to see the whole batch to produce the initial hidden state.
            splits.append((batch, split) if start == 0 else split)

        return splits

    def configure_optimizers(self):

        # Only the decoder is optimized; the encoder's weights are left as they are.
        return torch.optim.Adam(self.decoder.parameters(), **self.optimizer_parameters)


## Training

In [23]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

In [24]:
vocab_size = tokenizer.vocab_size
emb_size = 512  # token embedding width
hid_size = 512  # number of GRU hidden units of the decoder
padding_idx = tokenizer.pad_token_id  # id used for padding by the tokenizer

In [25]:
# Targets equal to the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimzer_parameters = {'lr': 3e-4}  # keyword arguments for the Adam optimizer built by the trainer module

In [26]:
# Build the decoder; the trainer module also receives the existing en_encoder object.
en_decoder = Decoder(vocab_size, emb_size, padding_idx, hid_size)
en_decoder_trainer = DecoderTrainer(en_encoder, en_decoder, criterion, optimzer_parameters)

---

In [27]:
# base trainer parameters
accelerator = 'gpu' if torch.cuda.is_available() else None
print(f'accelerator: {accelerator}')

n_epochs = 1
# number of batches whose gradients are accumulated: 256 samples per optimizer step
acc_steps = 256 // b_size

# logger (TensorBoard event files are written under experiments_dir)
experiments_dir = 'en_pretraining/decoder'
logger = TensorBoardLogger(save_dir=experiments_dir)

# callbacks: keep only the single checkpoint with the lowest logged 'val_loss'
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor='val_loss',
    mode='min',
)

accelerator: gpu


In [28]:
# Trainer for the decoder pre-training run, built from the settings defined above.
trainer = pl.Trainer(
    max_epochs=n_epochs,
    accumulate_grad_batches=acc_steps,
    accelerator=accelerator,
    logger=logger,
    callbacks=[checkpoint_callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [29]:
# !rm -rf en_pretraining/decoder/lightning_logs

In [30]:
# %reload_ext tensorboard
# %tensorboard --logdir en_pretraining/decoder/lightning_logs

In [31]:
# Enable truncated BPTT: training batches are cut along the sequence axis by
# DecoderTrainer.tbptt_split_batch (presumably called by Lightning with split_size=30 — see Lightning 1.x docs).
en_decoder_trainer.truncated_bptt_steps = 30
trainer.fit(en_decoder_trainer, en_train_loader, en_val_loader)

Missing logger folder: en_pretraining/decoder/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | encoder   | Encoder          | 57.6 M
1 | decoder   | Decoder          | 58.0 M
2 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
115 M     Trainable params
0         Non-trainable params
115 M     Total params
462.715   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# Seq2Seq: english encoder -> english decoder

In [120]:
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

In [151]:
class Seq2Seq(pl.LightningModule):
    """Encoder -> bridge -> decoder model, trained with teacher forcing and tested with BLEU.

    The encoder output at the last non-padding position of the source is passed
    through ``feed_forward`` and used as the decoder's initial hidden state.

    Args:
        encoder: module called as ``encoder(batch, logits_flag=False)``; must expose ``hid_size``.
        decoder: module called as ``decoder(batch, hiddens)`` that returns ``(logits, hidden_state)``.
        criterion: loss called as ``criterion(logits, targets)`` with logits of
            shape ``[b_size, vocab_size, seq_len - 1]``.
        optimizer_parameters: keyword arguments forwarded to ``torch.optim.Adam``.
        tokenizer: provides ``pad_token_id``, ``cls_token_id``, ``sep_token_id`` and ``decode``.
        feed_forward: module mapping the encoder state to the decoder state; when
            ``None`` a frozen identity ``nn.Linear`` is created.
    """

    def __init__(self, encoder, decoder, criterion, optimizer_parameters, tokenizer, feed_forward=None):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.feed_forward = self.init_feed_forward(encoder.hid_size) if feed_forward is None else feed_forward

        self.criterion = criterion
        self.optimizer_parameters = optimizer_parameters

        self.tokenizer = tokenizer
        self.max_len = 100      # maximum number of tokens sampled by _predict
        self.temperature = 1.   # softmax temperature used by _predict

        # Own epoch counter, used as the x-axis of the TensorBoard scalars.
        self.model_current_epoch = 0

    def init_feed_forward(self, hid_size):
        """Build the default bridge: a bias-free linear layer frozen at the identity matrix."""

        linear = nn.Linear(hid_size, hid_size, bias=False)
        linear.weight.data = torch.eye(hid_size)
        linear.weight.requires_grad = False

        return linear.eval()

    def forward(self, src, trg):
        """Return decoder logits ``[b_size, seq_len, vocab_size]`` for ``trg`` conditioned on ``src``."""

        # src: [b_size, seq_len]
        # trg: [b_size, seq_len]

        src_h = self._get_src_hidden_state(src, self.tokenizer.pad_token_id) # [b_size, hid_size]

        trg_h = self.feed_forward(src_h).unsqueeze(0) # [1, b_size, hid_size]

        trg_output, _ = self.decoder(trg, trg_h) # [b_size, seq_len, vocab_size]

        return trg_output

    def _get_src_hidden_state(self, batch, padding_idx): # [b_size, seq_len]
        """Return the encoder output at the last token before the first padding id.

        Rows that contain no padding id use their last position; the result is
        always ``[b_size, hid_size]``, including for a batch of one sentence.
        """

        output, _ = self.encoder(batch, logits_flag=False) # [b_size, seq_len, hid_size]

        dim0, dim1, dim2 = output.shape

        is_pad = batch == padding_idx
        first_pad_idx = is_pad.float().argmax(dim=-1) # 0 when a row has no padding at all
        # A row without padding ends at its last position, not at index -1.
        first_pad_idx = torch.where(
            is_pad.any(dim=-1),
            first_pad_idx,
            torch.full_like(first_pad_idx, dim1)
        )
        last_hidden_state_idx = (first_pad_idx - 1).clamp(min=0)
        idcs = last_hidden_state_idx.reshape(dim0, 1).expand(dim0, dim2)[:, None, :] # [b_size, 1, hid_size]

        # squeeze(1) removes only the gathered sequence axis; a bare squeeze()
        # would also drop the batch axis when b_size == 1.
        h = torch.gather(output, dim=1, index=idcs).squeeze(1) # [b_size, hid_size]

        return h

    def training_step(self, batch, batch_idx):

        return self._run_step(batch, batch_idx)

    def training_epoch_end(self, outputs):
        """Log the mean training loss of the epoch."""

        avg_train_loss = torch.mean(torch.tensor([output['loss'] for output in outputs]))

        self.logger.experiment.add_scalar('epoch_loss/training', avg_train_loss, self.model_current_epoch)
        self.log('train_loss', avg_train_loss)
        self.model_current_epoch += 1

    def validation_step(self, batch, batch_idx):

        return self._run_step(batch, batch_idx)

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss; ``val_loss`` is what the checkpoint callback monitors."""

        # validation_step returns bare loss tensors, so no dict lookup here.
        avg_val_loss = torch.mean(torch.tensor([output for output in outputs]))

        self.logger.experiment.add_scalar('epoch_loss/validation', avg_val_loss, self.model_current_epoch)
        self.log('val_loss', avg_val_loss)

    def _run_step(self, batch, batch_idx):
        """Teacher-forced loss; a plain tensor batch is used as both source and target."""

        if not isinstance(batch, tuple):
            src = batch
            trg = batch.clone()
        else:
            src, trg = batch

        output = self(src, trg)
        loss = self.criterion(output[:, :-1, :].permute(0, 2, 1), trg[:, 1:])

        return loss

    def configure_optimizers(self):

        return torch.optim.Adam(
            [
                {'params': self.encoder.parameters()},
                {'params': self.decoder.parameters()},
                {'params': self.feed_forward.parameters()}
            ],
            **self.optimizer_parameters
        )

    def test_step(self, batch, batch_idx):
        """Sample a translation for every sentence of the batch and return the batch BLEU."""

        if not isinstance(batch, tuple): # src2src translation
            src = batch
            trg = batch.clone()
        else: # src2trg translation
            src, trg = batch

        src_h = self._get_src_hidden_state(src, self.tokenizer.pad_token_id) # [b_size, hid_size]
        trg_h = self.feed_forward(src_h) # [b_size, hid_size]

        bleu_score = self._compute_bleu_score(trg, trg_h)

        # Without logging, trainer.test() has no metric to report.
        self.log('test_bleu', bleu_score)

        return bleu_score

    def _compute_bleu_score(self, trg, trg_h):
        """Corpus BLEU between the decoded targets and sampled hypotheses.

        The decoded strings are kept on ``self.references`` / ``self.hypotheses``
        for inspection; BLEU itself is computed on whitespace-separated tokens.
        """

        # trg: [b_size, seq_len]
        # trg_h: [b_size, hid_size]

        references, hypotheses = [], []

        for sample_idx in range(trg.shape[0]):
            reference = trg[sample_idx, :] # [seq_len]
            reference = self.tokenizer.decode(reference, skip_special_tokens=True)

            hypothesis = self._predict(trg_h[sample_idx])
            hypothesis = self.tokenizer.decode(hypothesis, skip_special_tokens=True)

            references.append([reference])
            hypotheses.append(hypothesis)

        self.references = references
        self.hypotheses = hypotheses

        # corpus_bleu works on token sequences; given raw strings it would
        # silently score character n-grams instead of word n-grams.
        tokenized_references = [[reference.split() for reference in refs] for refs in references]
        tokenized_hypotheses = [hypothesis.split() for hypothesis in hypotheses]

        return corpus_bleu(tokenized_references, tokenized_hypotheses)

    @torch.no_grad()
    def _predict(self, trg_h): # [hid_size]
        """Sample token ids one at a time, starting from ``cls`` and stopping at ``sep`` or ``max_len``."""

        start_id = self.tokenizer.cls_token_id
        end_id = self.tokenizer.sep_token_id

        hiddens = trg_h.unsqueeze(0).unsqueeze(0) # [n_layers, b_size, hid_size] == [1, 1, hid_size]

        pred_ids = [start_id]

        for _ in range(self.max_len):

            input_id = torch.tensor([[pred_ids[-1]]], dtype=torch.long, device=hiddens.device) # [1, 1]

            # The updated state must be fed into the next step; otherwise every
            # token would be predicted from the initial state and the previous token only.
            pred, hiddens = self.decoder(input_id, hiddens) # [1, 1, vocab_size]
            pred = pred.squeeze() # [vocab_size]

            probs = nn.functional.softmax(pred / self.temperature, dim=-1).cpu().numpy()

            token_id = np.random.choice(len(probs), p=probs)
            pred_ids.append(int(token_id))

            if token_id == end_id:
                break

        return pred_ids

    def set_max_len(self, max_len):

        self.max_len = max_len

    def set_temperature(self, temperature):

        self.temperature = temperature

In [152]:
# Targets equal to the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
optimizer_parameters = {'lr': 3e-4}
# feed_forward is left at its default (None), so Seq2Seq builds its own frozen identity bridge.
seq2seq = Seq2Seq(en_encoder, en_decoder, criterion, optimizer_parameters, tokenizer)

---

In [110]:
# base trainer parameters
accelerator = 'gpu' if torch.cuda.is_available() else None
print(f'accelerator: {accelerator}')

n_epochs = 1
# number of batches whose gradients are accumulated: 256 samples per optimizer step
acc_steps = 256 // b_size

# logger (TensorBoard event files are written under experiments_dir)
experiments_dir = 'en_pretraining/seq2seq'
logger = TensorBoardLogger(save_dir=experiments_dir)

# callbacks: keep only the single checkpoint with the lowest logged 'val_loss'
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor='val_loss',
    mode='min',
)

accelerator: gpu


In [111]:
# Trainer for the joint encoder-decoder run, built from the settings defined above.
trainer = pl.Trainer(
    max_epochs=n_epochs,
    accumulate_grad_batches=acc_steps,
    accelerator=accelerator,
    logger=logger,
    callbacks=[checkpoint_callback]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [112]:
# !rm -rf en_pretraining/seq2seq/lightning_logs

In [113]:
# %reload_ext tensorboard
# %tensorboard --logdir en_pretraining/seq2seq/lightning_logs

In [114]:
# Train the seq2seq model; batches are plain tensors, so each sentence is both source and target.
trainer.fit(seq2seq, en_train_loader, en_val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type             | Params
--------------------------------------------------
0 | encoder      | Encoder          | 57.6 M
1 | decoder      | Decoder          | 58.0 M
2 | feed_forward | Linear           | 262 K 
3 | criterion    | CrossEntropyLoss | 0     
--------------------------------------------------
115 M     Trainable params
262 K     Non-trainable params
115 M     Total params
463.764   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

---

In [None]:
# Runs Seq2Seq.test_step on the validation loader: a reconstruction is sampled for each sentence and scored with BLEU.
trainer.test(seq2seq, en_val_loader)