## LSTM + Attention


In [1]:
!pip install torchtext==0.11.0
!pip  install subword-nmt
!pip install wget

Collecting torchtext==0.11.0
  Downloading torchtext-0.11.0-cp37-cp37m-manylinux1_x86_64.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 5.1 MB/s 
Collecting torch==1.10.0
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.3 MB/s eta 0:00:38tcmalloc: large alloc 1147494400 bytes == 0x38da8000 @  0x7f23e1d9d615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 19 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existin

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.datasets import TranslationDataset, Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy

import random
import math
import time

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

import tqdm
from nltk.translate.bleu_score import corpus_bleu

import utils
get_text = utils.get_text


In [2]:
torchtext.__version__

'0.11.0'

In [3]:
from data_preparation import DataPreparation

0.11.0


In [4]:
dataclass = DataPreparation()

In [5]:
train_data, valid_data, test_data = dataclass.data_pipeline()

download data
creating dataset
create train, valid and test data
Number of training examples: 40000
Number of validation examples: 2500
Number of testing examples: 7500
build vocab
Unique tokens in source (ru) vocabulary: 9305
Unique tokens in target (en) vocabulary: 6695


### Model side
__Here comes simple pipeline of NMT model learning. It almost copies the week03 practice__

In [149]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [150]:
device

device(type='cuda')

In [151]:
def _len_sort_key(x):
    return len(x.src)

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key=_len_sort_key
)

In [152]:
import LSTM_Attention
Encoder = LSTM_Attention.Encoder
Decoder = LSTM_Attention.Decoder
Attention = LSTM_Attention.Attention
Seq2Seq = LSTM_Attention.Seq2Seq

In [153]:
INPUT_DIM = len(dataclass.SRC.vocab)
OUTPUT_DIM = len(dataclass.TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)

In [154]:
def initialize_weights(m):
  if hasattr(m, 'weight') and m.weight.dim() > 1:
    nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9305, 256)
    (rnn): LSTM(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(6695, 256)
    (rnn): LSTM(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=6695, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [155]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 24,240,423 trainable parameters


In [156]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in  enumerate(tqdm.tqdm_notebook((iterator))):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [157]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    original_text = []
    generated_text = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm.tqdm_notebook((iterator))):
            src = batch.src
            trg = batch.trg
            output = model(src, trg, 0) #turn off teacher forcing

            output_for_bleu = output.argmax(dim=-1)
            original_text.extend([get_text(x, dataclass.TRG.vocab) for x in trg.cpu().numpy().T])
            generated_text.extend([get_text(x, dataclass.TRG.vocab) for x in output_for_bleu[1:].detach().cpu().numpy().T])

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)

            epoch_loss += loss.item()
    bleu = corpus_bleu([[text] for text in original_text], generated_text) * 100        
    return epoch_loss / len(iterator), bleu

train

In [None]:
N_EPOCHS = 20
CLIP = 1
train_history = []
valid_history = []

best_valid_loss = float('inf')



lr = 1e-2
PAD_IDX = dataclass.TRG.vocab.stoi['<pad>']
optimizer = optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)


for epoch in range(N_EPOCHS):   
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss, bleu = evaluate(model, valid_iterator, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    train_history.append(train_loss)
    valid_history.append([valid_loss, bleu])
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | lr: {optimizer.param_groups[0]["lr"]}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | val. bleu: {bleu}')
    if epoch!=0 and epoch % 2 ==0:
      scheduler.step()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/625 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/40 [00:00<?, ?it/s]

	Train Loss: 3.305 | Train PPL:  27.236 | lr: 0.01
	 Val. Loss: 4.831 |  Val. PPL: 125.283 | val. bleu: 14.000707054216404


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

__Let's take a look at our network quality__:

In [None]:
import imp
imp.reload(utils)
generate_translation = utils.generate_translation
remove_tech_tokens = utils.remove_tech_tokens
flatten = utils.flatten

In [None]:
batch = next(iter(test_iterator))

In [None]:
for idx in [1,2]:
    src = batch.src[:, idx:idx+1]
    trg = batch.trg[:, idx:idx+1]
    generate_translation(src, trg, model, dataclass.TRG.vocab)

Original: there is a 24 - hour front desk at the property .
Generated: a a - screen tv .

Original: the property offers free parking .
Generated: free parking parking is available on site .



In [None]:
original_text = []
generated_text = []
model.eval()
with torch.no_grad():
    for i, batch in tqdm.tqdm(enumerate(test_iterator)):
        src = batch.src
        trg = batch.trg
        output = model(src, trg, 0)
        original_text.extend([get_text(x, dataclass.TRG.vocab) for x in trg.cpu().numpy().T])
        generated_text.extend([get_text(x, dataclass.TRG.vocab) for x in output[1:].detach().cpu().numpy().T])

59it [00:08,  6.63it/s]


In [None]:
corpus_bleu([[text] for text in original_text], generated_text) * 100

8.742665469856739

Baseline solution BLEU score is quite low. Try to achieve at least __18__ BLEU on the test set. 
The checkpoints are:

* __18__ - minimal score to submit the homework, 30% of points

* __20__ - good score, 70% of points

* __25__ - excellent score, 100% of points