# HW1: seq2seq nmt

**Homework Goals**

1. Get familiar with text data preparation
2. Learn to work with RNN
3. Train the model to translate `en-->ru`.



In [None]:
%matplotlib inline
import re
import string
import unicodedata
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

## A naive way to represent text:

0. Normalize spelling
1. Filter out all special characters
2. Split by spaces, do *naive tokenization*

In [None]:
# Prepare data and look at it
# In addition to the dictionary, we are also interested in a set of characters
# Character inventories filled in as a side effect of preprocess():
# raw_alphabet collects every character of the raw input, alphabet every
# character that is still present after preprocessing.
raw_alphabet = set()
alphabet = set()

# NFD decomposes the Cyrillic short i into "и" + combining breve. Dropping
# that breve like any other diacritic would silently turn "й" into "и"
# (e.g. "мой" -> "мои"), so this one combination is recomposed instead.
_COMBINING_BREVE = "\u0306"
_SHORT_I = {"\u0438": "\u0439", "\u0418": "\u0419"}


def normalize(s):
    """Strip diacritics from ``s`` while keeping the Cyrillic letter "й".

    The string is decomposed (NFD) and every non-spacing mark (category
    ``Mn``) is removed, so "é" becomes "e" and "ё" becomes "е". The only
    exception is a breve that follows "и"/"И": it is folded back into
    "й"/"Й", because that is a distinct letter rather than an accent.

    Args:
        s: input string.

    Returns:
        The string without combining marks.
    """
    kept = []
    for c in unicodedata.normalize('NFD', s):
        if unicodedata.category(c) != 'Mn':
            kept.append(c)
        elif c == _COMBINING_BREVE and kept and kept[-1] in _SHORT_I:
            # kept[-1] is the base letter this mark is attached to.
            kept[-1] = _SHORT_I[kept[-1]]
    return "".join(kept)


def preprocess(s):
    """Lower-case, normalize and naively tokenize a sentence.

    Side effects: the characters of the raw input are added to
    ``raw_alphabet`` and the characters of the result to ``alphabet``.

    The function is idempotent: preprocessing an already preprocessed
    string returns it unchanged, so it is safe to apply it more than once.

    Args:
        s: raw sentence.

    Returns:
        The cleaned sentence: only ``a-z``, ``а-я``, ``? . , !`` and single
        spaces remain, and each of ``. ! ?`` is preceded by exactly one space.
    """
    raw_alphabet.update(s)
    s = normalize(s.lower().strip())
    # Every run of characters outside the whitelist collapses to one space.
    s = re.sub(r"[^a-zа-я?.,!]+", " ", s)
    # Consume an optional existing space so that repeated application does not
    # pile up spaces in front of the punctuation mark.
    s = re.sub(r" ?([.!?])", r" \1", s)
    # The substitutions above can leave a space at either end.
    s = s.strip()
    alphabet.update(s)
    return s

# Load the tab-separated parallel corpus and preprocess every sentence.
pairs = []
# The encoding is given explicitly: the corpus contains Cyrillic text and the
# platform default encoding is not guaranteed to decode it.
with open('eng-rus.txt', 'r', encoding='utf-8') as fin:
    for line in tqdm(fin.readlines()):
        # Only the first two columns form the sentence pair; any extra
        # trailing column would break the later two-way unpacking of `pairs`.
        pair = [preprocess(_) for _ in line.split('\t')[:2]]
        pairs.append(pair)

# Report what preprocessing did to the character inventory.
print("RAW alphabet {} symbols:".format(len(raw_alphabet)), 
      "".join(sorted(raw_alphabet)))
print("After preprocessing {} symbols: ".format(len(alphabet)), 
      "".join(sorted(alphabet)))
print("There are {} pairs".format(len(pairs)))
print(pairs[10000])

Each word will be assigned a number, and we will also need special tokens for padding, for the beginning and end of the sequence, and for unknown words:
`<PAD>, <SOS>, <EOS>, <UNK>`

We have two languages, to work with each we need functions for translating from words to numbers and vice versa.

It is proposed to implement these functions as dictionaries. Allocate the first 4 numbers for special tokens

**(1 point)** Implement the dictionary-building function. The function takes a list of strings (normalized sentences that can be split on spaces) as input. Organize the dictionary in a reasonable way so that rare words can be thrown out if necessary.

In [None]:
# Special tokens; per the task text they occupy the first four indices.
COMMON_TOKENS = ['PAD', 'SOS', 'EOS', 'UNK']


def build_vocabs(sents, max_size=1000):
    """Build the token <-> index lookup tables for one language (to be implemented).

    Args:
        sents: normalized sentences of one language.
        max_size: limit on the vocabulary size; presumably this includes the
            special tokens -- decide and document when implementing.

    Returns:
        ``(tok2idx, idx2tok)``. The rest of the notebook uses ``tok2idx`` as a
        mapping (``tok2idx['UNK']``, ``tok2idx.get(token, unk)``) and indexes
        ``idx2tok`` by integer.
    """
    <your code>
    return tok2idx, idx2tok


# Transpose the list of pairs into one tuple of sentences per language.
eng, rus = list(zip(*pairs))
rus2idx, idx2rus = build_vocabs(rus, max_size=10000)
eng2idx, idx2eng = build_vocabs(eng, max_size=5000)

In [None]:
def sentence2idx(s, tok2idx):
    """Encode a sentence as a list of token indices framed by SOS and EOS.

    Args:
        s: sentence; it is passed through ``preprocess`` before tokenization.
        tok2idx: mapping from token to index; must contain the keys
            ``'SOS'``, ``'EOS'`` and ``'UNK'``.

    Returns:
        ``[SOS, token indices..., EOS]``; tokens missing from ``tok2idx`` are
        replaced by the index of ``'UNK'``.
    """
    # split() without a separator never yields empty strings, so repeated or
    # leading/trailing spaces cannot turn into spurious (unknown) tokens.
    tokens = preprocess(s).split()
    unk = tok2idx['UNK']
    return [tok2idx['SOS']] + [tok2idx.get(_, unk) for _ in tokens] + [tok2idx['EOS']]


def idx2sentence(s, idx2tok):
    """Decode a sequence of token indices into a space-separated string.

    Args:
        s: iterable of indices.
        idx2tok: lookup from index to token (anything supporting ``[index]``).

    Returns:
        The looked-up tokens joined by single spaces.
    """
    words = [idx2tok[index] for index in s]
    return " ".join(words)

# Check the consistency of the transformations: encode a sample sentence of
# each language and decode the indices back to text.
x = sentence2idx('Привет мир!', rus2idx)
print(x)
print(idx2sentence(x, idx2rus))

x = sentence2idx('Hello world!', eng2idx)
print(x)
print(idx2sentence(x, idx2eng))

## Dealing with arbitrary length sequences in pytorch

We need to be able to generate batches as `[bs, seq_len]` tensors.
But in our dataset, the samples are of different lengths:

- we could cut everything down to the minimum length
- pad to the maximum length
- choose some average length

**(1 point)** Split the dataset into training and validation sets:

In [None]:
# make a dataset with encoded pairs:
class EngRusDataset(Dataset):
    """Map-style dataset over a sequence of ``(eng, rus)`` pairs.

    Indexing returns the two halves of the stored pair under the keys
    ``'eng'`` and ``'rus'``; the values are passed through unchanged.
    """

    def __init__(self, pairs):
        # Kept by reference; no copy or conversion is made.
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, item):
        source, target = self.pairs[item]
        return {"eng": source, "rus": target}

# Encode every sentence pair as two lists of token indices.
# NOTE: the loop rebinds the module-level names `eng` and `rus` (previously the
# per-language sentence tuples) to single sentences.
encoded = []
for eng, rus in tqdm(pairs):
    a = sentence2idx(eng, eng2idx)
    b = sentence2idx(rus, rus2idx)
    encoded.append((a, b))

    
# To be implemented: split `encoded` into a training and a validation part
# and wrap each part in an EngRusDataset.
<your code>
trainset = EngRusDataset(...)
valset = EngRusDataset(...)

Let's build a naive DataLoader and check how it makes batches:


In [None]:
# No collate_fn is passed, so the DataLoader's default batching is used; the
# point of this cell is to inspect what that does to variable-length samples.
trainloader = DataLoader(trainset, batch_size=8, shuffle=True)
it = iter(trainloader)

In [None]:
# Take one batch and show its English side (the bare last expression is what
# the notebook displays).
batch = next(it)['eng']
batch

In my case, the result was:
```
[tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([ 6,  7,  6, 15,  5,  6,  5, 62]),
 tensor([ 48,  34,  83,   7,  32, 221,  22,  43]),
 tensor([  5, 143,  37,  36, 129,  12,  11,  66]),
 tensor([  73, 1258,  279,    8,    6,  555,   41,   10]),
 tensor([  8, 140,   8, 628,  20,  96,  13, 270]),
 tensor([  47,    4,   15,   18,   55,  269,    6, 1287]),
 tensor([ 58,   2,  13, 140, 193, 140, 171, 140])]
```

What's weird here?
1. This is not a tensor, but a list of tensors. Accordingly, when indexing along dimension zero (`batch[i]`), we get not the i-th example, but the i-th token of every example in the batch. This is not a problem, but it differs from the expected behavior.
2. Only one example ends with `<EOS>` (2); the others are cut off to match its length. And this is a problem.

We would like to pad all examples to the maximum length in the batch.
But at the stage of preparing an example (in the `__getitem__` function), we do not know its batch neighbors!
To change the batch-merging logic, we need to write our own `collate_fn` function and pass it to the DataLoader constructor:

```
def collate_fn(samples):
    # samples -- list of dictionaries samples
    <...>
    return batch
```

**(1 point)** Write a `collate_fn` function that _correctly_ pads the rus and eng sequences and merges them into batches, where `batch[i, :]` returns the tokens of the `i`-th example.

Expected output (for a sequence with left padding):

```
tensor([[   1,   10, 3429,  405,  113,  676,   10, 1031,  140,    4,    2],
        [   0,    1,   57,   18,   23,   19,   61,    7,  140,    4,    2],
        [   0,    0,    0,    1,   16,   17, 1131,  416,  140,    4,    2],
        [   0,    0,    0,    1,   13,  465,   75,  197,  140,    4,    2],
        [   0,    0,    0,    1,    6,  302,   13,  144,  140,    4,    2],
        [   0,    1,    6,   59,  205,  167,    8,   15,  140,    4,    2],
        [   0,    0,    0,    0,    1,    6,   14,  678,  140,    4,    2],
        [   0,    0,    1,    5,   29,   67,    6,   14,  140,    4,    2]])
```

In [None]:
def collate_fn(samples):
    """Merge a list of dataset samples into one padded batch (to be implemented).

    Args:
        samples: list of dicts as returned by ``EngRusDataset.__getitem__``
            (keys ``'eng'`` and ``'rus'``).

    Returns:
        Dict with the keys ``'rus'`` and ``'eng'``. Per the task text each
        value should be a ``[bs, seq_len]`` tensor whose row ``i`` holds the
        tokens of example ``i``.
    """
    # <your code>
    return dict(
        rus=...,
        eng=...,
    )
    

# Rebuild the loader with the custom collation and display one English batch.
trainloader = DataLoader(trainset, batch_size=8, shuffle=True, collate_fn=collate_fn)
it = iter(trainloader)
next(it)['eng']

Now we have the correct data generator, and all we have to do is write the model (encoder and decoder).


### Encoder

The input tensor contains integers and has dimensions `[bs, seq_len]`,

We will pass them through the layer with embeddings and get the tensor `[bs, seq_len, dim]`. Now we have floating point numbers that can be fed to RNN layers as input.



GRU is an RNN with a specific structure:
<img src="https://habrastorage.org/webt/xt/_q/nj/xt_qnjgfjengqoqd4gizkq4j_wk.png">

In the picture, the yellow rectangles are linear layers with the corresponding activation functions.


`nn.RNN` allows you to create and use multi-layer unidirectional and bidirectional recurrent networks as a single layer.
All parameters must be specified during creation, and then simply applied during the forward pass.


The order of dimensions is a bit different from what is usual in convolutional networks; this is due to the inability to parallelize recurrent calculations effectively.


**batch_first=True**

Such an RNN layer expects two tensors as input:
  - input with sizes `[bs, seq_len, dim]`,
  - hidden_state with dimensions `[num_layers * num_directions, bs, hidden_size]`.
 
 
The output is two tensors:
- output `[bs, seq_len, num_directions * hidden_size]`,
- hidden `[num_layers * num_directions, bs, hidden]`.

We will apply RNN in two ways:
- to the entire sequence, to translate the entire phrase in one language into one vector (EncoderRNN)
- to one tensor and input token to generate a phrase in another language (DecoderRNN)


We will put the entire input sequence into a hidden state vector.

In [None]:
class EncoderRNN(nn.Module):
    """Token embedding followed by a (possibly multi-layer) batch-first GRU.

    Args:
        hidden_size: width of both the embeddings and the GRU hidden state.
        vocab_size: number of rows in the embedding table.
        layers: number of stacked GRU layers.
    """

    def __init__(self, hidden_size, vocab_size, layers=1):
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        # The embedding width doubles as the GRU input width.
        self.embeddings = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=layers,
            batch_first=True,
        )

    def forward(self, input, hidden):
        """Embed the token indices and run them through the GRU.

        Returns:
            ``(output, hidden)`` exactly as produced by the GRU.
        """
        return self.rnn(self.embeddings(input), hidden)

    def init_hidden(self, batch_size=1, device=None):
        """Return an all-zero state of shape ``(layers, batch_size, hidden_size)``.

        The batch axis is second here even though the GRU is batch-first; see
        https://pytorch.org/docs/stable/nn.html#torch.nn.GRU
        """
        shape = (self.layers, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)


# Smoke test: run one English batch through an untrained encoder and print
# the resulting shapes.
enc = EncoderRNN(256, len(eng2idx))
x = next(it)['eng']
print(x.shape)
# 8 is the batch_size of the DataLoader that `it` iterates over.
hidden = enc.init_hidden(8)
out, hidden = enc(x, hidden)
print(out.shape, hidden.shape)

We want the decoder to generate a translation for us -- a sequence of tokens from another language, using the encoder's hidden state vector.

To do this, we will feed the hidden state and the `<SOS>` token to the decoder as input.
At each step, the decoder will return the updated hidden state and an output vector.
The output vector holds the log-probabilities of the next token (accordingly, it has the size of the output-language dictionary).

In [None]:
class DecoderRNN(nn.Module):
    """Embedding -> batch-first GRU -> linear projection -> log-softmax.

    Args:
        hidden_size: width of both the embeddings and the GRU hidden state.
        vocab_size: size of the embedding table and of the output distribution.
        layers: number of stacked GRU layers.
    """

    def __init__(self, hidden_size, vocab_size, layers=1):
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.embeddings = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=layers,
            batch_first=True,
        )
        # Projects every GRU output vector onto vocabulary scores.
        self.out = nn.Linear(hidden_size, vocab_size)
        # Normalizes over the last (vocabulary) axis of a 3-D tensor.
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, hidden):
        """Run the decoder on token indices, starting from ``hidden``.

        Returns:
            ``(log_probs, hidden)``: log-probabilities over the vocabulary for
            each input position, and the updated GRU state.
        """
        rnn_out, hidden = self.rnn(self.embeddings(input), hidden)
        log_probs = self.softmax(self.out(rnn_out))
        return log_probs, hidden

    def init_hidden(self, batch_size=1, device=None):
        """Return an all-zero state of shape ``(layers, batch_size, hidden_size)``."""
        shape = (self.layers, batch_size, self.hidden_size)
        return torch.zeros(shape, device=device)

In [None]:
# Untrained decoder over the Russian vocabulary, with the same hidden size
# (256) as the encoder instantiated earlier.
dec = DecoderRNN(256, len(rus2idx))

Let's get a tensor with tokens of size `[bs, seq_len]` from the data generator and try to iterate over seq_len to generate the next token.

In [None]:
batch = next(it)['rus'] # get batch
bs, seq_len = batch.shape
# `hidden` is whatever state the notebook currently holds -- in the cells
# shown, the encoder state computed in the EncoderRNN demo. It is overwritten
# at every step below.
for i in range(0, seq_len):
    step = batch[:, i].unsqueeze(1)  # get tokens sample for i-th step
    # These are the correct tokens (ground truth), we could generate them
    # unsqueeze adds dimension 1 (from [bs] to [bs, 1])
    output, hidden = dec(step, hidden)
    print(output.shape, hidden.shape)
    # output -- log-probabilities of the next token (the decoder ends in LogSoftmax)
    # hidden -- this is the updated hidden state
    # output -- this is the probability distribution for the next token
    # hidden -- this is the updated hidden state

**(6 points)** Fill in a training part and train the encoder and decoder.

1. You need to implement picking the next token (an integer) from the distribution: a vector of size `len(rus2idx)`. Since we are working in batches, this should be a batched operation. You have several options for how to do this:
 - take by argmax
 - sample from distribution (torch.multinomial)
 - during training, take tokens from ground truth (and this must be done at least sometimes so that the model converges).
 
2. You need to write a loss calculation. It is convenient to do this at each step: after the `<EOS>` occurs in the example, you do not need to count the loss for it (in the vectorized version, you can multiply the loss for `<PAD>`-tokens by zero - this is called masking). Loss is simply the sum of cross-entropy losses for each step.


In [None]:
# Use the GPU when one is available; train() moves the models and every batch
# to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
def train(model, optimizer, dataloader): 
    """Run one pass over ``dataloader``, taking one optimizer step per batch.

    Args:
        model: ``(encoder, decoder)`` pair; both are moved to ``device`` and
            switched to training mode.
        optimizer: optimizer to step; presumably it covers the parameters of
            both modules -- confirm at the call site.
        dataloader: iterable of batches providing ``'rus'`` and ``'eng'``
            tensors.

    Returns:
        ``logs``, whose ``'loss'`` entry lists the loss value of every batch.
    """
    encoder, decoder = model
    encoder.to(device)
    decoder.to(device)
    
    encoder.train()
    decoder.train()
    logs = defaultdict(list)
    for batch in tqdm(dataloader):
        rus = batch['rus'].to(device)
        eng = batch['eng'].to(device)
        # One zero initial state per example in the batch.
        encoder_hidden = encoder.init_hidden(eng.size(0)).to(device)
        # `hidden` summarizes the source sentence and is meant to seed the decoder.
        encoder_outputs, hidden = encoder(eng, encoder_hidden)
        
        # write decoder application and loss calculation.
        # hint: loss must be masked, in case the sequence has already ended.
        # The code inserted here must define `loss`, which is used below.
        <your code>
                
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        logs['loss'].append(loss.item())
    return logs

def validate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` (per-batch logic to be implemented).

    Returns:
        Dict with the same keys as the collected logs; every value is the mean
        of the logged series wrapped in a one-element list.
    """
    logs = defaultdict(list)
    for batch in tqdm(dataloader):
        # The code inserted here must define `loss` for the current batch.
        <your code>
        logs['loss'].append(loss.item())
    
    return {k: [np.mean(v)] for k, v in logs.items()}

def plot_logs(logs):
    """Redraw the training curves from ``logs``.

    Reads ``logs['loss']`` (line plot) and ``logs['steps']`` /
    ``logs['val_loss']`` (validation markers), then draws a second figure
    whose content is to be implemented.
    """
    # NOTE(review): clear_output is not imported in the visible source;
    # presumably IPython.display.clear_output -- confirm and import it.
    clear_output()
    plt.figure()
    plt.plot(logs['loss'], zorder=1)
    # Validation points are drawn on top of the training curve (higher zorder).
    plt.scatter(logs['steps'], logs['val_loss'], marker='+', s=180, c='orange', label='val', zorder=2)
    plt.show()

    plt.figure()
    # use label&legend to display labels
    # plt.plot(..., label=name)
    # plt.legend() 
    <your code>        
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Fresh models, one optimizer over the parameters of both, and the data loaders.
encoder = EncoderRNN(256, len(eng2idx)).to(device)
decoder = DecoderRNN(256, len(rus2idx)).to(device)
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-2)
trainloader = DataLoader(trainset, batch_size=128, shuffle=True, collate_fn=collate_fn)
# The validation data is not shuffled.
valloader = DataLoader(valset, batch_size=128, shuffle=False, collate_fn=collate_fn)
model = (encoder, decoder)


# NOTE(review): train_model is not defined in the visible source; presumably
# the training loop the student writes around train()/validate()/plot_logs().
train_model(model, opt, trainloader, valloader)

**(2 points)** Write a translation function with sampling from a distribution with temperature.

In [None]:
# Switch both modules to evaluation mode and move them to the CPU, where
# evaluate() builds its input tensors.
encoder.eval()
decoder.eval()
encoder = encoder.to("cpu")
decoder = decoder.to("cpu")

def evaluate(sentence, T=1.0):
    """Print sampled Russian translations of an English ``sentence``.

    Args:
        sentence: raw English text; it is encoded with ``sentence2idx``.
        T: sampling temperature, to be used by the decoding step that is
            still to be implemented.

    Returns:
        None; the encoded input and the decoded outputs are printed.
    """
    encoded = sentence2idx(sentence, eng2idx)
    output = []
    print(encoded)
    # The same sentence is repeated `bs` times to form one batch; presumably
    # so that several translations can be sampled in parallel.
    bs = 10
    with torch.no_grad():
      
        z = torch.LongTensor(encoded).view(1, -1).repeat(bs, 1)
        encoder_outputs, hidden = encoder(z, encoder.init_hidden(bs))
        
        # At most 20 decoding steps.
        for i in range(20):
            # The code inserted here must define `tokens`; given the transpose
            # below it is presumably one token id per batch element.
            <your code>
            output.append(tokens)
    
    # Steps were collected along the first axis; transpose so that each row
    # is one generated sequence.
    output = np.array(output).T
    for s in output:
        out = idx2sentence(s, idx2rus)
        print(out.replace('PAD', ""))

    
evaluate("What is going on?")