# Assignment 7

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [8]:
!pip install sentencepiece



In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [0]:
import sentencepiece as spm
import torchtext
from torchtext import data
from torchtext import datasets
from nltk.translate.bleu_score import corpus_bleu
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np

from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings('ignore')

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import copy 



In [12]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!mkdir data/
!cp drive/My\ Drive/assignment_7/news-commentary-v13.ru-en.en data/
!cp drive/My\ Drive/assignment_7/news-commentary-v13.ru-en.ru data/

In [0]:
from tqdm import tqdm
from IPython.display import clear_output

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

----

### Предобработка данных

In [0]:
def get_data_obj(text_file, prefix, vocab_size=32000, char_coverage=0.98, model_type='bpe', is_src=False):
    with open(text_file) as f:
        dir_ = text_file.split('/')[0]
        path = '/'.join([dir_, 'text.'+ prefix.split('_')[1]])
        with open(path, 'w') as out:
            out.write(f.read().lower())
    
    request = ' '.join(["--input=" + path, 
                        "--model_prefix=" + prefix,
                        "--vocab_size=" + str(vocab_size),
                        "--character_coverage=" + str(char_coverage),
                        "--model_type=" + model_type])
    spm.SentencePieceTrainer.Train(request)
    token = spm.SentencePieceProcessor()
    token.load('.'.join([prefix , 'model']))
    tokenizer = lambda x: token.encode_as_pieces(x)
    obj = torchtext.data.Field(
        fix_length=50,
        pad_token='<pad>',
        unk_token='<unk>',
        init_token='<s>',
        eos_token='</s>',
        lower=True,
        tokenize = tokenizer,
        batch_first=True,
    )
    
    return obj

In [0]:
TGT = get_data_obj(text_file='data/news-commentary-v13.ru-en.en',
                   prefix="bpe_en")

SRC = get_data_obj(text_file="data/news-commentary-v13.ru-en.ru", is_src=True,
                    prefix='bpe_ru', char_coverage=1.)

In [0]:
fields = [('src', SRC), ('trg', TGT)]

In [19]:
with open('data/text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('data/text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm(zip(src_snt, tgt_snt))]
test = data.Dataset(examples[-1000:], fields)

split_id = round(len(examples[:-1000])*0.9)
train = data.Dataset(examples[:split_id], fields)
valid = data.Dataset(examples[split_id:-1000], fields)

# будем брать одни и те же данные для воспроизводимости и возможности сохранения
# и, соответственно, загрузки моделей.

235159it [01:34, 2493.25it/s]


In [20]:
print('src: ' + " ".join(train.examples[150].src))
print('tgt: ' + " ".join(train.examples[150].trg))

src: ▁несколько ▁недель ▁спустя , ▁я ▁столкнулась ▁с ▁одним ▁из ▁подобных ▁конфликтов : ▁собрание ▁членов ▁правления ▁здесь , ▁конференция ▁там , ▁и ▁еще ▁одна ▁перспектива ▁в ▁то ▁же ▁самое ▁время ▁еще ▁в ▁одном ▁месте .
tgt: ▁a ▁couple ▁of ▁weeks ▁later , ▁i ▁was ▁faced ▁with ▁one ▁of ▁those ▁conflicts : ▁a ▁board ▁meeting ▁here , ▁a ▁conference ▁there , ▁another ▁opportunity ▁at ▁the ▁same ▁time ▁somewhere ▁else .


In [21]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [0]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

In [0]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data).detach()
        return tgt_mask

class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.trg, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion(nn.Module):
    def __init__(self, pad_idx):
        super(MyCriterion, self).__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        
    def forward(self, x, target):
        x = x.contiguous().permute(0,2,1)
        ntokens = (target != self.pad_idx).data.sum()
        
        return self.criterion(x, target) / ntokens

In [0]:
torch.cuda.empty_cache()


train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(64, 64, 1), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=device,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

## Вспомогательные функции

In [0]:
#model from transformer.py

class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, batch):
        src, tgt = batch.src, batch.trg
        src_mask, tgt_mask = batch.src_mask, batch.trg_mask
        "Take in and process masked src and target sequences."
        return self.decode(tgt, tgt_mask, self.encode(src, src_mask), src_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, tgt, tgt_mask, memory, src_mask):
        x = self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
        x = self.generator(x)
        return x

class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)


def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
    
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
    
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

    
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
    
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)


def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / np.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
    
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))
    
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * np.sqrt(self.d_model)
    
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].clone().detach()
        return self.dropout(x)
    
    
def make_model(src_vocab, tgt_vocab, N=4, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [0]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0


-----

### Создание и обучение модели

In [0]:
pad_idx = TGT.vocab.stoi['<pad>']
model = make_model(len(SRC.vocab), len(TGT.vocab), N=2)
# model.cuda()
model_opt = get_std_opt(model)
criterion = MyCriterion(pad_idx)
# criterion.cuda();

In [0]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    counter = 0
    for batch in data_iter:
        pred = model.forward(batch)
        loss = criterion(pred, batch.trg_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        model_opt.step()
        
        total_loss += loss
        # data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    counter = 0
    for batch in data_iter:
        pred = model.forward(batch)
        clear_output(True)
        loss = criterion(pred, batch.trg_y).item()
        print(loss)
        total_loss += loss
        # data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

#shared weights
model.generator.proj.weight = model.tgt_embed[0].lut.weight

for epoch in range(10):
    model.train()
    loss = train_epoch(train_iter, model, criterion)
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        print('valid', loss)

4.104262828826904
valid 2.3165018174166234


In [0]:
def beam_search(model, src, src_mask, max_len=5, k=5):
    memory = model.encode(src, src_mask)
    start_s = torch.ones(1, 1).fill_(TGT.vocab.stoi['<s>']).type_as(src.data)
    beam = [(start_s, 0)]
    for i in range(max_len):
        chars = []
        probas = []
        for ys, log_prob in beam:
            if ys[0][-1] == TGT.vocab.stoi['</s>']:
                chars.append(ys)
                probas.append(log_prob)
            else:
                tgt_mask = subsequent_mask(ys.size(-1)).type_as(src.data).detach().to(device)
                probs = model.decode(ys, tgt_mask, memory, src_mask)[0][i]
                probs, idxs = torch.topk(probs, k)
                for idx, new_c in enumerate(idxs.squeeze(0)):
                    new_c_tensor = torch.ones(1, 1).type_as(src.data).fill_(new_c).to(device)
                    chars.append(torch.cat([ys, new_c_tensor], dim=1))
                    probas.append(log_prob + np.log(probs[idx].item()))
        probas = torch.tensor(probas)
        vals, idxs = torch.topk(probas, k)
        beam = [(c, p.item()) for c, p in zip(chars, probas)]
    return beam

In [0]:
torch.save(model.state_dict(), 'model_de_best.pth.tar')

In [0]:
!cp model_de_best.pth.tar drive/My\ Drive/assignment_7/

In [0]:
model_path = 'drive/My Drive/assignment_7/model_de_best.pth.tar'

model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda')))

In [0]:
hypotheses = []
references = []

model.eval()
with torch.no_grad():
    for batch in test_iter:        
        src = batch.src[:1]
        trg = batch.trg[:1][0]
        src_mask = src != SRC.vocab.stoi["<pad>"]

        beam = beam_search(model, src, src_mask)

        pred, log_proba = beam[0]
        tokens = pred[0]

        new_hyp = []
        for i in range(1, len(tokens)):
            sym = TGT.vocab.itos[tokens[i]]
            if sym == '</s>': break
            new_hyp.append(sym)
        hypotheses.append(new_hyp)

        new_ref = []
        for i in range(1, len(trg)):
            sym = TGT.vocab.itos[trg[i]]
            if sym == '</s>': break
            new_ref.append(sym)
        references.append([new_ref])

In [0]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [0]:
final_score = corpus_bleu(references[1:], hypotheses[1:],
                          smoothing_function=SmoothingFunction().method3,
                          auto_reweigh=True)

In [0]:
print("final score is {}".format(final_score))

final score is 0.19645950075706967


Оценим количество неизвестных токенов в предложениях

In [0]:
unk_token = TGT.vocab.stoi['<unk>']

sum_unks = 0
for tokenized in references[1:]:
    for token in tokenized[0]:
        if TGT.vocab.stoi[token] == unk_token:
            sum_unks += 1
            break

print(sum_unks/len(references[1:]))

0.1061061061061061


Видно, что предложений с оригинальным переводом, в которых встречаются неизвестные токены, около 10%, но несмотря на это модель позволяет добиться сколько-то приемлемого качества.