[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1aR1WGr_5DcZ-IrAqFIN5gIblpZ0q1tGu?usp=sharing)

# Техническая часть

In [1]:
!git clone https://github.com/DanilDmitriev1999/seq2seq

Cloning into 'seq2seq'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 56 (delta 13), reused 46 (delta 9), pack-reused 0[K
Unpacking objects: 100% (56/56), done.


In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import math
import json
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.translate.bleu_score import corpus_bleu

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.cuda import FloatTensor, LongTensor

from torchtext.data import Field, Example, Dataset, BucketIterator

np.random.seed(40)
torch.manual_seed(40)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Данные

In [3]:
class PrepareData:
    def __init__(self, train_path, val_path, batch_size, batch_first=False,
                 include_lingths=True, init_token=False):
        assert type(batch_first) == bool
        assert type(include_lingths) == bool
        assert type(init_token) == bool
        assert train_path[-3:] == 'txt'

        self.train_path = train_path
        self.val_path = val_path
        self.batch_first = batch_first
        self.include_lingths = include_lingths
        self.batch_size = batch_size
        self.train_path = train_path
        self.val_path = val_path

        self.BOS_TOKEN = '<s>'
        self.EOS_TOKEN = '</s>'
        if init_token:
            self.source_field = Field(tokenize='spacy', init_token=self.BOS_TOKEN, eos_token=self.EOS_TOKEN,
                     lower=True, include_lengths=self.include_lingths, batch_first=self.batch_first)
        else:
            self.source_field = Field(tokenize='spacy', init_token=None, eos_token=self.EOS_TOKEN,
                     lower=True, include_lengths=self.include_lingths, batch_first=self.batch_first)

        self.target_field = Field(tokenize='spacy', init_token=self.BOS_TOKEN, eos_token=self.EOS_TOKEN,
                             lower=True, batch_first=self.batch_first)
        self.fields = [('source', self.source_field), ('target', self.target_field)]

    
    def read_data(self, path, total):
        MAX_TOKENS_COUNT = 23
        examples = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, total=total):
                line_dict = eval(line)
                source_text = self.source_field.preprocess(line_dict['src'])
                target_text = self.target_field.preprocess(line_dict['tgt'])
                if len(source_text) <= MAX_TOKENS_COUNT and len(target_text) <= MAX_TOKENS_COUNT:
                    examples.append(Example.fromlist([source_text, target_text], self.fields))
        return examples

    def start(self):
        train_examples = self.read_data(path=self.train_path, total=34304)
        val_examples = self.read_data(path=self.val_path, total=4384)

        train_dataset = Dataset(train_examples, self.fields)
        test_dataset = Dataset(val_examples, self.fields)

        print('Train size =', len(train_dataset))
        print('Test size =', len(test_dataset))
        print('Example data = ', vars(train_dataset.examples[0]))

        self.source_field.build_vocab(train_dataset, min_freq=2)
        print('Source vocab size =', len(self.source_field.vocab))

        self.target_field.build_vocab(train_dataset, min_freq=2)
        print('Target vocab size =', len(self.target_field.vocab))

        train_iter, test_iter = BucketIterator.splits(
            datasets=(train_dataset, test_dataset),
            batch_sizes=(self.batch_size, self.batch_size),
            sort_within_batch = True,
            sort_key = lambda x : len(x.source),
            device=DEVICE,
        )
        return train_iter, test_iter, self.source_field, self.target_field

In [None]:
train_path = 'seq2seq/data/train.txt'
val_path = 'seq2seq/data/dev.txt'
prepare = PrepareData(train_path, val_path, batch_size=32)
train_iter, test_iter, source_field, target_field = prepare.start()

100%|██████████| 34304/34304 [00:07<00:00, 4502.94it/s]
100%|██████████| 4384/4384 [00:01<00:00, 4207.19it/s]


Train size = 25146
Test size = 3015
Source vocab size = 7567
Target vocab size = 7509


# seq2seq with DotAttention

Encoder должен быть подобен символьной сеточке в POS tagging'е: эмбеддить токены и запускать rnn'ку (в данном случае будем пользоваться GRU) и отдавать последнее скрытое состояние.

Decoder почти такой же, только еще и предсказывает токены на каждом своем шаге.

Идея Attention (механизма внимания) запоминать все скрытые состояния encoder, а не только последний вектор. Дальше, для вычисления нового слова при генерации найдем сначала представление уже сгенерированного контекста (по которому обычно и генерируется следующее слово).

По этому представлению посчитаем оценки полезности состояний энкодера: attention weights. Чем выше вес - тем более полезно состояние. (Можно, кстати, представлять, что в предыдущем варианте мы просто давали всем состояниям кроме последнего вес 0, а последнему - 1).

С этими весами состояния энкодера суммируются, и мы получаем взвешенный вектор-представление контекста. Опять вектор?! Но теперь этот вектор получен для конкретного генерируемого слова - это же гораздо лучше, чем пытаться сделать один вектор сразу для всех генерируемых слов. 

From  [Отличный курс по NLP](https://github.com/DanAnastasyev/DeepNLP-Course)


In [None]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, rnn_hidden_dim=256, num_layers=1):
        super().__init__()

        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=rnn_hidden_dim, 
                           num_layers=num_layers)
        
        self.dropout = nn.Dropout(0.4)
        

    def forward(self, inputs, src_len, hidden=None):
        # input = [encoder_seq_len, batch_size]
        # hidden = [1, batch_size, rnn_hidden_dim]
        emb = self.emb(inputs)
        # emb = self.dropout(emb)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(emb, src_len)
        packed_outputs, encoder_hidden = self.rnn(packed_embedded)
        encoder_output, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
        return encoder_output, encoder_hidden

В общем случае, attention работает так: пусть у нас есть набор скрытых состояний $\mathbf{s}_1, \ldots, \mathbf{s}_m$ - представлений слов из исходного языка, полученных с помощью энкодера. И есть некоторое текущее скрытое состояние $\mathbf{h}_i$ - скажем, представление, используемое для предсказания слова на нужном нам языке.

Тогда с помощью аттеншена мы можем получить взвешенное представление контекста $\mathbf{s}_1, \ldots, \mathbf{s}_m$ - вектор $\mathbf{c}_i$:
$$
\begin{align}\begin{split}
\mathbf{c}_i &= \sum\limits_j a_{ij}\mathbf{s}_j\\
\mathbf{a}_{ij} &= \text{softmax}(f_{att}(\mathbf{h}_i, \mathbf{s}_j))
\end{split}\end{align}
$$

$f_{att}$ - функция, которая говорит, насколько хорошо $\mathbf{h}_i$ и $\mathbf{s}_j$ подходят друг другу.

Я рассматривал только dotattention, чтобы потом было проще с transformer

- Dot attention:
$$f_{att}(\mathbf{h}_i, \mathbf{s}_j) = \mathbf{h}_i^\top \mathbf{s}_j$$

In [None]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, rnn_hidden_dim=256, attn_dim=128, num_layers=1):
        super().__init__()

        self.fc_0 = nn.Linear(2 * rnn_hidden_dim, rnn_hidden_dim)

        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim + rnn_hidden_dim, hidden_size=rnn_hidden_dim, num_layers=num_layers)
        self.out = nn.Linear(rnn_hidden_dim, vocab_size)

        self.dropout = nn.Dropout(0.4)

    @staticmethod
    def dot_attention(query, key, value, mask):
        # query = [batch_size, query_size]
        # key = [encoder_seq_len, batch_size, key_size]
        # value = [encoder_seq_len, batch_size, key_size]
        # mask = [encoder_seq_len, batch_size]
        
        f_att = torch.matmul(query, key.transpose(-2, -1))
        f_att.data.masked_fill_(mask.unsqueeze(-2), -float('inf'))
        weights = F.softmax(f_att, -1)
        output = torch.matmul(weights, value)
        return output.sum(0), weights

    def forward(self, inputs, encoder_output, encoder_mask, encoder_hidden, output=None, hidden=None):
        # inputs = [decoder_seq_len, batch_size]
        # encoder_output = [encoder_seq_len, batch_size, rnn_hidden_dim]
        # encoder_mask = [encoder_seq_len, batch_size]
        # hidden = [1, batch_size, rnn_hidden_dim]
        if output is None:
            output = self.fc_0(encoder_output)

        embs = self.emb(inputs)
        # embs = self.dropout(embs)
        outputs, attentions = [], []

        for i in range(embs.shape[0]):
            context, weights = self.dot_attention(query=hidden, key=output, value=output, mask=encoder_mask)
            context = context.unsqueeze(0)
            rnn_input = torch.cat((embs[i: i+1], context), -1)
            out, hidden = self.rnn(rnn_input, hidden)

            outputs.append(out)
            attentions.append(weights)

        out = torch.cat(outputs)
        attention = torch.cat(attentions)
        return self.out(out), hidden, attention

In [None]:
class Seq2seq_with_attention(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, emb_dim=64, rnn_hidden_dim=128, 
                 attn_dim=128, num_layers=1):

        super().__init__()
        self.encoder = Encoder(source_vocab_size, emb_dim, rnn_hidden_dim, num_layers)
        self.decoder = Decoder(target_vocab_size, emb_dim, rnn_hidden_dim, attn_dim, num_layers)

    def forward(self, source_inputs, source_len, target_inputs):
        encoder_mask = source_inputs == 1.  # добавим маску, чтобы игнорировать паддинги
        encoder_output, encoder_hidden = self.encoder(source_inputs, source_len)

        output = encoder_output
        hidden = encoder_hidden

        return self.decoder(target_inputs, encoder_output, encoder_mask, encoder_hidden, output, hidden)

In [None]:
from seq2seq.utils.utilsSeq2seq import *

model = Seq2seq_with_attention(source_vocab_size=len(source_field.vocab),
                               target_vocab_size=len(target_field.vocab)).to(DEVICE)

pad_idx = target_field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE)

optimizer = optim.Adam(model.parameters())

fit(model, criterion, optimizer, train_iter, epochs_count=35, val_iter=test_iter)

[1 / 35] Train: Loss = 5.35208, PPX = 211.05: 100%|██████████| 786/786 [00:22<00:00, 35.27it/s]
[1 / 35]   Val: Loss = 4.72202, PPX = 112.40: 100%|██████████| 95/95 [00:01<00:00, 76.79it/s]
[2 / 35] Train: Loss = 5.03307, PPX = 153.40:   1%|          | 4/786 [00:00<00:43, 17.82it/s]


Val BLEU = 1.71


[2 / 35] Train: Loss = 4.44834, PPX = 85.48: 100%|██████████| 786/786 [00:22<00:00, 35.36it/s]
[2 / 35]   Val: Loss = 4.27227, PPX = 71.68: 100%|██████████| 95/95 [00:01<00:00, 76.91it/s]
[3 / 35] Train: Loss = 2.14084, PPX = 8.51:   1%|          | 6/786 [00:00<00:26, 29.01it/s] 


Val BLEU = 3.27


[3 / 35] Train: Loss = 4.00010, PPX = 54.60: 100%|██████████| 786/786 [00:22<00:00, 34.73it/s]
[3 / 35]   Val: Loss = 3.98248, PPX = 53.65: 100%|██████████| 95/95 [00:01<00:00, 78.73it/s]
[4 / 35] Train: Loss = 3.90705, PPX = 49.75:   1%|          | 5/786 [00:00<00:33, 23.43it/s]


Val BLEU = 5.21


[4 / 35] Train: Loss = 3.66598, PPX = 39.09: 100%|██████████| 786/786 [00:22<00:00, 35.40it/s]
[4 / 35]   Val: Loss = 3.77628, PPX = 43.65: 100%|██████████| 95/95 [00:01<00:00, 78.85it/s]
[5 / 35] Train: Loss = 4.21766, PPX = 67.87:   1%|          | 5/786 [00:00<00:42, 18.27it/s]


Val BLEU = 6.92


[5 / 35] Train: Loss = 3.37778, PPX = 29.31: 100%|██████████| 786/786 [00:22<00:00, 35.53it/s]
[5 / 35]   Val: Loss = 3.59075, PPX = 36.26: 100%|██████████| 95/95 [00:01<00:00, 79.29it/s]
[6 / 35] Train: Loss = 3.54573, PPX = 34.66:   1%|          | 5/786 [00:00<00:40, 19.14it/s]


Val BLEU = 8.94


[6 / 35] Train: Loss = 3.13372, PPX = 22.96: 100%|██████████| 786/786 [00:21<00:00, 35.84it/s]
[6 / 35]   Val: Loss = 3.43560, PPX = 31.05: 100%|██████████| 95/95 [00:01<00:00, 78.35it/s]
[7 / 35] Train: Loss = 2.41349, PPX = 11.17:   1%|          | 6/786 [00:00<00:28, 26.94it/s]


Val BLEU = 11.03


[7 / 35] Train: Loss = 2.92075, PPX = 18.56: 100%|██████████| 786/786 [00:22<00:00, 35.64it/s]
[7 / 35]   Val: Loss = 3.33264, PPX = 28.01: 100%|██████████| 95/95 [00:01<00:00, 77.84it/s]
[8 / 35] Train: Loss = 2.38917, PPX = 10.90:   1%|          | 6/786 [00:00<00:29, 26.06it/s]


Val BLEU = 12.08


[8 / 35] Train: Loss = 2.74058, PPX = 15.50: 100%|██████████| 786/786 [00:21<00:00, 35.82it/s]
[8 / 35]   Val: Loss = 3.24672, PPX = 25.71: 100%|██████████| 95/95 [00:01<00:00, 80.11it/s]
[9 / 35] Train: Loss = 2.82093, PPX = 16.79:   1%|          | 5/786 [00:00<00:29, 26.61it/s]


Val BLEU = 13.26


[9 / 35] Train: Loss = 2.58002, PPX = 13.20: 100%|██████████| 786/786 [00:22<00:00, 35.63it/s]
[9 / 35]   Val: Loss = 3.19319, PPX = 24.37: 100%|██████████| 95/95 [00:01<00:00, 80.28it/s]
[10 / 35] Train: Loss = 1.49619, PPX = 4.46:   1%|          | 6/786 [00:00<00:28, 27.19it/s] 


Val BLEU = 13.86


[10 / 35] Train: Loss = 2.43808, PPX = 11.45: 100%|██████████| 786/786 [00:21<00:00, 35.87it/s]
[10 / 35]   Val: Loss = 3.12165, PPX = 22.68: 100%|██████████| 95/95 [00:01<00:00, 77.90it/s]
[11 / 35] Train: Loss = 1.51901, PPX = 4.57:   1%|          | 7/786 [00:00<00:24, 31.65it/s] 


Val BLEU = 14.58


[11 / 35] Train: Loss = 2.31174, PPX = 10.09: 100%|██████████| 786/786 [00:21<00:00, 36.04it/s]
[11 / 35]   Val: Loss = 3.08245, PPX = 21.81: 100%|██████████| 95/95 [00:01<00:00, 80.26it/s]
[12 / 35] Train: Loss = 2.32432, PPX = 10.22:   1%|          | 5/786 [00:00<00:29, 26.75it/s]


Val BLEU = 14.99


[12 / 35] Train: Loss = 2.19724, PPX = 9.00: 100%|██████████| 786/786 [00:22<00:00, 34.69it/s]
[12 / 35]   Val: Loss = 3.04814, PPX = 21.08: 100%|██████████| 95/95 [00:01<00:00, 75.41it/s]
[13 / 35] Train: Loss = 2.44004, PPX = 11.47:   1%|          | 6/786 [00:00<00:26, 29.97it/s]


Val BLEU = 15.30


[13 / 35] Train: Loss = 2.09673, PPX = 8.14: 100%|██████████| 786/786 [00:22<00:00, 34.95it/s]
[13 / 35]   Val: Loss = 3.03157, PPX = 20.73: 100%|██████████| 95/95 [00:01<00:00, 78.61it/s]
[14 / 35] Train: Loss = 0.56481, PPX = 1.76:   1%|          | 5/786 [00:00<00:40, 19.36it/s]


Val BLEU = 15.66


[14 / 35] Train: Loss = 2.00494, PPX = 7.43: 100%|██████████| 786/786 [00:21<00:00, 35.73it/s]
[14 / 35]   Val: Loss = 2.99231, PPX = 19.93: 100%|██████████| 95/95 [00:01<00:00, 82.27it/s]
[15 / 35] Train: Loss = 2.84585, PPX = 17.22:   1%|          | 5/786 [00:00<00:31, 24.89it/s]


Val BLEU = 16.24


[15 / 35] Train: Loss = 1.91949, PPX = 6.82: 100%|██████████| 786/786 [00:21<00:00, 35.95it/s]
[15 / 35]   Val: Loss = 2.96873, PPX = 19.47: 100%|██████████| 95/95 [00:01<00:00, 77.52it/s]
[16 / 35] Train: Loss = 1.84557, PPX = 6.33:   1%|          | 4/786 [00:00<00:41, 18.92it/s] 


Val BLEU = 16.27


[16 / 35] Train: Loss = 1.84555, PPX = 6.33: 100%|██████████| 786/786 [00:21<00:00, 35.77it/s]
[16 / 35]   Val: Loss = 2.96463, PPX = 19.39: 100%|██████████| 95/95 [00:01<00:00, 77.93it/s]
[17 / 35] Train: Loss = 1.69975, PPX = 5.47:   1%|          | 6/786 [00:00<00:25, 30.09it/s]


Val BLEU = 16.61


[17 / 35] Train: Loss = 1.77380, PPX = 5.89: 100%|██████████| 786/786 [00:22<00:00, 35.72it/s]
[17 / 35]   Val: Loss = 2.94289, PPX = 18.97: 100%|██████████| 95/95 [00:01<00:00, 79.20it/s]
[18 / 35] Train: Loss = 1.89265, PPX = 6.64:   1%|          | 6/786 [00:00<00:28, 27.81it/s] 


Val BLEU = 16.94


[18 / 35] Train: Loss = 1.71137, PPX = 5.54: 100%|██████████| 786/786 [00:22<00:00, 35.46it/s]
[18 / 35]   Val: Loss = 2.93495, PPX = 18.82: 100%|██████████| 95/95 [00:01<00:00, 79.93it/s]
[19 / 35] Train: Loss = 0.22658, PPX = 1.25:   1%|          | 6/786 [00:00<00:40, 19.27it/s]


Val BLEU = 17.13


[19 / 35] Train: Loss = 1.65205, PPX = 5.22: 100%|██████████| 786/786 [00:22<00:00, 34.56it/s]
[19 / 35]   Val: Loss = 2.92817, PPX = 18.69: 100%|██████████| 95/95 [00:01<00:00, 79.08it/s]
[20 / 35] Train: Loss = 1.83132, PPX = 6.24:   1%|          | 6/786 [00:00<00:28, 27.13it/s]


Val BLEU = 17.37


[20 / 35] Train: Loss = 1.59605, PPX = 4.93: 100%|██████████| 786/786 [00:22<00:00, 34.85it/s]
[20 / 35]   Val: Loss = 2.93279, PPX = 18.78: 100%|██████████| 95/95 [00:01<00:00, 78.59it/s]
[21 / 35] Train: Loss = 0.33112, PPX = 1.39:   1%|          | 5/786 [00:00<00:34, 22.71it/s] 


Val BLEU = 16.85


[21 / 35] Train: Loss = 1.54491, PPX = 4.69: 100%|██████████| 786/786 [00:21<00:00, 36.11it/s]
[21 / 35]   Val: Loss = 2.92541, PPX = 18.64: 100%|██████████| 95/95 [00:01<00:00, 78.77it/s]
[22 / 35] Train: Loss = 1.48965, PPX = 4.44:   1%|          | 4/786 [00:00<00:34, 22.53it/s]


Val BLEU = 17.28


[22 / 35] Train: Loss = 1.49624, PPX = 4.46: 100%|██████████| 786/786 [00:22<00:00, 35.13it/s]
[22 / 35]   Val: Loss = 2.92588, PPX = 18.65: 100%|██████████| 95/95 [00:01<00:00, 77.22it/s]
[23 / 35] Train: Loss = 1.13635, PPX = 3.12:   1%|          | 4/786 [00:00<00:44, 17.70it/s] 


Val BLEU = 17.51


[23 / 35] Train: Loss = 1.45118, PPX = 4.27: 100%|██████████| 786/786 [00:22<00:00, 34.30it/s]
[23 / 35]   Val: Loss = 2.92855, PPX = 18.70: 100%|██████████| 95/95 [00:01<00:00, 75.77it/s]
[24 / 35] Train: Loss = 1.94233, PPX = 6.98:   1%|          | 5/786 [00:00<00:30, 25.32it/s]


Val BLEU = 17.48


[24 / 35] Train: Loss = 1.40751, PPX = 4.09: 100%|██████████| 786/786 [00:22<00:00, 34.76it/s]
[24 / 35]   Val: Loss = 2.91964, PPX = 18.53: 100%|██████████| 95/95 [00:01<00:00, 77.46it/s]
[25 / 35] Train: Loss = 1.37768, PPX = 3.97:   1%|          | 4/786 [00:00<00:41, 18.93it/s] 


Val BLEU = 17.75


[25 / 35] Train: Loss = 1.36973, PPX = 3.93: 100%|██████████| 786/786 [00:22<00:00, 34.56it/s]
[25 / 35]   Val: Loss = 2.92584, PPX = 18.65: 100%|██████████| 95/95 [00:01<00:00, 74.44it/s]
[26 / 35] Train: Loss = 2.89522, PPX = 18.09:   1%|          | 5/786 [00:00<00:32, 23.93it/s]


Val BLEU = 17.74


[26 / 35] Train: Loss = 1.33277, PPX = 3.79: 100%|██████████| 786/786 [00:23<00:00, 33.17it/s]
[26 / 35]   Val: Loss = 2.93439, PPX = 18.81: 100%|██████████| 95/95 [00:01<00:00, 75.15it/s]
[27 / 35] Train: Loss = 1.86213, PPX = 6.44:   1%|          | 4/786 [00:00<00:43, 18.17it/s]


Val BLEU = 18.06


[27 / 35] Train: Loss = 1.29702, PPX = 3.66: 100%|██████████| 786/786 [00:23<00:00, 33.04it/s]
[27 / 35]   Val: Loss = 2.94069, PPX = 18.93: 100%|██████████| 95/95 [00:01<00:00, 74.98it/s]
[28 / 35] Train: Loss = 0.98051, PPX = 2.67:   1%|          | 4/786 [00:00<00:41, 18.79it/s]


Val BLEU = 17.60


[28 / 35] Train: Loss = 1.26560, PPX = 3.55: 100%|██████████| 786/786 [00:23<00:00, 34.02it/s]
[28 / 35]   Val: Loss = 2.96412, PPX = 19.38: 100%|██████████| 95/95 [00:01<00:00, 78.26it/s]
[29 / 35] Train: Loss = 1.46460, PPX = 4.33:   1%|          | 5/786 [00:00<00:33, 23.39it/s]


Val BLEU = 17.74


[29 / 35] Train: Loss = 1.23461, PPX = 3.44: 100%|██████████| 786/786 [00:22<00:00, 35.06it/s]
[29 / 35]   Val: Loss = 2.96534, PPX = 19.40: 100%|██████████| 95/95 [00:01<00:00, 77.50it/s]
[30 / 35] Train: Loss = 0.94557, PPX = 2.57:   1%|          | 5/786 [00:00<00:34, 22.94it/s]


Val BLEU = 17.67


[30 / 35] Train: Loss = 1.20557, PPX = 3.34: 100%|██████████| 786/786 [00:22<00:00, 34.97it/s]
[30 / 35]   Val: Loss = 2.96851, PPX = 19.46: 100%|██████████| 95/95 [00:01<00:00, 77.98it/s]
[31 / 35] Train: Loss = 0.17147, PPX = 1.19:   1%|          | 5/786 [00:00<00:35, 21.71it/s]


Val BLEU = 18.31


[31 / 35] Train: Loss = 1.17584, PPX = 3.24: 100%|██████████| 786/786 [00:22<00:00, 35.16it/s]
[31 / 35]   Val: Loss = 2.98702, PPX = 19.83: 100%|██████████| 95/95 [00:01<00:00, 79.10it/s]
[32 / 35] Train: Loss = 0.85934, PPX = 2.36:   1%|          | 6/786 [00:00<00:28, 27.36it/s]


Val BLEU = 17.94


[32 / 35] Train: Loss = 1.15108, PPX = 3.16: 100%|██████████| 786/786 [00:22<00:00, 35.12it/s]
[32 / 35]   Val: Loss = 2.99040, PPX = 19.89: 100%|██████████| 95/95 [00:01<00:00, 77.34it/s]
[33 / 35] Train: Loss = 1.63594, PPX = 5.13:   1%|          | 4/786 [00:00<00:42, 18.27it/s]


Val BLEU = 17.97


[33 / 35] Train: Loss = 1.12558, PPX = 3.08: 100%|██████████| 786/786 [00:21<00:00, 35.76it/s]
[33 / 35]   Val: Loss = 2.99083, PPX = 19.90: 100%|██████████| 95/95 [00:01<00:00, 80.52it/s]
[34 / 35] Train: Loss = 0.76019, PPX = 2.14:   1%|          | 6/786 [00:00<00:26, 29.97it/s]


Val BLEU = 18.18


[34 / 35] Train: Loss = 1.10189, PPX = 3.01: 100%|██████████| 786/786 [00:21<00:00, 35.81it/s]
[34 / 35]   Val: Loss = 3.01267, PPX = 20.34: 100%|██████████| 95/95 [00:01<00:00, 77.82it/s]
[35 / 35] Train: Loss = 0.81718, PPX = 2.26:   1%|          | 4/786 [00:00<00:40, 19.45it/s]


Val BLEU = 18.10


[35 / 35] Train: Loss = 1.08140, PPX = 2.95: 100%|██████████| 786/786 [00:21<00:00, 35.83it/s]
[35 / 35]   Val: Loss = 3.03691, PPX = 20.84: 100%|██████████| 95/95 [00:01<00:00, 79.38it/s]



Val BLEU = 17.83


## Про метрики

**PPX** (перплексия) - в литературе, это измерение того, насколько хорошо языковая модель предсказывает выборку. По факту:
$$2^{H(p)} = 2^{\sum_x p(x)\log_2p(x)}$$

**BLEU**  - рассматривает совпадение предсказанных и фактических целевых последовательностей с 
точки зрения n-gramm.

## Inference

In [None]:
def gec(model, source_text, source_field, target_field):
    bos_index = target_field.vocab.stoi['<s>']
    eos_index = target_field.vocab.stoi['</s>']
    
    model.eval()
    with torch.no_grad():
        result = []
        source = source_field.preprocess(source_text)
        inputs = source_field.process([source])
        len_inputs = inputs[1].to(DEVICE)
        input = inputs[0].to(DEVICE)
        
        encoder_output, encoder_hidden = model.encoder(input, len_inputs)
        encoder_mask = torch.zeros_like(input).byte()
        
        output = encoder_output
        hidden = encoder_hidden
        step = LongTensor([[bos_index]])
        
        for _ in range(50):
            step, hidden, _ = model.decoder(step, encoder_output, encoder_mask, encoder_hidden, output, hidden)
            step = step.argmax(-1)
          
            if step.squeeze().item() == eos_index:
                break
            
            result.append(step.item())   
        result = [target_field.vocab.itos[ind] for ind in result]
        return result

In [None]:
text = 'The rich people will buy a car but the poor people always need to use a bus or taxi.'
result = gec(model, text, source_field, target_field)
print(f'Исходная запись: {text}')
print(f'Предсказание: {" ".join(result)}')
print(f'Должно быть: Rich people will buy a car , but poor people always need to use a bus or taxi .')

Исходная запись: The rich people will buy a car but the poor people always need to use a bus or taxi.
Предсказание: people will expensive people every colour because the people will not even change .
Должно быть: Rich people will buy a car , but poor people always need to use a bus or taxi .


Немного переобучил модель, но все равно смешно:

In [None]:
text = 'The rich people will buy a car but the poor people always need to use a bus or taxi.'
result = gec(model, text, source_field, target_field)
print(f'Исходная запись: {text}')
print(f'Предсказание: {" ".join(result)}')
print(f'Должно быть: Rich people will buy a car , but poor people always need to use a bus or taxi .')

Исходная запись: The rich people will buy a car but the poor people always need to use a bus or taxi.
Предсказание: the people should buy people expensive people , the people will buy people expensive people to buy a car , people will change the people need a car .
Должно быть: Rich people will buy a car , but poor people always need to use a bus or taxi .


![эх](https://i.ytimg.com/vi/sruJfntXMYE/hqdefault.jpg)

## Почему так плохо?

Варианты:
1. Я использовал teacher forcing (в качестве выхода на предыдущем шаге декодер принимал всегда правильный токен). Модель короче на хороших входах, а использоваться будет скорее всего на плохом
2. Я реализовал жадный перевод. на каждом шаге предсказывал наиболее вероятный токен, можно докинуть beam search
3. Не смог подкрутить двунаправленность 
4. Ну камон, какая seq2seq с rnn ? Надо чет мощнее.



# Transformer

![transformer](https://hsto.org/webt/59/f0/44/59f04410c0e56192990801.png)  
*From Attention is all you need*

## Данные

In [4]:
train_path = 'seq2seq/data/train.txt'
val_path = 'seq2seq/data/dev.txt'
prepare = PrepareData(train_path, val_path, batch_size=32,
                      batch_first=True, include_lingths=False,
                      init_token=True)
train_iter, test_iter, source_field, target_field = prepare.start()

100%|██████████| 34304/34304 [00:07<00:00, 4648.15it/s]
100%|██████████| 4384/4384 [00:00<00:00, 4403.39it/s]


Train size = 25146
Test size = 3015
Example data =  {'source': ['my', 'town', 'is', 'a', 'medium', 'size', 'city', 'with', 'eighty', 'thousand', 'inhabitants', '.'], 'target': ['my', 'town', 'is', 'a', 'medium', '-', 'sized', 'city', 'with', 'eighty', 'thousand', 'inhabitants', '.']}
Source vocab size = 7568
Target vocab size = 7509


## Модель

Весь Transformer опирается на идею self-attention. Выглядит это так:

![](http://jalammar.github.io/images/t/transformer_self-attention_visualization.png)  
*From [Tensor2Tensor Tutorial](https://colab.research.google.com/github/tensorflow/tensor2tensor/blob/master/tensor2tensor/notebooks/hello_t2t.ipynb)*

Эмбеддинг слова *it* строится как комбинация всех эмбеддингов предложения.

В статье придумали делать такой аттеншен:

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

Это примерно как dot-attention: запрос (**Q**uery) умножается на ключи (**K**ey) скалярно, затем берется софтмакс - получаются оценки того, насколько интересны разные таймстемпы из значений (**V**alue). 

Например, $\mathrm{emb}(\text{it}) = \mathrm{Attention}(\text{it}, \ldots\text{because it was too tired}, \ldots\text{because it was too tired})$.

Только теперь ещё с параметром $\frac{1}{\sqrt{d_k}}$, где $d_k$ - это размерность ключа. Утверждается, это работает лучше при больших размерностях ключа $d_k$.

![](https://hsto.org/webt/59/f0/44/59f0440f1109b864893781.png)

Важная идея, почему attention (и, главное, self-attention) заработал - использование нескольких голов (multi-head).

Вообще, когда мы делаем attention - мы определяем похожесть ключа и запроса. Многоголовость помогает (должна) определять эту похожесть по разным критериям - синтаксически, семантически и т.д.

Применяется это таким образом:

$$\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head_1}, ...,
\mathrm{head_h})W^O    \\
    \mathrm{head_i} = \mathrm{Attention}(QW^Q_i, KW^K_i, VW^V_i)$$
    
где $W^Q_i \in \mathbb{R}^{d_{model} \times d_k}, W_i^K \in \mathbb{R}^{d_{model} \times d_k}, W^V_i \in \mathbb{R}^{d_{model} \times d_v}, W^O \in \mathbb{R}^{hd_v \times d_{model}}$.

В оригинальной статье использовали $h=8$, $d_k=d_v=d_{\text{model}}/h=64$.

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hid_dim,  n_heads, dropout, device):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, q, k, v, mask=None):
        #q = [batch size, query len, hid dim]
        #k = [batch size, key len, hid dim]
        #v = [batch size, value len, hid dim]
        batch_size = q.shape[0]

        q = self.fc_q(q)
        k = self.fc_k(k)
        v = self.fc_v(v)

        q = q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        #q = [batch size, n heads, query len, head dim]
        #k = [batch size, n heads, key len, head dim]
        #v = [batch size, n heads, value len, head dim]

        dot_product = torch.matmul(q, k.permute(0, 1, 3, 2)) / self.scale
        #dot_product = [batch size, n heads, query len, key len]

        attention = torch.softmax(dot_product, dim=-1)

        x = torch.matmul(self.dropout(attention), v)
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)
        #x = [batch size, query len, hid dim]
        return x, attention

## Encoder

In [6]:
class FeedForwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        #x = [batch size, seq len, hid dim]
        x = self.fc_1(x)
        x = self.dropout(self.relu(x))

        return self.fc_2(x)

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()

        self.layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.feedforward = FeedForwardLayer(hid_dim, pf_dim, dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, src len]

        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.layer_norm(src + self.dropout(_src))

        _src = self.feedforward(src)
        src = self.layer_norm(src + self.dropout(_src))

        return src

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads,
                 pf_dim, dropout, device, max_length = 100):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

## Decoder

In [9]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttention(hid_dim, n_heads, dropout, device)
        self.feedforward = FeedForwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
        
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        trg = self.layer_norm(trg + self.dropout(_trg))
                        
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.layer_norm(trg + self.dropout(_trg))
                    
        _trg = self.feedforward(trg)
        
        trg = self.layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [10]:
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

## Полная модель

In [11]:
class Transformer(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool() # tril - возвращет треугольную часть матрицы
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [12]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

## Обучение

In [15]:
from seq2seq.utils.utilsTransformer import *

input_dim = len(source_field.vocab)
output_dim = len(target_field.vocab)
hiiden_dim = 256
encoder_layers = 3
decoder_layers = 3
encoder_heads = 8
decoder_heads = 8
encoder_pf_dim = 512
decoder_pf_dim = 512
encoder_dropout = 0.1
decoder_dropout = 0.1

enc = Encoder(input_dim, hiiden_dim, encoder_layers,
              encoder_heads, encoder_pf_dim, encoder_dropout,
              DEVICE)

dec = Decoder(output_dim, hiiden_dim, decoder_layers,
              decoder_heads, decoder_pf_dim, decoder_dropout,
              DEVICE)

SRC_PAD_IDX = source_field.vocab.stoi[source_field.pad_token]
TRG_PAD_IDX = target_field.vocab.stoi[target_field.pad_token]

model = Transformer(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)

model.apply(initialize_weights);

pad_idx = target_field.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.0005)

out = fit(model, criterion, optimizer, train_iter, epochs_count=5, val_iter=test_iter)

[1 / 5] Train: Loss = 3.03333, PPX = 20.77: 100%|██████████| 786/786 [00:31<00:00, 24.97it/s]
[1 / 5]   Val: Loss = 0.81273, PPX = 2.25: 100%|██████████| 95/95 [00:01<00:00, 93.40it/s]
[2 / 5] Train: Loss = 0.60465, PPX = 1.83: 100%|██████████| 786/786 [00:30<00:00, 26.09it/s]
[2 / 5]   Val: Loss = 0.27219, PPX = 1.31: 100%|██████████| 95/95 [00:01<00:00, 91.68it/s]
[3 / 5] Train: Loss = 0.27155, PPX = 1.31: 100%|██████████| 786/786 [00:29<00:00, 26.26it/s]
[3 / 5]   Val: Loss = 0.12393, PPX = 1.13: 100%|██████████| 95/95 [00:00<00:00, 97.10it/s]
[4 / 5] Train: Loss = 0.13670, PPX = 1.15: 100%|██████████| 786/786 [00:30<00:00, 26.05it/s]
[4 / 5]   Val: Loss = 0.05904, PPX = 1.06: 100%|██████████| 95/95 [00:00<00:00, 95.32it/s]
[5 / 5] Train: Loss = 0.07777, PPX = 1.08: 100%|██████████| 786/786 [00:29<00:00, 26.45it/s]
[5 / 5]   Val: Loss = 0.03317, PPX = 1.03: 100%|██████████| 95/95 [00:01<00:00, 91.94it/s]


In [16]:
print(out)

tensor([[ 7.4964, -0.8494, -0.7168,  ..., -0.0224,  0.3387, -1.6041],
        [ 3.1933, -0.6607, -0.2058,  ..., -1.6023,  0.3649, -1.1363],
        [ 2.2783, -1.2862, -1.2607,  ..., -4.7473, -4.2382, -2.4925],
        ...,
        [ 7.1153, -0.5686,  0.1196,  ..., -0.4088, -2.7209, -0.6260],
        [ 3.0427, -1.6337, -1.7286,  ...,  0.9973, -1.2783,  0.1864],
        [ 4.6235, -2.2453, -2.8410,  ...,  0.8970,  1.1595,  0.3482]],
       device='cuda:0')


In [19]:
def gec_transformer(model, sentence, source_field, target_field):
    bos_index = target_field.vocab.stoi['<s>']
    eos_index = target_field.vocab.stoi['</s>']

    model.eval()
    with torch.no_grad():
        result, attention = [], []
        source = source_field.preprocess(sentence)
        inputs = source_field.process([source]).to(DEVICE)

        src_mask = model.make_src_mask(inputs)
        enc_src = model.encoder(inputs, src_mask)

        step = [bos_index]

        for _ in range(50):
            step = LongTensor(step).unsqueeze(0).to(DEVICE)
            trg_mask = model.make_trg_mask(step)
            step, _ = model.decoder(step, enc_src, trg_mask, src_mask)
            step = step.argmax(2)[:,-1].item()
            result.append(step)

            if step == eos_index:
                break

        
        result = [target_field.vocab.itos[ind] for ind in result]
        return result

In [20]:
text = 'The rich people will buy a car but the poor people always need to use a bus or taxi.'
result = gec_transformer(model, text, source_field, target_field)
print(f'Исходная запись: {text}')
print(f'Предсказание: {" ".join(result)}')
print(f'Должно быть: Rich people will buy a car , but poor people always need to use a bus or taxi .')

RuntimeError: ignored