In [1]:
# Yernar Shambayev, DL-2
# Реализуйте задачу машинного перевода с использованием transformer. Датасет: 
# http://www.manythings.org/anki/

import torch
from torch import nn
from torch.utils.data import SubsetRandomSampler
import numpy as np
import re

In [2]:
# Возьмем англо-белорусский корпус

with open("bel.txt") as f:
    sentences = f.readlines()
print(f'Всего предложений: {len(sentences)}')

Всего предложений: 3812


In [3]:
# Препроцессинг

MAX_NUM = 10000
MAX_SENT_LEN = 10
eng_sentences, bel_sentences = [], []
eng_words, bel_words = set(), set()

for i in range(MAX_NUM):
    rand_idx = np.random.randint(len(sentences))

    eng_sent, bel_sent = ["<sos>"], ["<sos>"]
    eng_sent += re.findall(r"\w+", sentences[rand_idx].split("\t")[0]) 
    bel_sent += re.findall(r"\w+", sentences[rand_idx].split("\t")[1])

    eng_sent = [x.lower() for x in eng_sent]
    bel_sent = [x.lower() for x in bel_sent]
    eng_sent.append("<eos>")
    bel_sent.append("<eos>")

    if len(eng_sent) >= MAX_SENT_LEN:
        eng_sent = eng_sent[:MAX_SENT_LEN]
    else:
        for _ in range(MAX_SENT_LEN - len(eng_sent)):
            eng_sent.append("<pad>")

    if len(bel_sent) >= MAX_SENT_LEN:
        bel_sent = bel_sent[:MAX_SENT_LEN]
    else:
        for _ in range(MAX_SENT_LEN - len(bel_sent)):
            bel_sent.append("<pad>")

    eng_sentences.append(eng_sent)
    bel_sentences.append(bel_sent)

    eng_words.update(eng_sent)
    bel_words.update(bel_sent)

eng_words, bel_words = list(eng_words), list(bel_words)

for i in range(len(eng_sentences)):
    eng_sentences[i] = [eng_words.index(x) for x in eng_sentences[i]]
    bel_sentences[i] = [bel_words.index(x) for x in bel_sentences[i]]

for i in range(5):
    print(eng_sentences[i])
    print([eng_words[x] for x in eng_sentences[i]])
    print(bel_sentences[i])
    print([bel_words[x] for x in bel_sentences[i]])
    print()

[1921, 80, 1640, 857, 2206, 788, 1946, 1037, 1037, 1037]
['<sos>', 'you', 're', 'the', 'tallest', 'one', '<eos>', '<pad>', '<pad>', '<pad>']
[3477, 3779, 2749, 3507, 1920, 1920, 1920, 1920, 1920, 1920]
['<sos>', 'ты', 'найвышэйшы', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

[1921, 1361, 2133, 80, 1640, 688, 1946, 1037, 1037, 1037]
['<sos>', 'i', 'know', 'you', 're', 'hungry', '<eos>', '<pad>', '<pad>', '<pad>']
[3477, 2896, 3893, 2279, 3779, 142, 3507, 1920, 1920, 1920]
['<sos>', 'я', 'ведаю', 'што', 'ты', 'галодная', '<eos>', '<pad>', '<pad>', '<pad>']

[1921, 1660, 754, 1946, 1037, 1037, 1037, 1037, 1037, 1037]
['<sos>', 'call', 'us', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[3477, 679, 2404, 3507, 1920, 1920, 1920, 1920, 1920, 1920]
['<sos>', 'пакліч', 'нас', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

[1921, 62, 2102, 1451, 1200, 857, 513, 952, 390, 857]
['<sos>', 'millions', 'of', 'people', 'across', 'the', 'world', 'a

In [4]:
# Устанавливаем гиперпараметры

ENG_VOCAB_SIZE = len(eng_words)
BEL_VOCAB_SIZE = len(bel_words)
NUM_EPOCHS = 10
HIDDEN_SIZE = 16
EMBEDDING_DIM = 30
BATCH_SIZE = 128
NUM_HEADS = 2
NUM_LAYERS = 3
LEARNING_RATE = 0.01
DROPOUT = 0.3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [5]:
# Загружаем в датасет

class Vocab_Dataset(torch.utils.data.Dataset):
  def __init__(self):  
    self.source = np.array(eng_sentences, dtype = int)
    self.target = np.array(bel_sentences, dtype = int)
    
  def __getitem__(self, idx):
    return self.source[idx], self.target[idx]
  
  def __len__(self):
    return len(self.source)

np.random.seed(777)   
dataset = Vocab_Dataset()
NUM_INSTANCES = len(dataset)
TEST_RATIO = 0.3
TEST_SIZE = int(NUM_INSTANCES * 0.3)

indices = list(range(NUM_INSTANCES))

test_idx = np.random.choice(indices, size = TEST_SIZE, replace = False)
train_idx = list(set(indices) - set(test_idx))
train_sampler, test_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(test_idx)

train_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = test_sampler)

In [6]:
# Взято: https://pytorch.org/tutorials/beginner/transformer_tutorial.html

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [7]:
# Сам трансформер

class TransformerNet(nn.Module):
  def __init__(self, num_src_vocab, num_tgt_vocab, embedding_dim, hidden_size, nheads, n_layers, max_src_len, max_tgt_len, dropout):
    super(TransformerNet, self).__init__()

    self.enc_embedding = nn.Embedding(num_src_vocab, embedding_dim)
    self.dec_embedding = nn.Embedding(num_tgt_vocab, embedding_dim)

    self.enc_pe = PositionalEncoding(embedding_dim, max_len = max_src_len)
    self.dec_pe = PositionalEncoding(embedding_dim, max_len = max_tgt_len)

    enc_layer = nn.TransformerEncoderLayer(embedding_dim, nheads, hidden_size, dropout)
    dec_layer = nn.TransformerDecoderLayer(embedding_dim, nheads, hidden_size, dropout)
    self.encoder = nn.TransformerEncoder(enc_layer, num_layers = n_layers)
    self.decoder = nn.TransformerDecoder(dec_layer, num_layers = n_layers)

    self.dense = nn.Linear(embedding_dim, num_tgt_vocab)
    self.log_softmax = nn.LogSoftmax()

  def forward(self, src, tgt):
    src, tgt = self.enc_embedding(src).permute(1, 0, 2), self.dec_embedding(tgt).permute(1, 0, 2)
    src, tgt = self.enc_pe(src), self.dec_pe(tgt)
    memory = self.encoder(src)
    transformer_out = self.decoder(tgt, memory)
    final_out = self.dense(transformer_out)
    return self.log_softmax(final_out)

In [8]:
model = TransformerNet(ENG_VOCAB_SIZE, BEL_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_HEADS, NUM_LAYERS, MAX_SENT_LEN, MAX_SENT_LEN, DROPOUT).to(DEVICE)
loss = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [9]:
# Обучение и тест

for epoch in range(NUM_EPOCHS):
    for mode in ['train', 'test']:
        if mode == 'train':
            model.train()
            my_loader = train_loader
        else:
            model.eval()
            my_loader = test_loader

        current_loss = 0
        for i, (x, y) in enumerate(my_loader):
            x, y  = x.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()

            with torch.set_grad_enabled(mode == 'train'):
                outputs = model(x, y)
                l = loss(outputs.permute(1, 2, 0), y)

                if mode == 'train':
                    l.backward()
                    optimizer.step()

            current_loss += l.item()

        print(f'Эпоха: {epoch+1}, режим: {mode}, потери: {current_loss:.4f}')



Эпоха: 1, режим: train, потери: 58.1192
Эпоха: 1, режим: test, потери: 15.4221
Эпоха: 2, режим: train, потери: 31.8015
Эпоха: 2, режим: test, потери: 13.6195
Эпоха: 3, режим: train, потери: 29.6243
Эпоха: 3, режим: test, потери: 13.4845
Эпоха: 4, режим: train, потери: 29.3206
Эпоха: 4, режим: test, потери: 13.4065
Эпоха: 5, режим: train, потери: 29.2327
Эпоха: 5, режим: test, потери: 13.4727
Эпоха: 6, режим: train, потери: 29.1847
Эпоха: 6, режим: test, потери: 13.4631
Эпоха: 7, режим: train, потери: 29.1586
Эпоха: 7, режим: test, потери: 13.4997
Эпоха: 8, режим: train, потери: 29.1222
Эпоха: 8, режим: test, потери: 13.4521
Эпоха: 9, режим: train, потери: 29.1344
Эпоха: 9, режим: test, потери: 13.4406
Эпоха: 10, режим: train, потери: 29.1310
Эпоха: 10, режим: test, потери: 13.4744
