In [1]:
def imports():
    global math, np, pd, random, json, torch, Dataset, DataLoader, tqdm, plt, yttm
    
    import math
    import numpy as np
    import pandas as pd

    import random
    import json
    import torch
    from torch.utils.data import Dataset, DataLoader

    from tqdm import tqdm

    from matplotlib import pyplot as plt

    import youtokentome as yttm

In [2]:
imports()

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
DEVICE

device(type='cuda')

### Подготовка данных

In [2]:
qa_data = list()

with open('qa_data.jsonl') as file_object:
    for line in file_object:
        qa_data.append(json.loads(line.strip()))

NameError: name 'json' is not defined

In [10]:
from collections import deque

questin_answer_data = []
for question_answer in qa_data:
    questin_answer_data.append(question_answer['question'])
    deque(map(questin_answer_data.append, question_answer['responses']))

In [11]:
questin_answer_data[:5]

['долго ли идут деньги с яндексденег на карту visa?',
 'нет. прорыв 35 ;)',
 'можно ли зарегистрировать авто в другом регионе',
 'можно на родственника из того региона.. .  а потом ездить по доверке',
 'что делать если у меня очень тонкие ногти а хочется их отрастить?']

In [12]:
with open('for_bpe.txt', 'w') as f:
    f.write('\n'.join(questin_answer_data))

In [13]:
del questin_answer_data

In [14]:
!head for_bpe.txt

долго ли идут деньги с яндексденег на карту visa?
нет. прорыв 35 ;)
можно ли зарегистрировать авто в другом регионе
можно на родственника из того региона.. .  а потом ездить по доверке
что делать если у меня очень тонкие ногти а хочется их отрастить?
витамины и умная эмаль (каждый день)
ванночки с морской солью. с вечера мажь ногти сверху йодом. не бойся, до утра все впитается.
умная эмаль, витамины, йод, и поменьше крась лаком 
лаки фирмы trind производство usa + кальций
в чем отличие медитации от йоги?


In [25]:
VOCAB_SIZE = 30_000
MODEL_PATH = 'pretrained_bpe_lm.model'

In [16]:
yttm.BPE.train(data='for_bpe.txt', vocab_size=VOCAB_SIZE, model=MODEL_PATH)

<youtokentome.youtokentome.BPE at 0x7f6155470f98>

In [17]:
tokenizer = yttm.BPE(model=MODEL_PATH)

In [18]:
questions = []
answers = []

for qa in qa_data:
    for answer in qa['responses']:
        questions.append(qa['question'])
        answers.append(answer)

In [19]:
del qa_data

In [23]:
batch_size = 256
tokenized_questions = []

for i_batch in tqdm(range(math.ceil(len(questions) / batch_size))):
    tokenized_questions.extend(
        tokenizer.encode(
            list(questions[i_batch*batch_size:(i_batch+1)*batch_size]),
            bos=True, eos=False,
        )
    )

100%|██████████| 30341/30341 [01:07<00:00, 448.48it/s]


In [24]:
# как сложно без gc
del questions

In [25]:
tokenized_answers = []

for i_batch in tqdm(range(math.ceil(len(answers) / batch_size))):
    tokenized_answers.extend(
        tokenizer.encode(
            list(answers[i_batch*batch_size:(i_batch+1)*batch_size]),
            bos=True, eos=False,
        )
    )

100%|██████████| 30341/30341 [01:03<00:00, 480.40it/s]


In [26]:
del answers

In [29]:
# у меня не хватает памяти, лучше сохраниться
import pickle

with open('questions', 'wb') as f:
    pickle.dump(tokenized_questions, f)

with open('answers', 'wb') as f:
    pickle.dump(tokenized_answers, f)

In [30]:
del tokenized_questions
del tokenized_answers

### Датасет

In [5]:
import pickle

# будем брать одну десятую датасета, иначе памяти не хватит
with open('questions', 'rb') as f:
    questions = pickle.load(f)
questions = questions[:int(len(questions)/10)]

In [6]:
with open('answers', 'rb') as f:
    answers = pickle.load(f)
answers = answers[:int(len(answers)/10)]

In [7]:
print(len(questions), len(answers))
assert len(questions) == len(answers)

776713 776713


In [8]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          14961        7587        4368         634        3005        6409
Swap:         16134        2317       13817


In [9]:
imports()

In [10]:
PAD_INDEX = 0
EOS_INDEX = 3
VOCAB_SIZE = 30_000

In [11]:
class SequenceBucketingData(torch.utils.data.Dataset):
    """по сути то же, что в условии, только другие сиквенсы"""
    def __init__(self, questions, answers, max_len, pad_index=PAD_INDEX, eos_index=EOS_INDEX):
        self.questions = questions
        self.answers = answers
        if len(questions) != len(answers):
            raise ValueError('Вопросы и ответы должны быть одной длины')
        self.max_len = max_len
        self.pad_index = pad_index
        self.eos_index = eos_index
        
    def __len__(self):
        return len(self.questions)
    
    def _prepare_sample(self, sequence_q, sequence_a, max_len_q, max_len_a):
        sequence_q = sequence_q[:max_len_q]
        sequence_a = sequence_a[:max_len_a]
        x = sequence_q
        y = sequence_a
        pads_x = [self.pad_index] * (max_len_q - len(x))
        pads_y = [self.pad_index] * (max_len_a - len(y))
        x += pads_x
        y += pads_y
        return x, y
    
    def __getitem__(self, index):
        batch_q = self.questions[index]
        batch_a = self.answers[index]
        max_len_q = min([
            self.max_len,
            max(map(len, batch_q)),
        ])
        max_len_a = min([
            self.max_len,
            max(map(len, batch_a)),
        ])
        batch_x = []
        batch_y = []
        for sample_q, sample_a in zip(batch_q, batch_a):
            x, y = self._prepare_sample(sample_q, sample_a, self.max_len, self.max_len)
            batch_x.append(x)
            batch_y.append(y)
        batch_x = torch.tensor(batch_x).long()#.to(DEVICE)
        batch_y = torch.tensor(batch_y).long()#.to(DEVICE)
        return batch_x, batch_y

In [12]:
questions = sorted(questions, key=len)
answers = sorted(answers, key=len)

In [13]:
# сделаем батч побольше
BATCH_SIZE = 64
MAX_LEN = 32

In [14]:
batches_q = []
batches_a = []

for i_batch in range(math.ceil(len(questions) / BATCH_SIZE)):
    q = questions[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE]
    a = answers[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE]
    if len(q) != BATCH_SIZE or len(a) != BATCH_SIZE:
        continue
    batches_q.append(questions[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE])
    batches_a.append(answers[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE])

In [15]:
# у нас же нет гарантии, что количество вопросов делится на батч-сайз
# мы лишнее убрали условием сверху, надо убедиться
all(
    map(lambda b: True if len(b) == BATCH_SIZE else False, batches_q)
)

True

In [16]:
validation_start_index = int(len(batches_q) * 0.05)

In [17]:
train_seq = SequenceBucketingData(
    questions=batches_q[:-validation_start_index],
    answers=batches_a[:-validation_start_index],
    max_len=MAX_LEN)
test_seq = SequenceBucketingData(
    questions=batches_q[-validation_start_index:],
    answers=batches_a[-validation_start_index:],
    max_len=MAX_LEN)

In [18]:
train_loader = torch.utils.data.DataLoader(train_seq, batch_size=None, batch_sampler=None)
validation_loader = torch.utils.data.DataLoader(test_seq, batch_size=BATCH_SIZE)

### Модель

In [19]:
# это специальный дропаут для реккуретных сетей
# хорошо это объясняется здесь: https://youtu.be/WLaAIYQHHMU?t=1093

class SpatialDropout(torch.nn.Dropout2d):
    
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p
    
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T)
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [38]:
HIDDEN_SIZE = 300

In [71]:
# сильно переписанный вариант из документации
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        # батч идёт первым, нуже батчфёрст
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, x):
        # x - [batch_size, max_len] 
        embedded = self.embedding(x)
        # embedded - [batch_size, max_len, 300]
        _, hidden = self.gru(embedded)
        # hidden - [1, batch_size, hidden_size]
        return hidden


In [141]:
# тоже сильно переписанный вариант из документации
class Decoder(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(1)

    def forward(self, x, hidden):
        # x - [batch_size]
        # hidden - [1, batch_size, hidden_size]
        x = x.long() # торч не всегда маг с `double` и `long int` в CUDA
        embedded = self.embedding(x)
        # embedded - [batch_size, hidden_size]
        embedded = embedded.unsqueeze(1)
        print(embedded.shape)
        # embedded - [batch_size, 1, hidden_size]
        outputs, hidden = self.gru(embedded, hidden)
        # outputs, hidden - [1, max_len, 300], 
        outputs = self.out(outputs[0])
        # outputs - [max_len, vocab_size]
        output = self.softmax(outputs)
        return output, hidden

    def init_hidden(self):
        """Запускаем декодер с EOS
        
        Зачем нам BOS, если уже есть EOS, логично?
        """
        hidden_zeros = torch.zeros(BATCH_SIZE)
        hidden_eos = hidden_zeros + EOS_INDEX
        return hidden_eos#, device=DEVICE)

In [142]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):
        outputs = torch.zeros(BATCH_SIZE, VOCAB_SIZE)#.to(DEVICE)
        # outputs - [batch_size, vocab_size]
        hidden = self.encoder(source)
        # hidden - [1, batch_size, hidden_size]
        
        x = decoder.init_hidden()
        # x - [batch_size]
        for t in range(1, BATCH_SIZE):
            output, hidden = self.decoder(x, hidden)
            outputs[t] = output
            x = output.argmax(1)
        return outputs
        

In [143]:
encoder = Encoder(VOCAB_SIZE, HIDDEN_SIZE)
decoder = Decoder(HIDDEN_SIZE, VOCAB_SIZE)
seq2seq = Seq2Seq(encoder, decoder)

In [144]:
def train(
    train_loader, test_loader,
    seq2seq,
    encoder_criterion=torch.nn.CrossEntropyLoss(ignore_index=PAD_INDEX),
    decoder_criterion=torch.nn.CrossEntropyLoss(ignore_index=PAD_INDEX),
    encoder_optimizer=torch.optim.Adam(params=encoder.parameters()),
    decoder_optimizer=torch.optim.Adam(params=decoder.parameters()),
):
    for e in range(10):
        print(f'Epoch {e}')
        seq2seq.train()
        for x, y in train_loader:
#             x = x.to(DEVICE)
            # print(x.shape, 1)
            output = seq2seq.forward(x, y)
    

In [145]:
train(train_loader, None, seq2seq)

Epoch 0
torch.Size([64, 1, 300])
300


RuntimeError: Expected hidden size (1, 1, 300), got [1, 64, 300]

In [65]:
# почему я не люблю ноутбуки
import gc; gc.collect()
del seq2seq
del encoder
del decoder

In [46]:
encoder_criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
decoder_criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
encoder_optimizer = torch.optim.Adam(params=encoder.parameters())
decoder_optimizer = torch.optim.Adam(params=decoder.parameters())

In [54]:
x = torch.FloatTensor([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
torch.softmax(x, 1)

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])

In [65]:
x = torch.zeros(BATCH_SIZE)
x + EOS_INDEX

tensor([3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
        3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
        3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
        3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])