In [1]:
def imports():
    global math, np, pd, random, json, torch, Dataset, DataLoader, tqdm, plt, yttm
    
    import math
    import numpy as np
    import pandas as pd

    import random
    import json
    import torch
    from torch.utils.data import Dataset, DataLoader

    from tqdm import tqdm

    from matplotlib import pyplot as plt

    import youtokentome as yttm

In [2]:
imports()

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
DEVICE

device(type='cuda')

### Подготовка данных

In [2]:
qa_data = list()

with open('qa_data.jsonl') as file_object:
    for line in file_object:
        qa_data.append(json.loads(line.strip()))

NameError: name 'json' is not defined

In [10]:
from collections import deque

questin_answer_data = []
for question_answer in qa_data:
    questin_answer_data.append(question_answer['question'])
    deque(map(questin_answer_data.append, question_answer['responses']))

In [11]:
questin_answer_data[:5]

['долго ли идут деньги с яндексденег на карту visa?',
 'нет. прорыв 35 ;)',
 'можно ли зарегистрировать авто в другом регионе',
 'можно на родственника из того региона.. .  а потом ездить по доверке',
 'что делать если у меня очень тонкие ногти а хочется их отрастить?']

In [12]:
with open('for_bpe.txt', 'w') as f:
    f.write('\n'.join(questin_answer_data))

In [13]:
del questin_answer_data

In [14]:
!head for_bpe.txt

долго ли идут деньги с яндексденег на карту visa?
нет. прорыв 35 ;)
можно ли зарегистрировать авто в другом регионе
можно на родственника из того региона.. .  а потом ездить по доверке
что делать если у меня очень тонкие ногти а хочется их отрастить?
витамины и умная эмаль (каждый день)
ванночки с морской солью. с вечера мажь ногти сверху йодом. не бойся, до утра все впитается.
умная эмаль, витамины, йод, и поменьше крась лаком 
лаки фирмы trind производство usa + кальций
в чем отличие медитации от йоги?


In [25]:
VOCAB_SIZE = 30_000
MODEL_PATH = 'pretrained_bpe_lm.model'

In [16]:
yttm.BPE.train(data='for_bpe.txt', vocab_size=VOCAB_SIZE, model=MODEL_PATH)

<youtokentome.youtokentome.BPE at 0x7f6155470f98>

In [17]:
tokenizer = yttm.BPE(model=MODEL_PATH)

In [18]:
questions = []
answers = []

for qa in qa_data:
    for answer in qa['responses']:
        questions.append(qa['question'])
        answers.append(answer)

In [19]:
del qa_data

In [23]:
batch_size = 256
tokenized_questions = []

for i_batch in tqdm(range(math.ceil(len(questions) / batch_size))):
    tokenized_questions.extend(
        tokenizer.encode(
            list(questions[i_batch*batch_size:(i_batch+1)*batch_size]),
            bos=True, eos=False,
        )
    )

100%|██████████| 30341/30341 [01:07<00:00, 448.48it/s]


In [24]:
# как сложно без gc
del questions

In [25]:
tokenized_answers = []

for i_batch in tqdm(range(math.ceil(len(answers) / batch_size))):
    tokenized_answers.extend(
        tokenizer.encode(
            list(answers[i_batch*batch_size:(i_batch+1)*batch_size]),
            bos=True, eos=False,
        )
    )

100%|██████████| 30341/30341 [01:03<00:00, 480.40it/s]


In [26]:
del answers

In [29]:
# у меня не хватает памяти, лучше сохраниться
import pickle

with open('questions', 'wb') as f:
    pickle.dump(tokenized_questions, f)

with open('answers', 'wb') as f:
    pickle.dump(tokenized_answers, f)

In [30]:
del tokenized_questions
del tokenized_answers

### Датасет

In [5]:
import pickle

# будем брать одну десятую датасета, иначе памяти не хватит
with open('questions', 'rb') as f:
    questions = pickle.load(f)
questions = questions[:int(len(questions)/20)]

In [6]:
with open('answers', 'rb') as f:
    answers = pickle.load(f)
answers = answers[:int(len(answers)/20)]

In [7]:
print(len(questions), len(answers))
assert len(questions) == len(answers)

388356 388356


In [8]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          14961        7044        5526         558        2390        7048
Swap:         16134        2867       13267


In [9]:
imports()

In [10]:
PAD_INDEX = 0
EOS_INDEX = 3
VOCAB_SIZE = 30_000

In [11]:
class SequenceBucketingData(torch.utils.data.Dataset):
    """по сути то же, что в условии, только другие сиквенсы"""
    def __init__(self, questions, answers, max_len, pad_index=PAD_INDEX, eos_index=EOS_INDEX):
        self.questions = questions
        self.answers = answers
        if len(questions) != len(answers):
            raise ValueError('Вопросы и ответы должны быть одной длины')
        self.max_len = max_len
        self.pad_index = pad_index
        self.eos_index = eos_index
        
    def __len__(self):
        return len(self.questions)
    
    def _prepare_sample(self, sequence_q, sequence_a, max_len_q, max_len_a):
        sequence_q = sequence_q[:max_len_q]
        sequence_a = sequence_a[:max_len_a]
        x = sequence_q
        y = sequence_a
        pads_x = [self.pad_index] * (max_len_q - len(x))
        pads_y = [self.pad_index] * (max_len_a - len(y))
        x += pads_x
        y += pads_y
        return x, y
    
    def __getitem__(self, index):
        batch_q = self.questions[index]
        batch_a = self.answers[index]
        max_len_q = min([
            self.max_len,
            max(map(len, batch_q)),
        ])
        max_len_a = min([
            self.max_len,
            max(map(len, batch_a)),
        ])
        batch_x = []
        batch_y = []
        for sample_q, sample_a in zip(batch_q, batch_a):
            x, y = self._prepare_sample(sample_q, sample_a, self.max_len, self.max_len)
            batch_x.append(x)
            batch_y.append(y)
        batch_x = torch.tensor(batch_x).long().to(DEVICE)
        batch_y = torch.tensor(batch_y).long().to(DEVICE)
        return batch_x, batch_y

In [12]:
questions = sorted(questions, key=len)
answers = sorted(answers, key=len)

In [13]:
# сделаем батч побольше
BATCH_SIZE = 64
MAX_LEN = 32

In [14]:
batches_q = []
batches_a = []
i = 0
for i_batch in range(math.ceil(len(questions) / BATCH_SIZE)):
    q = questions[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE]
    a = answers[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE]
    if len(q) != BATCH_SIZE or len(a) != BATCH_SIZE:
        continue
    batches_q.append(questions[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE])
    batches_a.append(answers[i_batch*BATCH_SIZE:(i_batch+1)*BATCH_SIZE])
    if i > 0:
        break
    i += 1

In [15]:
len(batches_q)

2

In [16]:
# у нас же нет гарантии, что количество вопросов делится на батч-сайз
# мы лишнее убрали условием сверху, надо убедиться
all(
    map(lambda b: True if len(b) == BATCH_SIZE else False, batches_q)
)

True

In [17]:
validation_start_index = int(len(batches_q) * 0.05)

In [18]:
train_seq = SequenceBucketingData(
    questions=batches_q[:-validation_start_index],
    answers=batches_a[:-validation_start_index],
    max_len=MAX_LEN)
test_seq = SequenceBucketingData(
    questions=batches_q[-validation_start_index:],
    answers=batches_a[-validation_start_index:],
    max_len=MAX_LEN)

In [19]:
train_seq = SequenceBucketingData(batches_q, batches_a, MAX_LEN)
len(train_seq)

2

In [20]:
train_loader = torch.utils.data.DataLoader(train_seq, batch_size=None, batch_sampler=None)
validation_loader = torch.utils.data.DataLoader(test_seq, batch_size=None, batch_sampler=None)

In [21]:
for x, y in train_loader:
    print(x.shape)

torch.Size([64, 32])
torch.Size([64, 32])


### Модель

In [22]:
# это специальный дропаут для реккуретных сетей
# хорошо это объясняется здесь: https://youtu.be/WLaAIYQHHMU?t=1093

class SpatialDropout(torch.nn.Dropout2d):
    
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p
    
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T)
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [None]:
HIDDEN_SIZE = 300
LR = 0.01

In [None]:
# сильно переписанный вариант из документации
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        # батч идёт первым, нуже батчфёрст
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, x):
        # x - [batch_size, max_len] 
        embedded = self.embedding(x)
        # embedded - [batch_size, max_len, 300]
        _, hidden = self.gru(embedded)
        # hidden - [1, batch_size, hidden_size]
        return hidden


In [None]:
# тоже сильно переписанный вариант из документации
class Decoder(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(-1)

    def forward(self, x, hidden):
        # x - [batch_size]
        # hidden - [1, batch_size, hidden_size]
        x = x.long() # торч не всегда маг с `double` и `long int` в CUDA
        embedded = self.embedding(x)
        # embedded - [batch_size, hidden_size]
        embedded = embedded.unsqueeze(1)
        # embedded - [batch_size, 1, hidden_size]
        _, hidden = self.gru(embedded, hidden)
        # hidden - [1, batch_size, hidden_size]
        predictions = self.out(hidden)
        # predictions - [1, batch_size, vocab_size]
        predictions = self.softmax(predictions)
        # predictions - [1, batch_size, vocab_size]
        return predictions, hidden

    def init_input(self):
        """Запускаем декодер с EOS
        
        Зачем нам BOS, если уже есть EOS, логично?
        """
        input_zeros = torch.zeros(BATCH_SIZE, device=DEVICE)
        input_eos = input_zeros + EOS_INDEX
        return input_eos

In [None]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target):
        outputs = torch.zeros(MAX_LEN, BATCH_SIZE, VOCAB_SIZE)#.to(DEVICE)
        # outputs - [max_len, batch_size, vocab_size]
        hidden = self.encoder(source)
        # hidden - [1, batch_size, hidden_size]
        
        x = decoder.init_input()
        # x - [batch_size]
        for t in range(1, MAX_LEN):
            predictions, hidden = self.decoder(x, hidden)
            prediction = predictions.argmax(-1)
            # prediction - [1, batch_size]
            prediction = prediction.squeeze(0)
            # prediction - [batch_size]
            x = prediction
            outputs[t] = predictions
        return outputs
        

In [None]:
encoder = Encoder(VOCAB_SIZE, HIDDEN_SIZE).to(DEVICE)
decoder = Decoder(HIDDEN_SIZE, VOCAB_SIZE).to(DEVICE)
seq2seq = Seq2Seq(encoder, decoder).to(DEVICE)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=LR)
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

In [None]:
from time import perf_counter

def train(
    train_loader, test_loader, seq2seq,
    optimizer, criterion,
):
    for e in range(1, 500):
        print(f'Epoch: {e}')
        losses = []
        
        seq2seq.train()
        for x, y in tqdm(train_loader):
            # x, y - [batch_size, max_len]
            output = seq2seq.forward(x, y)
            
            # для лосса надо смёрджить длину батча и длину последовательности
            # output - [max_len, batch_size, vocab_size]
            output = output.transpose(0, 1)
            output = output.reshape(-1, output.shape[-1]).to(DEVICE)
            # output - [batch_size*max_len, vocab_size]
            y = y.reshape(-1).to(DEVICE)
            # y - [batch_size*max_len]
            loss = criterion(output, y)
            
            torch.nn.utils.clip_grad_norm_(seq2seq.parameters(), max_norm=1)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.item())
        try:
            torch.save(seq2seq.state_dict(), f'epoch_{e}_seq2seq.pth')
            torch.save(encoder.state_dict(), f'epoch_{e}_encoder.pth')
            torch.save(decoder.state_dict(), f'epoch_{e}_decoder.pth')
        except:
            pass
        print(np.mean(losses[-100:]))
    return losses
    

In [None]:
losses = train(train_loader, validation_loader, seq2seq, optimizer, criterion)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 1


100%|██████████| 2/2 [00:04<00:00,  2.07s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

10.210196018218994
Epoch: 2


100%|██████████| 2/2 [00:03<00:00,  1.99s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

7.698563814163208
Epoch: 3


100%|██████████| 2/2 [00:04<00:00,  2.02s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

6.545029401779175
Epoch: 4


100%|██████████| 2/2 [00:04<00:00,  2.15s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

6.534493923187256
Epoch: 5


100%|██████████| 2/2 [00:04<00:00,  2.15s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

6.358936548233032
Epoch: 6


100%|██████████| 2/2 [00:04<00:00,  2.03s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

6.18862771987915
Epoch: 7


100%|██████████| 2/2 [00:03<00:00,  1.97s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

6.094650983810425
Epoch: 8


100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.893147230148315
Epoch: 9


100%|██████████| 2/2 [00:04<00:00,  2.03s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.84557843208313
Epoch: 10


100%|██████████| 2/2 [00:04<00:00,  2.27s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.812119483947754
Epoch: 11


100%|██████████| 2/2 [00:04<00:00,  2.27s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.418275594711304
Epoch: 12


100%|██████████| 2/2 [00:04<00:00,  2.16s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.238197326660156
Epoch: 13


100%|██████████| 2/2 [00:04<00:00,  2.18s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

5.043272256851196
Epoch: 14


100%|██████████| 2/2 [00:04<00:00,  2.30s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.912418842315674
Epoch: 15


100%|██████████| 2/2 [00:04<00:00,  2.23s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.679694175720215
Epoch: 16


100%|██████████| 2/2 [00:04<00:00,  2.34s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.5024895668029785
Epoch: 17


100%|██████████| 2/2 [00:04<00:00,  2.18s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.451125860214233
Epoch: 18


100%|██████████| 2/2 [00:04<00:00,  2.46s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.3045947551727295
Epoch: 19


100%|██████████| 2/2 [00:04<00:00,  2.23s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.190587759017944
Epoch: 20


100%|██████████| 2/2 [00:04<00:00,  2.26s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.122583389282227
Epoch: 21


100%|██████████| 2/2 [00:04<00:00,  2.32s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

4.047983407974243
Epoch: 22


100%|██████████| 2/2 [00:04<00:00,  2.14s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

3.9721044301986694
Epoch: 23


100%|██████████| 2/2 [00:04<00:00,  2.10s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

3.9638174772262573
Epoch: 24


100%|██████████| 2/2 [00:04<00:00,  2.14s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

3.943518877029419
Epoch: 25


100%|██████████| 2/2 [00:04<00:00,  2.06s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

3.9046096801757812
Epoch: 26


100%|██████████| 2/2 [00:04<00:00,  2.18s/it]


In [32]:
losses

2

In [101]:
del seq2seq

In [102]:
del encoder

In [103]:
del decoder

In [104]:
# почему я не люблю ноутбуки
import gc; gc.collect()

12792

In [105]:
# не помогает, чёрт его возми! не весь мусор удаляет
torch.cuda.empty_cache()

In [54]:
x = torch.FloatTensor([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
torch.softmax(x, 1)

tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])

In [126]:
import torch
x = torch.zeros(MAX_LEN, BATCH_SIZE)
print(x.shape)
x = x.reshape(-1)
print(x.shape)

torch.Size([32, 64])
torch.Size([2048])
