Обучение языковой модели с помощью LSTM (10 баллов)

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected p

Импорт необходимых библиотек

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List

import seaborn
seaborn.set(palette='summer')

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Подготовка данных

Воспользуемся датасетом imdb. В нем хранятся отзывы о фильмах с сайта imdb. Загрузим данные с помощью функции ```load_dataset```

In [5]:
# Загрузим датасет
dataset = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

### Препроцессинг данных и создание словаря


In [6]:
from tqdm import tqdm

In [7]:
sent_tokenize("rehfb hrjv, iv. wkfjv. kfm, kjr! fnlk")

['rehfb hrjv, iv.', 'wkfjv.', 'kfm, kjr!', 'fnlk']

In [8]:
sentences = []
word_threshold = 32

# Получить отдельные предложения и поместить их в sentences
for feedback in tqdm(dataset['train']['text']):
  feedback_divide = sent_tokenize(feedback)
  for sentence in feedback_divide:
    if len(word_tokenize(sentence)) < word_threshold:
      sentences.append(sentence.lower())

100%|██████████| 25000/25000 [00:58<00:00, 430.06it/s]


In [9]:
print("Всего предложений:", len(sentences))

Всего предложений: 198801


In [10]:
sentences[0]

'i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967.'

Посчитаем для каждого слова его встречаемость.

In [11]:
words = Counter()

# Расчет встречаемости слов
for sent in tqdm(sentences):
  sent = word_tokenize(sent)
  for word in sent:
    words[word] += 1

100%|██████████| 198801/198801 [00:28<00:00, 6896.77it/s]


Добавим в словарь ```vocab_size``` самых встречающихся слов.

In [12]:
vocab = set()
vocab_size = 40000

# Наполнение словаря
dictionary = words.most_common(vocab_size)

In [13]:
for elem in dictionary:
  vocab.add(elem[0])

vocab.add('<unk>')
vocab.add('<bos>')
vocab.add('<eos>')
vocab.add('<pad>')

In [14]:
assert '<unk>' in vocab
assert '<bos>' in vocab
assert '<eos>' in vocab
assert '<pad>' in vocab
assert len(vocab) == vocab_size + 4

In [15]:
print("Всего слов в словаре:", len(vocab))

Всего слов в словаре: 40004


### Подготовка датасета



In [16]:
word2ind = {char: i for i, char in enumerate(vocab)}
ind2word = {i: char for char, i in word2ind.items()}

In [17]:
class WordDataset:
    def __init__(self, sentences):
        self.data = sentences
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx: int) -> List[int]:
        tokenized_sentence = [self.bos_id]
        # Допишите код здесь
        for word in word_tokenize(self.data[idx]):
          tokenized_sentence.append(word2ind.get(word, self.unk_id))

        tokenized_sentence.append(self.eos_id)
        return tokenized_sentence

    def __len__(self) -> int:
        return len(self.data)

In [18]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=word2ind['<pad>']) -> torch.Tensor:
    seq_lens = [len(x) for x in input_batch]
    max_seq_len = max(seq_lens)

    new_batch = []
    for sequence in input_batch:
        for _ in range(max_seq_len - len(sequence)):
            sequence.append(pad_id)
        new_batch.append(sequence)

    sequences = torch.LongTensor(new_batch).to(device)

    new_batch = {
        'input_ids': sequences[:,:-1],
        'target_ids': sequences[:,1:]
    }

    return new_batch

In [19]:
train_sentences, eval_sentences = train_test_split(sentences, test_size=0.2)
eval_sentences, test_sentences = train_test_split(sentences, test_size=0.5)

train_dataset = WordDataset(train_sentences)
eval_dataset = WordDataset(eval_sentences)
test_dataset = WordDataset(test_sentences)

batch_size = 128

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

test_dataloader = DataLoader(
    test_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

## Обучение и архитектура модели



### Функция evaluate (1 балл)

Заполните функцию ```evaluate```

In [20]:
def evaluate(model, criterion, dataloader) -> float:
    model.eval()
    perplexity = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1) # Посчитайте логиты предсказаний следующих слов
            loss = criterion(logits, batch['target_ids'].flatten())
            perplexity.append(torch.exp(loss).item())

    perplexity = sum(perplexity) / len(perplexity)

    return perplexity


### Train loop

In [21]:
def train_model(model, criterion, optimizer, dataloader, epochs):
    # Напишите код здесь
    loss_func = []
    perplexity = []
    model.train()
    for epoch in range(epochs):
      epoch_loss = []
      for batch in tqdm(dataloader):
        optimizer.zero_grad()
        logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
        loss = criterion(logits, batch['target_ids'].flatten())
        epoch_loss.append(loss)
        loss.backward()
        optimizer.step()

      loss_func.append(epoch_loss)
      perplexity.append(evaluate(model, criterion, dataloader))
      model.train()

    return loss_func


### Первый эксперимент

In [22]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, batch_size, hidden_dim):
        super().__init__()
        # Опишите свою нейронную сеть здесь
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.model = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)


    def forward(self, input_batch: torch.Tensor) -> torch.Tensor:
        # А тут опишите forward pass модели
        embeddings = self.embedding(input_batch)
        output, _ = self.model(embeddings)
        return self.linear(output)

In [23]:
model = LanguageModel(len(vocab), batch_size, 256).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=word2ind['<pad>'])
optimizer = torch.optim.Adam(model.parameters())

In [24]:
# Обучите модель здесь
loss = train_model(model, criterion, optimizer, train_dataloader, 5)

100%|██████████| 1243/1243 [02:03<00:00, 10.03it/s]
100%|██████████| 1243/1243 [01:08<00:00, 18.21it/s]
100%|██████████| 1243/1243 [02:05<00:00,  9.88it/s]
100%|██████████| 1243/1243 [01:12<00:00, 17.05it/s]
100%|██████████| 1243/1243 [02:06<00:00,  9.85it/s]
100%|██████████| 1243/1243 [01:06<00:00, 18.67it/s]
100%|██████████| 1243/1243 [02:05<00:00,  9.88it/s]
100%|██████████| 1243/1243 [01:06<00:00, 18.65it/s]
100%|██████████| 1243/1243 [02:05<00:00,  9.88it/s]
100%|██████████| 1243/1243 [01:07<00:00, 18.48it/s]


### Второй эксперимент (2 балла)

Попробуйте что-то поменять в модели или в пайплайне обучения, идеи для экспериментов можно подсмотреть выше.

### Отчет (2 балла)

Опишите проведенные эксперименты. Сравните перплексии полученных моделей. Предложите идеи по улучшению качества моделей.

In [43]:
def generate_sequence(model, starting_seq: str, max_seq_len: int = 128) -> str:
    device = 'cpu'
    model = model.to(device)
    input_ids = [word2ind['<bos>']] + [
        word2ind.get(char, word2ind['<unk>']) for char in starting_seq.split()]
    input_ids = torch.LongTensor(input_ids).to(device)

    model.eval()
    with torch.no_grad():
        for i in range(max_seq_len):
            next_char_distribution = model(input_ids)[-1]
            next_char = next_char_distribution.squeeze().argmax()
            input_ids = torch.cat([input_ids, next_char.unsqueeze(0)])

            if next_char.item() == word2ind['<eos>']:
                break

    words = ' '.join([ind2word[idx.item()] for idx in input_ids])

    return words

In [48]:
generate_sequence(model, starting_seq='i love')

i love
[329, 25026, 35047]


'<bos> i love the movie . <eos>'