In [None]:
!pip install datasets nltk

# Encoder and Decoder


Всем привет! На этом занятии мы обсудим две главные архитектуры для работы с текстовыми данными: Encoder и Decoder.

In [None]:
import torch
import nltk
import torchvision
import datasets


nltk.download('omw-1.4')
nltk.download('wordnet')

## Encoder. Task: Part-of-Speech Tagging

In [None]:
# example of dataset (https://huggingface.co/datasets/conll2003)

In [None]:
conll_dataset = datasets.load_dataset("conll2003")

In [None]:
conll_dataset

In [None]:
conll_dataset["train"]

In [None]:
# Collect the distinct word forms and the distinct POS tag ids seen in the training split
all_words = set()
for sentence in conll_dataset["train"]["tokens"]:
    all_words.update(sentence)

all_pos_tags = set()
for sentence_pos in conll_dataset["train"]["pos_tags"]:
    all_pos_tags.update(sentence_pos)

In [None]:
len(all_words), len(all_pos_tags)

In [None]:
# example with nltk (https://www.nltk.org/book/ch05.html)
# Newer NLTK releases look the tagger up as 'averaged_perceptron_tagger_eng', older ones use
# the unsuffixed name. Request both so nltk.pos_tag works regardless of the installed
# version; an unknown id is only reported by the downloader, it does not raise.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
tokenizer = nltk.tokenize.WordPunctTokenizer()

In [None]:
text = "I am fine..."
nltk.pos_tag(tokenizer.tokenize(text))

In [None]:
nltk.pos_tag(conll_dataset["validation"]["tokens"][0])

In [None]:
conll_dataset["validation"]["pos_tags"][0]

In [None]:
# create Dataset and Dataloader out of conll

In [None]:
class ConllDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (token ids, POS tag ids) for one sentence.

    Args:
        dataset: mapping-like object exposing the columns "tokens" (list of
            word lists) and "pos_tags" (list of tag-id lists).
        all_words: iterable of known words; each word gets the id
            ``position + 16``, so ids 0-15 stay free for special tokens.

    Words that are not in ``all_words`` raise ``KeyError`` on access.
    """

    def __init__(self, dataset, all_words):
        self.dataset = dataset
        self.tokenizer = {
            w: idx + 16
            for idx, w in enumerate(all_words)
        }
        # Read each column once: indexing `dataset["tokens"]` on every
        # __getitem__ can materialise the whole column per item.
        self._tokens = dataset["tokens"]
        self._pos_tags = dataset["pos_tags"]

    def __getitem__(self, idx):
        """Return (list of token ids, list of tag ids) for sentence `idx`."""
        tokens = [self.tokenizer[w] for w in self._tokens[idx]]
        tags = self._pos_tags[idx]
        return tokens, tags

    def __len__(self):
        """Number of sentences."""
        return len(self._tokens)

In [None]:
train_dataset = ConllDataset(conll_dataset["train"][:128], all_words) # only 128 datapoints 

In [None]:
train_dataset[0], train_dataset[1]

In [None]:
# Sentences of different lengths cannot be stacked into one tensor directly.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception stop a full notebook run.
try:
    torch.tensor([[4993, 10341, 8947, 20324, 14005, 4188, 15844, 12482, 15792], [20263, 11688]])
except ValueError as error:
    print(f"ValueError: {error}")

In [None]:
def collate_fn(items):
    """Turn a batch of (token ids, tag ids) pairs into two PackedSequence objects.

    Args:
        items: sequence of pairs; element 0 holds the token ids and element 1
            the tag ids of one sentence.

    Returns:
        (packed_tokens, packed_tags); sequences do not have to be pre-sorted by length.
    """
    token_seqs = []
    tag_seqs = []
    for item in items:
        token_seqs.append(torch.tensor(item[0]))
        tag_seqs.append(torch.tensor(item[1]))

    pack = torch.nn.utils.rnn.pack_sequence
    return pack(token_seqs, enforce_sorted=False), pack(tag_seqs, enforce_sorted=False)

In [None]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=collate_fn,
)

In [None]:
packed_tokens, packed_tags = next(iter(train_dataloader))

In [None]:
packed_tokens

In [None]:
# 3> 18481 13798
# 0> 4993 10341
# 1> 20263 11688
# 2> 7471 11511

In [None]:
pad_packed_tokens = torch.nn.utils.rnn.pad_packed_sequence(packed_tokens)

In [None]:
# Add a trailing feature axis of size 1 to mimic embedded inputs: (seq_length, batch) ->
# (seq_length, batch, 1). unsqueeze avoids hard-coding the batch size of 4.
emb_pad_packed_tokens = pad_packed_tokens[0].unsqueeze(-1)
emb_pad_packed_tokens

In [None]:
torch.nn.utils.rnn.pack_padded_sequence(emb_pad_packed_tokens, pad_packed_tokens[1], enforce_sorted=False)

In [None]:
# try to code it on RNN

In [None]:
class RNN4POS(torch.nn.Module):
    """LSTM tagger: embeds token ids, runs an LSTM and scores every position.

    Args:
        num_words: size of the embedding table (must exceed the largest token id).
        num_tags: number of output classes per token.
        hidden_size: embedding size and LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers; only used when num_layers > 1.
    """

    def __init__(self, num_words, num_tags, hidden_size: int = 64, num_layers: int = 1, dropout: float = 0.1):
        super().__init__()

        self.embeddings = torch.nn.Embedding(num_words, hidden_size)
        # LSTM applies dropout only between layers; with a single layer a non-zero
        # value has no effect and just triggers a warning, so pass 0 in that case.
        self.rnn = torch.nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.cls_head = torch.nn.Linear(hidden_size, num_tags)

    def forward(self, input_ids, length_inputs):
        """
        Args:
            input_ids: torch.Tensor of token ids, shape: (seq_length, batch_size)
            length_inputs: true (unpadded) length of every sequence in the batch,
                as accepted by pack_padded_sequence

        Returns:
            torch.Tensor of tag logits, shape: (max_length, batch_size, num_tags)

        To understand inputs for this module, please check rnn_padding:
            - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html
            - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html
        """
        embs = self.embeddings(input_ids)

        # Pack so the LSTM skips padded positions, then pad back for the per-token head
        packed_sequences = torch.nn.utils.rnn.pack_padded_sequence(embs, length_inputs, enforce_sorted=False)
        rnn_outputs, _ = self.rnn(packed_sequences)
        unpacked_sequences, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_outputs)

        return self.cls_head(unpacked_sequences)

In [None]:
# ConllDataset assigns word ids starting at 16, so the embedding table needs
# len(all_words) + 16 rows; with only len(all_words) rows the largest ids are out of range.
model = RNN4POS(len(all_words) + 16, len(all_pos_tags))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# ignore_index=0 skips the zero padding added by pad_packed_sequence.
# NOTE(review): this also ignores any genuine POS tag whose id is 0 - confirm against the label list.
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

In [None]:
# One pass over the training batches: unpack -> forward -> loss -> backward -> step
for idx, batch in enumerate(train_dataloader):
    inputs, labels = batch
    # Batches arrive as PackedSequence objects; pad them back to (seq_length, batch_size)
    inputs, inputs_length = torch.nn.utils.rnn.pad_packed_sequence(inputs)
    labels, _ = torch.nn.utils.rnn.pad_packed_sequence(labels)
    optimizer.zero_grad()
    outputs = model(inputs, inputs_length)
    num_classes = outputs.size(-1)
    loss = criterion(outputs.view(-1, num_classes), labels.view(-1))
    # Gradients must be computed before the optimizer step; without backward()
    # the parameters have no gradients and step() changes nothing.
    loss.backward()
    optimizer.step()
    print(f"{idx:>10}: {loss:.3f}")

In [None]:
# delete everything from the encoder experiment, then run torch.cuda.empty_cache()

In [None]:
# Drop every reference created by the POS experiment so its memory can be reclaimed.
# Each name is deleted exactly once: a second `del` of the same name raises NameError.
del model
del optimizer
del loss
del outputs  # keeps the autograd graph of the last batch alive otherwise
del train_dataloader
del train_dataset
del batch
del inputs
del labels

In [None]:
torch.cuda.empty_cache()

## Decoder. Text Generation

In [None]:
# look at dataset (https://huggingface.co/datasets/rotten_tomatoes)

In [None]:
rt_dataset = datasets.load_dataset("rotten_tomatoes")

In [None]:
rt_dataset

In [None]:
rt_dataset["train"]["text"][0]

In [None]:
# Vocabulary of the reviews: every distinct chunk obtained by splitting on single spaces
all_words = set()
for s in rt_dataset["train"]["text"]:
    all_words.update(s.split(" "))

In [None]:
# BOS > 1
# EOS > 2
# PAD > 0 (pad_packed_sequence pads with 0 by default; the loss uses ignore_index=0)

In [None]:
# create torch dataset

In [None]:
class RTDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one review as a 1-D tensor of token ids.

    Every item is framed as ``[1] + word ids + [2]`` (1 = BOS, 2 = EOS).

    Args:
        dataset: mapping-like object exposing a "text" column of strings.
        all_words: iterable of known words; each word gets the id
            ``position + 16``, so ids 0-15 stay free for special tokens.

    Words that are not in ``all_words`` raise ``KeyError`` on access.
    """

    def __init__(self, dataset, all_words):
        self.dataset = dataset
        self.tokenizer = {
            w: idx + 16
            for idx, w in enumerate(all_words)
        }
        # Read the column once: indexing `dataset["text"]` on every __getitem__
        # can materialise the whole column for each single item.
        self._texts = dataset["text"]

    def __getitem__(self, idx):
        """Return the token ids of review `idx`, including BOS/EOS, as a tensor."""
        # better convert in tensor
        tokens = [1] + [self.tokenizer[w] for w in self._texts[idx].split(" ")] + [2]
        return torch.tensor(tokens)

    def __len__(self):
        """Number of reviews."""
        return len(self._texts)

In [None]:
train_dataset = RTDataset(rt_dataset["train"], all_words)  # the whole training split is used here (no slicing)

In [None]:
def collate_fn(items):
    """Pack a batch of variable-length 1-D token tensors into one PackedSequence.

    The sequences do not have to be pre-sorted by length.
    """
    return torch.nn.utils.rnn.pack_sequence(items, enforce_sorted=False)


train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn)

In [None]:
next(iter(train_dataloader))

In [None]:
# write down rnn model

In [None]:
class RNN4GEN(torch.nn.Module):
    """LSTM language model whose output projection reuses the embedding matrix.

    Args:
        num_words: size of the embedding table and of the output vocabulary.
        hidden_size: embedding size and LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers; only used when num_layers > 1.
    """

    def __init__(self, num_words, hidden_size: int = 256, num_layers: int = 6, dropout: float = 0.1):
        super().__init__()

        self.embeddings = torch.nn.Embedding(num_words, hidden_size)
        # LSTM applies dropout only between layers; pass 0 for a single layer to avoid the warning
        self.rnn = torch.nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )

    def forward(self, input_ids, length_inputs):
        """
        Args:
            input_ids: torch.Tensor of token ids, shape: (seq_length, batch_size)
            length_inputs: true (unpadded) length of every sequence in the batch,
                as accepted by pack_padded_sequence

        Returns:
            torch.Tensor of next-token logits, shape: (max_length, batch_size, num_words)

        To understand inputs for this module, please check rnn_padding:
            - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html
            - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html
        """
        embs = self.embeddings(input_ids)

        packed_sequences = torch.nn.utils.rnn.pack_padded_sequence(embs, length_inputs, enforce_sorted=False)
        rnn_outputs, _ = self.rnn(packed_sequences)
        unpacked_sequences, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_outputs)

        # unpacked_sequences (seq_length, batch_size, hidden_size)
        # self.embeddings.weight (num_words, hidden_size)
        # want output: (seq_length, batch_size, num_words)

        return unpacked_sequences @ self.embeddings.weight.T

    @torch.no_grad()
    def predict_one_token(self, input_ids):
        """
        Score the token that follows the given prefix.

        Args:
            input_ids: torch.Tensor of token ids, shape: (seq_length,) for a single
                unbatched prefix or (seq_length, 1)

        Returns:
            Logits over the vocabulary taken at the last position:
            shape (num_words,) for 1-D input, (1, num_words) for (seq_length, 1) input.
        """
        # Inference must not use dropout: switch to eval mode for the call and
        # restore the caller's mode afterwards, even if the forward pass fails.
        was_training = self.training
        self.eval()
        try:
            embs = self.embeddings(input_ids)
            rnn_outputs, _ = self.rnn(embs)
            return rnn_outputs[-1] @ self.embeddings.weight.T
        finally:
            self.train(was_training)

In [None]:
# Vocabulary size is len(all_words) + 16 because RTDataset assigns word ids starting at 16
model = RNN4GEN(len(all_words) + 16)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# ignore_index=0: pad_packed_sequence fills padded positions with 0 by default, so the loss skips them
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

In [None]:
output = model.predict_one_token(torch.tensor([1, 10]))
output.size()

In [None]:
len(all_words) + 16

In [None]:
torch.softmax(output, dim=0)

In [None]:
# One pass of next-token training: the input is the sequence without its last position,
# the target is the same sequence shifted left by one.
for idx, batch in enumerate(train_dataloader):
    inputs_ = batch
    inputs_, inputs_length_ = torch.nn.utils.rnn.pad_packed_sequence(inputs_)
    inputs, labels = inputs_[:-1], inputs_[1:]
    optimizer.zero_grad()
    # Every sequence loses one position, hence the lengths shrink by one as well
    outputs = model(inputs, inputs_length_ - 1)
    loss = criterion(outputs.view(-1, len(all_words) + 16), labels.reshape(-1))
    # Gradients must be computed before the optimizer step; without backward()
    # the parameters have no gradients and step() changes nothing.
    loss.backward()
    optimizer.step()
    print(f"{idx:>10}: {loss:.3f}")

In [None]:
# test it with different generation strategies

In [None]:
def greed_search(model, max_length: int = 16):
    """Generate token ids greedily, always taking the highest-scoring next token.

    Args:
        model: object with ``predict_one_token(ids)`` returning 1-D logits over the vocabulary.
        max_length: maximum number of ids in the result, including the leading BOS (id 1).

    Returns:
        1-D tensor of ids starting with BOS; generation stops after EOS (id 2)
        has been appended or when ``max_length`` is reached.
    """
    token_ids = torch.tensor([1])  # start from BOS
    while token_ids.size(0) < max_length:
        logits = model.predict_one_token(token_ids)
        max_token = torch.max(logits, dim=0)[1]
        token_ids = torch.concat([token_ids, max_token.reshape(1)])
        if max_token == 2:  # EOS ends the sequence
            break
    return token_ids

In [None]:
greed_search(model)

In [None]:
detokenizer = {idx: w for w, idx in train_dataset.tokenizer.items()}

In [None]:
# Decode a live greedy generation instead of ids pasted from an earlier run (those no longer
# match the current vocabulary). .tolist() yields plain ints, which is what the dict lookup needs.
[detokenizer.get(i, "") for i in greed_search(model).tolist()]

In [None]:
torch.multinomial(torch.tensor([0.1, 0.3, 0.6]), 1)

In [None]:
def top_k_sample(model, k=None, max_length: int = 16):
    """Generate token ids by sampling, optionally restricted to the k best candidates.

    Args:
        model: object with ``predict_one_token(ids)`` returning 1-D logits over the vocabulary.
        k: if given, sample only among the ``k`` highest-scoring tokens (probabilities are
            renormalised over them); ``None`` samples from the full distribution.
        max_length: maximum number of ids in the result, including the leading BOS (id 1).

    Returns:
        1-D tensor of ids starting with BOS; generation stops after EOS (id 2)
        has been appended or when ``max_length`` is reached.
    """
    token_ids = torch.tensor([1])  # start from BOS
    while token_ids.size(0) < max_length:
        logits = model.predict_one_token(token_ids)
        if k is None:
            probs = torch.softmax(logits, dim=0)
            token = torch.multinomial(probs, 1)
        else:
            # Keep the k best logits, sample among them, then map back to vocabulary ids
            top_logits, top_ids = torch.topk(logits, min(k, logits.size(0)), dim=0)
            choice = torch.multinomial(torch.softmax(top_logits, dim=0), 1)
            token = top_ids[choice]
        token_ids = torch.concat([token_ids, token])
        if token == 2:  # EOS ends the sequence
            break
    return token_ids

In [None]:
top_k_sample(model)

In [None]:
# Decode a freshly sampled generation instead of ids pasted from an earlier run (those no longer
# match the current vocabulary). Sampling is random, so this differs from the sample shown before.
[detokenizer.get(i, "") for i in top_k_sample(model).tolist()]

In [None]:
# check generation