### Libraries

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
from matplotlib import gridspec
from tqdm import tqdm
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import FashionMNIST
from torchvision import datasets, transforms
import torchvision
import torch

In [None]:
# Extract the archived experiment logs (experiment.zip) into the current directory.
!unzip experiment.zip -d .
# Load the TensorBoard notebook extension and point it at the ./experiment/ log directory.
%load_ext tensorboard
%tensorboard --logdir ./experiment/

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device  # notebook cell output: show which device was selected

### Dataset & general parameters

In [None]:
# Uncomment the line below to download the dataset
# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

In [None]:
from nerus import load_nerus

# Stream documents from the Nerus corpus archive.
docs = load_nerus('nerus_lenta.conllu.gz')

# Service entries that are placed at the beginning of the vocabularies.
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['<PAD>']

# Parallel lists: sentences[i] holds the token texts of a sentence and
# tags[i] holds the part-of-speech tag of each of those tokens.
sentences = []
tags = []

cnt = 0
n_docs_max = 5000

for doc in tqdm(docs):
    cnt += 1
    for sent in doc.sents:
        # Single pass over the tokens so that text and tag stay aligned.
        pairs = [(word.text, word.pos) for word in sent.tokens]
        sentences.append([text for text, _ in pairs])
        tags.append([pos for _, pos in pairs])

    # Stop after exactly n_docs_max documents (the previous `>` comparison
    # let one extra document through).
    if cnt >= n_docs_max:
        break

In [None]:
# Every distinct token in the corpus, excluding the service tokens which are
# prepended explicitly so that they always get the lowest indices.
set_tokens = {word for sent in sentences for word in sent}
set_tokens.difference_update(special_tokens)
# Sort the vocabulary: iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would silently reshuffle the
# indices and make saved models / logs incomparable across runs.
list_tokens = special_tokens + sorted(set_tokens)

# Same procedure for the tag inventory.
set_tags = {tag for t in tags for tag in t}
set_tags.difference_update(special_tags)
list_tags = special_tags + sorted(set_tags)

# Item -> integer index lookups. Values are plain Python ints (not NumPy
# scalars) so they can be passed directly as padding values / ignore_index.
token_to_idx = {token: idx for idx, token in enumerate(list_tokens)}
tag_to_idx = {tag: idx for idx, tag in enumerate(list_tags)}

In [None]:
# Position separating the first 90% of the sentences (train) from the
# remaining 10% (test); the split is positional, no shuffling is done.
train_test_boundary = int(0.9 * len(sentences))

train_sentences, test_sentences = (
    sentences[:train_test_boundary],
    sentences[train_test_boundary:],
)
train_tags, test_tags = (
    tags[:train_test_boundary],
    tags[train_test_boundary:],
)

print(len(train_sentences), len(test_sentences))

In [None]:
class TaggingDataset(torch.utils.data.Dataset):
    """Map-style dataset of index-encoded sentences and their tag sequences.

    Args:
        sentences: sequence of sentences, each a sequence of token strings.
        tags: sequence of tag sequences, parallel to ``sentences``.
        token_to_idx: mapping from token string to integer index. Tokens
            missing from the mapping are encoded with the index of
            ``'<UNK>'`` (or ``0`` when the mapping has no ``'<UNK>'`` entry).
        tag_to_idx: mapping from tag to integer index.

    Raises:
        KeyError: if a tag is not present in ``tag_to_idx``.
    """

    def __init__(self, sentences, tags, token_to_idx, tag_to_idx):
        super().__init__()

        self.sentences = sentences
        self.tags = tags
        self.token_to_idx = token_to_idx
        self.tag_to_idx = tag_to_idx

        # Resolve the out-of-vocabulary index from the mapping itself instead
        # of assuming that '<UNK>' happens to sit at position 0.
        unk_idx = token_to_idx.get('<UNK>', 0)

        # Encode everything once up front so __getitem__ stays cheap.
        self.sent_index = [
            [token_to_idx.get(token, unk_idx) for token in sent]
            for sent in sentences
        ]
        self.tags_index = [
            [tag_to_idx[tag] for tag in sent_tags]
            for sent_tags in tags
        ]

    def __getitem__(self, idx):
        """Return ``(token_indices, tag_indices)`` as 1-D integer tensors."""
        # Explicit dtype keeps empty sentences as integer tensors too
        # (torch.tensor([]) would otherwise default to float).
        return (
            torch.tensor(self.sent_index[idx], dtype=torch.long),
            torch.tensor(self.tags_index[idx], dtype=torch.long),
        )

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sent_index)

In [None]:
# Both splits are encoded with the same token and tag vocabularies.
train_dataset = TaggingDataset(
    sentences=train_sentences,
    tags=train_tags,
    token_to_idx=token_to_idx,
    tag_to_idx=tag_to_idx,
)
test_dataset = TaggingDataset(
    sentences=test_sentences,
    tags=test_tags,
    token_to_idx=token_to_idx,
    tag_to_idx=tag_to_idx,
)

In [None]:
class PaddingCollator:
    """Collate function that right-pads a batch of (tokens, tags) tensor pairs.

    Every pair is padded to the length of the longest token tensor in the
    batch; tokens are filled with ``pad_token_id`` and tags with
    ``pad_tag_id``. The padded rows are stacked into two 2-D tensors.
    """

    def __init__(self, pad_token_id, pad_tag_id):
        self.pad_token_idx = pad_token_id
        self.pad_tag_id = pad_tag_id

    def __call__(self, batch):
        """Return ``(padded_tokens, padded_tags)`` for the given batch."""
        pad = torch.nn.functional.pad

        # Target width is dictated by the token tensors only.
        longest = max((len(item[0]) for item in batch), default=0)

        padded_tokens = []
        padded_tags = []
        for item in batch:
            token_ids, tag_ids = item[0], item[1]
            padded_tokens.append(
                pad(token_ids, (0, longest - len(token_ids)), "constant", self.pad_token_idx)
            )
            padded_tags.append(
                pad(tag_ids, (0, longest - len(tag_ids)), "constant", self.pad_tag_id)
            )

        return torch.stack(padded_tokens), torch.stack(padded_tags)

In [None]:
# The train and test loaders are configured identically: batches of 64 in
# dataset order, the last partial batch is kept, and each loader gets its own
# collator that pads with the '<PAD>' token / tag indices.
train_dataloader, test_dataloader = (
    torch.utils.data.DataLoader(
        split,
        batch_size=64,
        shuffle=False,
        drop_last=False,
        collate_fn=PaddingCollator(
            pad_token_id=token_to_idx['<PAD>'],
            pad_tag_id=tag_to_idx['<PAD>'],
        ),
    )
    for split in (train_dataset, test_dataset)
)

### General training code

In [None]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Run a single optimization step on one batch and return its loss.

    Args:
        model: module being trained; it is switched to training mode.
        x_batch: input tensor passed to ``model``.
        y_batch: target tensor passed to ``loss_function``.
        optimizer: optimizer instance whose ``step()`` updates the model.
        loss_function: callable ``(output, target) -> scalar loss tensor``.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    model.zero_grad()

    # Move the batch to wherever the model lives rather than relying on a
    # module-level `device` global; a parameter-free model leaves the batch
    # on its current device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        x_batch = x_batch.to(first_param.device)
        y_batch = y_batch.to(first_param.device)

    output = model(x_batch)
    # Swap dims 1 and 2 so the class scores end up in dim 1 for the loss.
    # NOTE(review): assumes the model returns (batch, seq, n_tags) - confirm
    # for models other than the tagger in this notebook.
    output = torch.transpose(output, 1, 2)
    loss = loss_function(output, y_batch)

    loss.backward()
    optimizer.step()

    return loss.cpu().item()

In [None]:
def train_epoch(train_generator, model, loss_function, optimizer, callback):
    """Train ``model`` for one pass over ``train_generator``.

    Each batch is moved to the global ``device`` and handed to
    ``train_on_batch``. When ``callback`` is not None it is invoked after
    every batch as ``callback(model, batch_loss)``.

    Returns:
        The mean of the batch losses, weighted by the number of rows per batch.
    """
    weighted_loss_sum = 0
    n_samples = 0

    for inputs, targets in train_generator:
        loss_value = train_on_batch(
            model,
            inputs.to(device),
            targets.to(device),
            optimizer,
            loss_function,
        )

        if callback is not None:
            callback(model, loss_value)

        # Weight each batch by its size so a short final batch does not skew
        # the epoch average.
        batch_size = len(inputs)
        weighted_loss_sum += loss_value * batch_size
        n_samples += batch_size

    return weighted_loss_sum / n_samples

In [None]:
def trainer(count_of_epoch,
            model,
            dataset_loader,
            loss_function,
            optimizer,
            lr=0.001,
            callback=None,
            weight_decay=1e-5):
    """Train ``model`` for ``count_of_epoch`` epochs with a progress bar.

    Args:
        count_of_epoch: number of passes over ``dataset_loader``.
        model: module to train.
        dataset_loader: iterable of ``(inputs, targets)`` batches.
        loss_function: callable ``(output, target) -> scalar loss tensor``.
        optimizer: optimizer factory (e.g. an optimizer class), called as
            ``optimizer(model.parameters(), lr=..., weight_decay=...)``.
        lr: learning rate passed to the optimizer factory.
        callback: optional callable invoked after each batch as
            ``callback(model, batch_loss)``.
        weight_decay: weight decay passed to the optimizer factory; defaults
            to the previously hard-coded ``1e-5``.
    """
    optima = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)

    iterations = tqdm(range(count_of_epoch))

    for _ in iterations:
        epoch_loss = train_epoch(
            train_generator=dataset_loader,
            model=model,
            loss_function=loss_function,
            optimizer=optima,
            callback=callback)

        # Show the latest epoch loss next to the progress bar.
        iterations.set_postfix({'train epoch loss': epoch_loss})

In [None]:
def quality_of_train(dataset_loader,
                     model,
                     loss_function):
    """Evaluate ``model`` on every batch of ``dataset_loader``.

    The model is put into evaluation mode and run without gradient tracking;
    its previous train/eval mode is restored before returning.

    Args:
        dataset_loader: iterable of ``(sentences, tags)`` tensor batches.
        model: module to evaluate.
        loss_function: callable ``(output, target) -> scalar loss tensor``.

    Returns:
        ``(test_loss, pred, real)`` where ``test_loss`` is the batch-size
        weighted mean loss (``nan`` when the loader yields no batches),
        ``pred`` is the flat list of predicted tag indices and ``real`` the
        flat list of target tag indices (padding positions included), both
        with one entry per position.
    """
    pred = []
    real = []
    test_loss = 0
    total = 0

    # Evaluate on the device the model lives on instead of a module global.
    first_param = next(model.parameters(), None)

    was_training = model.training
    model.eval()  # dropout and similar layers must be inactive while scoring
    try:
        with torch.no_grad():  # no autograd graph is needed for evaluation
            for sentences, tags in dataset_loader:
                if first_param is not None:
                    sentences = sentences.to(first_param.device)
                    tags = tags.to(first_param.device)

                output = model(sentences)
                # Swap dims 1 and 2 so the class scores sit in dim 1.
                output = torch.transpose(output, 1, 2)

                test_loss += loss_function(output, tags).cpu().item() * len(sentences)
                total += len(sentences)

                # After the transpose the tag scores are in dim 1, so the
                # prediction is the argmax over dim 1 (dim 2 is the position
                # in the sentence).
                pred.extend(torch.argmax(output, dim=1).cpu().reshape(-1).tolist())
                real.extend(tags.cpu().reshape(-1).tolist())
    finally:
        # Give the model back in the mode the caller had it in.
        model.train(was_training)

    # An empty loader has no defined loss; report NaN instead of crashing.
    test_loss = test_loss / total if total else float('nan')

    return test_loss, pred, real

### LSTM model

In [None]:
class LSTMTagger(torch.nn.Module):
    """Sequence tagger: embedding -> single-layer LSTM -> dropout -> linear.

    Args:
        embedding_dim: size of the token embeddings.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output scores per position.
        dropout: dropout probability applied to the LSTM outputs before the
            final linear layer (active in training mode only).
    """

    @property
    def device(self):
        """Device of the model's first parameter."""
        return next(self.parameters()).device

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout=0.0):
        super().__init__()

        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM's own `dropout` argument only acts between stacked layers,
        # so with a single layer it is a no-op (and emits a warning). Use an
        # explicit Dropout module so the hyper-parameter actually has effect.
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, x_batch):
        """Return per-position tag scores for a batch of token indices.

        With ``batch_first=True`` an input of shape ``(batch, seq)`` yields an
        output of shape ``(batch, seq, tagset_size)``.
        """
        embeddings = self.word_embeddings(x_batch)

        # Only the per-position outputs are needed; the final hidden and cell
        # states are discarded.
        lstm_out, _ = self.lstm(embeddings)
        return self.linear(self.dropout(lstm_out))

### Tensorboard training tracking

In [None]:
class callback():
    """Per-batch training hook that logs losses to a summary writer.

    Every call logs the given train loss under ``'LOSS/train'``. On every
    ``delimeter``-th call the model is additionally evaluated on
    ``dataset_loader`` via ``quality_of_train`` and the resulting loss is
    logged under ``'LOSS/test'``.
    """

    def __init__(self, writer, dataset_loader, loss_function, delimeter=100):
        self.writer = writer
        self.dataset_loader = dataset_loader
        self.loss_function = loss_function
        self.delimeter = delimeter
        # Number of batches seen so far; used as the x-axis for the scalars.
        self.step = 0

    def forward(self, model, loss):
        """Log ``loss`` for the current step and periodically log test loss."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        # Evaluation is only run on every `delimeter`-th step.
        if self.step % self.delimeter != 0:
            return

        test_loss, _, _ = quality_of_train(
            dataset_loader=self.dataset_loader,
            model=model,
            loss_function=self.loss_function,
        )
        self.writer.add_scalar('LOSS/test', test_loss, self.step)

    def __call__(self, model, loss):
        """Make the instance callable; delegates to ``forward``."""
        return self.forward(model, loss)

### LSTM training

In [None]:
# Positions whose target is the '<PAD>' tag index are ignored by the loss.
loss_function = torch.nn.CrossEntropyLoss(
    ignore_index=tag_to_idx['<PAD>'],
)
# The optimizer class itself (not an instance): `trainer` instantiates it
# with the model's parameters.
optimizer = torch.optim.Adam

In [None]:
# Hyper-parameter grid: every combination of the values below is trained.
grid = ParameterGrid({
    'embedding_dim': [150, 290, 430, 600],
    'hidden_dim': [150, 290, 430, 600],
    'dropout': [0.0, 0.18, 0.36, 0.54],
})

scores = dict()

for item in tqdm(grid):
    print(str(item))

    model = LSTMTagger(
        embedding_dim=item['embedding_dim'],
        hidden_dim=item['hidden_dim'],
        vocab_size=len(token_to_idx),
        tagset_size=len(tag_to_idx),
        dropout=item['dropout']
    )

    # One TensorBoard run directory per configuration, named after its
    # parameters.
    writer = SummaryWriter('experiment/' + str(item))

    try:
        model.float().to(device)

        # Log the train loss every batch and the test loss every 10 batches.
        call = callback(writer, test_dataloader, loss_function, delimeter=10)

        trainer(count_of_epoch=2,
            dataset_loader=train_dataloader,
            model=model,
            loss_function=loss_function,
            optimizer=optimizer,
            lr=0.001,
            callback=call)
    finally:
        # Flush pending events and release the event file; otherwise one open
        # writer per configuration is leaked over the whole grid.
        writer.close()