### Libraries

In [1]:
from sklearn.metrics import classification_report
from collections import defaultdict
from itertools import islice
from tqdm import tqdm
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [16]:
# !unzip experiment.zip -d .
%load_ext tensorboard
%tensorboard --logdir ./experiment/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 4625), started 16:40:10 ago. (Use '!kill 4625' to kill it.)

### Dataset & general parameters

Будем использовать 300-мерные векторы `fasttext` без сжатия.

In [4]:
from nerus import load_nerus

docs = load_nerus('nerus_lenta.conllu.gz')
print(next(docs))

NerusDoc(id='0', sents=[NerusSent(id='0_0', text='Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сообщает РИА Новости.', tokens=[NerusToken(id='1', text='Вице-премьер', pos='NOUN', feats={'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, head_id='7', rel='nsubj', tag='O'), NerusToken(id='2', text='по', pos='ADP', feats={}, head_id='4', rel='case', tag='O'), NerusToken(id='3', text='социальным', pos='ADJ', feats={'Case': 'Dat', 'Degree': 'Pos', 'Number': 'Plur'}, head_id='4', rel='amod', tag='O'), NerusToken(id='4', text='вопросам', pos='NOUN', feats={'Animacy': 'Inan', 'Case': 'Dat', 'Gender': 'Masc', 'Number': 'Plur'}, head_id='1', rel='nmod', tag='O'), NerusToken(id='5', text='Татьяна', pos='PROPN', feats={'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing'}, head_id='1', rel='appos', tag='B-PER'), NerusToken(id='6', text='Голикова', pos='PROPN', feats={'A

In [5]:
import fasttext
import fasttext.util

# Uncomment string below to download dataset
# fasttext.util.download_model('ru', if_exists='ignore')
ft = fasttext.load_model('cc.ru.300.bin')

In [6]:
tags = {'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['<PAD>']

tokens_set = {*ft.get_words()}
tokens_set.difference_update(special_tokens)
idx_to_token = special_tokens + list(tokens_set)
token_to_idx = {k: v for v, k in enumerate(idx_to_token)}
token_to_idx = defaultdict(lambda: 0, token_to_idx)

tags.difference_update(special_tags)
idx_to_tag = special_tags + list(tags)
tag_to_idx = {k: v for v, k in enumerate(idx_to_tag)}
tag_to_idx = defaultdict(lambda: 0, tag_to_idx)

In [7]:
def data_generator():
    for doc in docs:
        for s in doc.sents:
            yield zip(*[(t.text, t.tag) for t in s.tokens])

data_gen = data_generator()

test_size = 3000
train_size = 50000
test_data = islice(data_gen, test_size)
train_data = islice(data_gen, train_size)

In [8]:
# Borrowed + simplified from 1st lab (2nd task - LSTM)
class TaggingDataset(Dataset):
    def __init__(self, data, size):
        super().__init__()

        sents_tokens, sents_tags = [], []
        for tokens, tags in tqdm(data, total=size):
            tokens_idxs = list(map(token_to_idx.__getitem__, tokens))
            tags_idxs = list(map(tag_to_idx.__getitem__, tags))
            sents_tokens.append(tokens_idxs)
            sents_tags.append(tags_idxs)

        self.sents_tokens = sents_tokens
        self.sents_tags = sents_tags

    def __getitem__(self, idx):
        return (torch.tensor(self.sents_tokens[idx]),
                torch.tensor(self.sents_tags[idx]))

    def __len__(self):
        return len(self.sents_tokens)

In [9]:
def collate_fn(batch):
    sents_tokens, sents_tags = zip(*batch)
    new_sents_tokens = torch.nn.utils.rnn.pad_sequence(sents_tokens, batch_first=True, padding_value=token_to_idx['<PAD>'])
    new_sents_tags = torch.nn.utils.rnn.pad_sequence(sents_tags, batch_first=True, padding_value=tag_to_idx['<PAD>'])
    return new_sents_tokens, new_sents_tags

test_dataset = TaggingDataset(test_data, test_size)
train_dataset = TaggingDataset(train_data, train_size)

train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

100%|██████████| 3000/3000 [00:00<00:00, 6268.61it/s]
100%|██████████| 50000/50000 [00:02<00:00, 17205.93it/s]


In [10]:
# 300 is an embedding size of words in fasttext
matrix_fasttext = [np.zeros(300)] * len(special_tokens)
for word in tqdm(idx_to_token[len(special_tokens):]):
    matrix_fasttext.append(ft.get_word_vector(word))

matrix_fasttext = np.stack(matrix_fasttext)

100%|██████████| 2000000/2000000 [00:12<00:00, 156969.54it/s]


### Model description

In [11]:
class RNNClassifier(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, matrix_fasttext, vocab_dim, output_dim, hidden_dim,
                 num_layers=3, bidirectional=False, p=0.5):
        super(RNNClassifier, self).__init__()

        embedding_dim = matrix_fasttext.shape[-1]
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_dim,
                                            embedding_dim=embedding_dim)
        self.embedding.weight.data.copy_(torch.FloatTensor(matrix_fasttext))
        for param in self.embedding.parameters():
            param.requires_grad = False

        self.encoder = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers,
                               bidirectional=bidirectional,
                               batch_first=True, dropout=p)
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, input):
        embedding = self.embedding(input)
        encoded, _ = self.encoder(embedding)
        return self.linear(encoded)

### Training code

In [12]:
class Trainer:
    def __init__(self, logdir, delimeter, dataloaders, modelcls, loss, lr, **modelargs):

        self.model = modelcls(**modelargs).to(device)
        self.loss = loss
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        self.trainloader, self.testloader = dataloaders

        self.logger = SummaryWriter(logdir)
        self.delimeter = delimeter
        self.steps = 0

    def quality_of_train(self, batch):
        tokens, tags = [element.to(device) for element in batch]
        return self.loss(self.model(tokens).transpose(1, -1), tags)

    def train_step(self, batch):
        self.model.zero_grad()
        self.model.train()

        loss = self.quality_of_train(batch)

        loss.backward()
        self.optimizer.step()
        self.steps += 1

        return loss.cpu().item()

    def train_epoch(self):
        epoch_loss = 0

        for batch in tqdm(self.trainloader, leave=False):
            local_loss = self.train_step(batch)
            self.logger.add_scalar('train_loss', local_loss, self.steps)

            if self.steps % self.delimeter == 0:
                test_loss = self.test_epoch()
                self.logger.add_scalar('test_loss', test_loss, self.steps)

            epoch_loss += local_loss

        return epoch_loss/len(self.trainloader)

    def test_epoch(self, verbose=False):
        self.model.eval()

        with torch.no_grad():
            sum_loss = 0

            pred = []
            true = []

            batch_counter = 0
            n_sent_to_print = 3 # examples in tensorboard report

            for batch in tqdm(self.testloader, leave=False):
                local_loss = self.quality_of_train(batch)
                sum_loss += local_loss

                x_batch = batch[0].to(device)
                with torch.no_grad():
                    output = self.model(x_batch)

                pred_tags = torch.argmax(output, dim=-1).cpu().numpy()
                real_tags = batch[-1].cpu().numpy()

                if batch_counter == 0:
                    for i in range(n_sent_to_print):
                        n_tokens = 0
                        for j in range(len(x_batch[i])):
                            if idx_to_token[x_batch[i][j]] != '<PAD>':
                                n_tokens += 1
                            else:
                                break

                        if verbose == True:
                            print("Sent: " + str([idx_to_token[x_batch[i][j]] for j in range(n_tokens)]))
                            print("Real tags: " + str([idx_to_tag[real_tags[i][j]] for j in range(n_tokens)]))
                            print("Pred tags: " + str([idx_to_tag[pred_tags[i][j]] for j in range(n_tokens)]))
                            print("-------------------------------------------------------------------")
                        else:
                            self.logger.add_text(f'Reports/Test/example_{i}',
                                "Sent: " + str([idx_to_token[x_batch[i][j]] for j in range(n_tokens)]) + "\n" +
                                "Real tags: " + str([idx_to_tag[real_tags[i][j]] for j in range(n_tokens)]) + "\n" +
                                "Pred tags: " + str([idx_to_tag[pred_tags[i][j]] for j in range(n_tokens)]), self.steps)

                pred.extend(pred_tags.flatten())
                true.extend(real_tags.flatten())

                batch_counter += 1

            report = classification_report(true, pred, zero_division=0, labels=range(1, len(tag_to_idx)))

            if verbose:
                print(report)
            else:
                self.logger.add_text('Reports/Test', report, self.steps)

        return sum_loss / len(self.testloader)

    def train(self, n_epochs):
        for _ in tqdm(range(n_epochs)):
            self.train_epoch()

In [13]:
config = {
    'vocab_dim' : len(idx_to_token),
    'output_dim' : len(tag_to_idx),
    'matrix_fasttext' : matrix_fasttext,
    'hidden_dim' : 256,
    'num_layers' : 3,
    'p' : 0.6
}

loss_function = torch.nn.CrossEntropyLoss(ignore_index=0) # ignore padding symbols

trainer = Trainer('experiment', 100, (train_dataloader, test_dataloader),
    RNNClassifier, loss_function, lr=1e-3, **config)

Качество модели до обучения довольно прискорбное.

In [14]:
trainer.test_epoch(verbose=True)

  9%|▊         | 8/94 [00:00<00:04, 17.65it/s]

Sent: ['Австрийские', 'правоохранительные', 'органы', 'не', 'представили', 'доказательств', 'нарушения', 'российскими', 'биатлонистами', 'антидопинговых', 'правил', '.']
Real tags: ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-------------------------------------------------------------------
Sent: ['Об', 'этом', 'сообщил', 'посол', 'России', 'в', 'Вене', 'Дмитрий', 'Любинский', 'по', 'итогам', 'встречи', 'уполномоченного', 'адвоката', 'дипмиссии', 'с', 'представителями', 'прокуратуры', 'страны', ',', 'передает', 'ТАСС', '.']
Real tags: ['O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']
Pred tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-------------------------------------------------------------------
Sent: ['«', 'Действует', 'презумпц

                                                

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      1237
           2       0.00      0.00      0.00      1264
           3       0.00      0.00      0.00      1097
           4       0.00      0.00      0.00       686
           5       0.40      1.00      0.57     48365
           6       0.00      0.00      0.00      1044
           7       0.00      0.00      0.00       150

   micro avg       0.40      0.90      0.55     53843
   macro avg       0.06      0.14      0.08     53843
weighted avg       0.36      0.90      0.51     53843



tensor(2.0178, device='cuda:0')

После обучения всё становится значительно лучше.

In [15]:
trainer.train(n_epochs=18)

  0%|          | 0/18 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  6%|▌         | 1/18 [00:20<05:47, 20.47s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 11%|█         | 2/18 [00:40<05:23, 20.23s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

### Conclusions

Основные наблюдения и выводы:
  * Качество модели после обучения довольно неплохое, хотя на примерах часто видны ошибки на простых словах.
  * Лосс на тестовой выборке начинает расти после 9-10 эпох (переобучение), так что ставить много эпох смысла нет.
  * Несмотря на отсутствие сжатия векторов с 300 до 100. обучение модели происходит довольно быстро (но забирает >20 Гб ОЗУ).
  * Процесс обучения ожидаем и понятен (см. tensorboard).