# Распознавание именованных сущностей на основе fasttext

## Предварительная работа

### Библиотеки

In [1]:
!pip install nerus
!pip install razdel




In [2]:
import fasttext
import fasttext.util
import io
import requests
import torch
import json

import matplotlib.pyplot as plt
import numpy as np

from collections             import defaultdict
from copy                    import deepcopy
from matplotlib.image        import imread
from mpl_toolkits            import mplot3d
from matplotlib              import gridspec
from nerus                   import load_nerus
from skimage.segmentation    import mark_boundaries
from sklearn.metrics         import classification_report
from sklearn.model_selection import ParameterGrid
from torch.utils             import data
from torch.utils.tensorboard import SummaryWriter
from torchvision             import datasets, transforms
from tqdm.autonotebook       import tqdm
from razdel                  import tokenize
from PIL                     import Image
from urllib.request          import urlopen


In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/numerical warnings from the
# libraries used below — narrow the filter if one of them matters.
warnings.filterwarnings("ignore")


### Установка вычислительного устройства

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # last expression of the cell: echoes the selected device


device(type='cuda')

## Загрузка предобученной модели

In [None]:
# Load the fastText model from the local file 'cc.ru.100.bin'.
# NOTE(review): the two commented-out calls below look like the steps that
# produced that file (download the Russian model, then reduce it to 100
# dimensions) — presumably the reduced model was saved under this name;
# confirm before re-enabling them.
# fasttext.util.download_model('ru', if_exists='ignore')
ft = fasttext.load_model('cc.ru.100.bin')
# fasttext.util.reduce_model(ft, 100)




### Обучение

In [None]:
def data_loader(dataset, word2idx, tag2idx, ft, batch_size=64, shuffle=True):
    """Yield padded mini-batches of word embeddings and tag indices.

    Args:
        dataset: Pair ``(sentences, tags)``; ``sentences[i]`` is a list of
            token strings and ``tags[i]`` the list of tag strings for it.
        word2idx: Mapping that must contain ``'[PAD]'``. Its value is the
            fill value for padded embedding positions and the fallback fill
            value for padded tag positions.
        tag2idx: Mapping from tag string to integer index. When it contains
            ``'[PAD]'`` that index pads the tag matrix. Tags missing from
            the mapping are encoded as ``0``.
        ft: Object exposing ``get_word_vector(word)`` and
            ``get_dimension()``.
        batch_size: Maximum number of sentences per batch; the last batch
            may be smaller.
        shuffle: If true, sentences are visited in a random order drawn
            with ``np.random.shuffle``.

    Yields:
        ``(x, y)`` where ``x`` is a float32 tensor of shape
        ``(batch, longest_sentence, ft.get_dimension())`` and ``y`` is an
        int64 tensor of shape ``(batch, longest_tag_sequence)``.
    """
    sentences, sentence_tags = dataset[0], dataset[1]

    word_pad = word2idx['[PAD]']
    # Labels live in the tag vocabulary, so pad them with the tag PAD index;
    # fall back to the word PAD index for mappings without a '[PAD]' tag.
    tag_pad = tag2idx.get('[PAD]', word_pad)
    emb_dim = ft.get_dimension()
    n_samples = len(sentences)

    # Visit order of the samples; shuffled in place when requested.
    order = np.arange(n_samples, dtype=np.int64)
    if shuffle:
        np.random.shuffle(order)

    # Slicing `order` trims the last batch automatically.
    for start in range(0, n_samples, batch_size):
        batch_ids = order[start:start + batch_size]

        batch_vectors = [[ft.get_word_vector(word) for word in sentences[i]]
                         for i in batch_ids]
        batch_tags = [[tag2idx.get(tag, 0) for tag in sentence_tags[i]]
                      for i in batch_ids]

        longest_x = max(len(vectors) for vectors in batch_vectors)
        longest_y = max(len(tags) for tags in batch_tags)

        # Allocate directly in the dtypes of the returned tensors.
        x_arr = np.full((len(batch_ids), longest_x, emb_dim), word_pad,
                        dtype=np.float32)
        y_arr = np.full((len(batch_ids), longest_y), tag_pad, dtype=np.int64)

        for row, (vectors, tags) in enumerate(zip(batch_vectors, batch_tags)):
            # An empty sentence stays fully padded; assigning an empty list
            # into a (0, emb_dim) slice would fail to broadcast.
            if vectors:
                x_arr[row, :len(vectors)] = vectors
            if tags:
                y_arr[row, :len(tags)] = tags

        yield torch.from_numpy(x_arr), torch.from_numpy(y_arr)


In [None]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Perform one optimisation step on a single batch.

    Args:
        model: Module exposing a ``device`` attribute; called on the inputs.
        batch_of_x: Input tensor for the model.
        batch_of_y: Target tensor handed to ``loss_function``.
        optimizer: Optimizer whose ``step()`` is called after backprop.
        loss_function: Callable ``(scores, targets) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    model.zero_grad()

    target_device = model.device
    inputs = batch_of_x.to(target_device)
    targets = batch_of_y.to(target_device)

    # Swap dims 1 and 2 of the model output before it reaches the loss.
    scores = model(inputs).transpose(1, 2)
    loss = loss_function(scores, targets)

    loss.backward()
    optimizer.step()

    return loss.cpu().item()


In [None]:
def train_epoch(train_generator,
                model,
                loss_function,
                optimizer,
                callback = None):
    """Train ``model`` for one pass over ``train_generator``.

    Args:
        train_generator: Iterable of ``(batch_of_x, batch_of_y)`` pairs. If
            it has a ``set_postfix`` method (e.g. a tqdm bar), the current
            batch loss is reported through it.
        model: Model passed on to ``train_on_batch``.
        loss_function: Loss passed on to ``train_on_batch``.
        optimizer: Optimizer passed on to ``train_on_batch``.
        callback: Optional callable invoked as ``callback(model, batch_loss)``
            after every batch.

    Returns:
        The mean of the batch losses weighted by batch size, or ``nan`` when
        the generator yields no samples.
    """
    # Plain iterables have no progress bar; only report when one exists.
    report_progress = getattr(train_generator, 'set_postfix', None)

    weighted_loss = 0.0
    n_seen = 0
    for batch_of_x, batch_of_y in train_generator:
        batch_loss = train_on_batch(model,
                                    batch_of_x,
                                    batch_of_y,
                                    optimizer,
                                    loss_function)

        if report_progress is not None:
            report_progress({'train batch loss': batch_loss})

        if callback is not None:
            callback(model, batch_loss)

        n_in_batch = len(batch_of_x)
        weighted_loss += batch_loss*n_in_batch
        n_seen += n_in_batch

    # Nothing was processed: there is no mean to report (and no 0/0 crash).
    if n_seen == 0:
        return float('nan')

    return weighted_loss/n_seen


In [None]:
def trainer(count_of_epoch,
            batch_size,
            model,
            dataset,
            word2idx,
            tag2idx,
            loss_function,
            optimizer,
            callback,
            embedder=None):
    """Train ``model`` for several epochs with tqdm progress bars.

    Args:
        count_of_epoch: Number of passes over ``dataset``.
        batch_size: Batch size handed to ``data_loader``.
        model: Model to optimise.
        dataset: Pair ``(sentences, tags)`` as expected by ``data_loader``.
        word2idx: Word vocabulary handed to ``data_loader``.
        tag2idx: Tag vocabulary handed to ``data_loader``.
        loss_function: Loss handed to ``train_epoch``.
        optimizer: Optimizer handed to ``train_epoch``.
        callback: Callable handed to ``train_epoch`` (may be ``None``).
        embedder: Word-vector model handed to ``data_loader``. Defaults to
            the module-level ``ft`` that this function used implicitly.
    """
    if embedder is None:
        # Backward-compatible default: the notebook-global fastText model.
        embedder = ft

    iterations = tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})

    # Ceiling division: a partial final batch still counts as one batch.
    n_samples = len(dataset[0])
    number_of_batch = (n_samples + batch_size - 1)//batch_size

    for it in iterations:
        # A fresh loader per epoch; the inner bar is removed once finished.
        generator = tqdm(
            data_loader(dataset, word2idx, tag2idx, embedder, batch_size),
            leave=False, total=number_of_batch)

        epoch_loss = train_epoch(
            train_generator = generator,
            model = model,
            loss_function = loss_function,
            optimizer = optimizer,
            callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})


## Отслеживание обучения модели

In [None]:
%load_ext tensorboard
%tensorboard --logdir experiment/


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 5163), started 0:06:37 ago. (Use '!kill 5163' to kill it.)

In [None]:
class callback():
    """Training hook that logs losses and a periodic evaluation report.

    Every call logs the training loss. Every ``delimeter``-th call also runs
    the model over ``dataset`` and logs the evaluation loss, one example
    sentence and a ``classification_report``.
    """

    def __init__(self, writer, dataset, loss_function,
                 word2idx, tag2idx, idx2tag, ft, delimeter = 100, batch_size=64):
        """Store the collaborators used during logging and evaluation.

        Args:
            writer: Object with ``add_scalar`` and ``add_text`` methods.
            dataset: Pair ``(sentences, tags)`` used for evaluation.
            loss_function: Callable ``(scores, targets) -> scalar tensor``.
            word2idx: Word vocabulary handed to ``data_loader``.
            tag2idx: Tag vocabulary; must contain ``'[PAD]'``.
            idx2tag: Inverse of ``tag2idx``.
            ft: Word-vector model handed to ``data_loader``.
            delimeter: Evaluate every this many calls (spelling kept because
                callers pass it by keyword).
            batch_size: Batch size used for evaluation.
        """
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.idx2tag = idx2tag
        self.ft = ft
        self.batch_size = batch_size

        self.dataset = dataset

    def forward(self, model, loss):
        """Log ``loss`` and, on every ``delimeter``-th step, evaluate ``model``."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        # Nothing to evaluate on (also avoids dividing by zero samples).
        if len(self.dataset[0]) == 0:
            return

        # Evaluate with dropout disabled and without building autograd
        # graphs, then restore whatever mode the model was in.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                self._evaluate(model)
        finally:
            model.train(was_training)

    def _evaluate(self, model):
        """Run ``model`` over the stored dataset and write the report."""
        pad_idx = self.tag2idx['[PAD]']

        # shuffle=False: the logged example is the same sentence every time.
        batch_generator = data_loader(dataset = self.dataset,
                                      batch_size = self.batch_size,
                                      word2idx = self.word2idx,
                                      tag2idx = self.tag2idx,
                                      ft = self.ft,
                                      shuffle = False)
        pred = []
        real = []
        test_loss = 0
        for it, (x_batch, y_batch) in enumerate(batch_generator):
            x_batch = x_batch.to(model.device)
            y_batch = y_batch.to(model.device)

            output = model(x_batch)

            pred_batch = torch.argmax(output, dim=2).cpu().numpy()
            real_batch = y_batch.cpu().numpy()

            for i, (real_row, pred_row) in enumerate(zip(real_batch, pred_batch)):
                # Sentence length = number of tags before the first PAD.
                n_tokens = 0
                for token in real_row:
                    if token == pad_idx:
                        break
                    n_tokens += 1

                pred.extend(pred_row[:n_tokens])
                real.extend(real_row[:n_tokens])

                if it == 0 and i == 0:
                    real_tags = ','.join(self.idx2tag[x] for x in real_row[:n_tokens])
                    pred_tags = ','.join(self.idx2tag[x] for x in pred_row[:n_tokens])
                    self.writer.add_text('REPORT/EXAMPLE',
                                         'Real:' + real_tags + '\n\n' +
                                         'Pred:' + pred_tags, self.step)

            output = output.transpose(1, 2)
            test_loss += self.loss_function(output, y_batch).cpu().item()*len(x_batch)

        test_loss /= len(self.dataset[0])

        self.writer.add_scalar('LOSS/test', test_loss, self.step)

        # A report needs at least one token.
        if real:
            labels = np.unique(real + pred)
            self.writer.add_text('REPORT/test',
                    str(classification_report(real, pred, zero_division=0,
                                              labels=labels,
                                              target_names=[self.idx2tag[x] for x in labels])), self.step)

    def __call__(self, model, loss):
        """Alias for :meth:`forward` so the instance can be used as a function."""
        return self.forward(model, loss)


## Модель нейросети

In [None]:
class LSTMClassifier(torch.nn.Module):
    """Multi-layer (optionally bidirectional) LSTM followed by a linear tag scorer."""

    def __init__(self,
                 tagger_dim,
                 drop_out = 0,
                 emb_dim = 10,
                 hidden_dim = 10,
                 bidirectional = False,
                 num_layers = 3):
        """Build the LSTM encoder and the per-token output layer.

        Args:
            tagger_dim: Number of output scores per token.
            drop_out: Dropout value handed to ``torch.nn.LSTM``.
            emb_dim: Size of each input vector.
            hidden_dim: Hidden size of the LSTM (per direction).
            bidirectional: Whether the LSTM reads the sequence both ways.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()

        self.num_direction = int(1 + bidirectional)
        self.drop_out = drop_out
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim

        self.lstm = torch.nn.LSTM(input_size=emb_dim,
                                  hidden_size=hidden_dim,
                                  num_layers=num_layers,
                                  batch_first=True,
                                  dropout=drop_out,
                                  bidirectional=bidirectional)

        # The LSTM output concatenates both directions when bidirectional.
        encoder_width = self.num_direction*hidden_dim
        self.linear = torch.nn.Linear(encoder_width, tagger_dim)

    def forward(self, input):
        """Return per-token tag scores for a batch-first ``input`` tensor."""
        encoded, _state = self.lstm(input)
        scores = self.linear(encoded)
        return scores

    @property
    def device(self):
        """Device of the first parameter of the module."""
        first_parameter = next(self.parameters())
        return first_parameter.device


## Выборка


### Предобработка

In [None]:
def _read_split(doc_stream, n_docs):
    """Consume ``n_docs`` documents and return per-sentence tokens and tags."""
    split_tokens, split_tags = [], []
    for _ in range(n_docs):
        document = next(doc_stream)
        for sentence in document.sents:
            pairs = [(token.text, token.tag) for token in sentence.tokens]
            split_tokens.append([text for text, _tag in pairs])
            split_tags.append([tag for _text, tag in pairs])
    return split_tokens, split_tags


# Documents are read sequentially from the corpus file: the first
# `train_size` documents form the training split, the next `test_size`
# documents the test split.
docs = load_nerus('nerus_lenta.conllu.gz')

train_size = 4200
test_size = 800

train_tokens, train_tags = _read_split(docs, train_size)
test_tokens, test_tags = _read_split(docs, test_size)

# Each dataset is a pair [sentences, tags] of equally long lists.
dataset_train = [train_tokens, train_tags]
dataset_test = [test_tokens, test_tags]


In [None]:
# Vocabularies built from the training split; index 0 is reserved for
# padding in both, index 1 of the word vocabulary for unknown words.
word2idx = {'[PAD]': 0, '[UNK]': 1}

tag2idx = {'[PAD]': 0}
idx2tag = {0: '[PAD]'}

tokens = [token for sent in train_tokens for token in sent]
tags = [tag for sent in train_tags for tag in sent]

# Iterate in sorted order: the iteration order of a set of strings changes
# between interpreter runs, which would give every run different indices.
for item in sorted(set(tokens)):
    # setdefault keeps the reserved entries intact if the corpus happens to
    # contain a token spelled '[PAD]' or '[UNK]'.
    word2idx.setdefault(item, len(word2idx))

for item in sorted(set(tags)):
    if item not in tag2idx:
        index = len(tag2idx)
        tag2idx[item] = index
        idx2tag[index] = item


## Обучение


In [None]:
# Exhaustive search over the LSTM hyper-parameters; every combination is
# trained from scratch and logged to its own TensorBoard run directory.
grid = ParameterGrid({'hidden_dim': [100, 300],
                      'num_layers': [2, 3],
                      'bidirectional': [False, True],
                      'drop_out': [0, 0.3],
                      'tagger_dim': [len(tag2idx)]})

for item in tqdm(grid):
    model = LSTMClassifier(**item, emb_dim=ft.get_dimension())

    model.to(device)

    # The run name encodes the configuration so runs can be told apart.
    name = 'experiment/hidden{}_layers{}'.format(
            item['hidden_dim'], item['num_layers'])

    if item['bidirectional']:
        name += 'bidirectional'

    if item['drop_out'] > 0:
        name += 'drop{}'.format(item['drop_out'])

    writer = SummaryWriter(log_dir = name)

    try:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        # The targets are tag indices, so the ignored index is the tag PAD.
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=tag2idx['[PAD]'])

        call = callback(writer, dataset_test, loss_function, word2idx, tag2idx, idx2tag, ft, delimeter = 10)

        trainer(count_of_epoch = 5,
                batch_size = 64,
                model = model,
                dataset = dataset_train,
                word2idx = word2idx,
                tag2idx = tag2idx,
                loss_function = loss_function,
                optimizer = optimizer,
                callback = call)
    finally:
        # Flush pending events and release the file handle of this run,
        # even when training is interrupted.
        writer.close()


  0%|          | 0/16 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

## Результаты

Наилучшие результаты показала архитектура BiLSTM с 3 слоями и скрытой размерностью 300. В качестве эмбеддинг-слоя использовалась предобученная модель fasttext с размерностью 100. Модель отлично находит все теги, чуть хуже справляется с I-LOC и I-ORG. Средневзвешенное значение F1-меры равняется 0.99, а среднее значение — 0.94.