# Анализ модели LSTM

## Предварительная работа

### Библиотеки

In [1]:
!pip install nerus




In [1]:
import io
import requests
import torch

import matplotlib.pyplot as plt
import numpy as np

from copy                    import deepcopy
from matplotlib.image        import imread
from mpl_toolkits            import mplot3d
from matplotlib              import gridspec
from nerus                   import load_nerus
from skimage.segmentation    import mark_boundaries
from sklearn.metrics         import classification_report
from sklearn.model_selection import ParameterGrid
from torch.utils             import data
from torch.utils.tensorboard import SummaryWriter
from torchvision             import datasets, transforms
from tqdm.autonotebook       import tqdm
from PIL                     import Image
from urllib.request          import urlopen


  from tqdm.autonotebook       import tqdm


In [3]:
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action="ignore", category=Warning)


### Установка вычислительного устройства

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device


device(type='cuda')

### Обучение

In [5]:
def _pad_rows(rows, pad_value):
    """Right-pad integer id lists to a common width and return a LongTensor."""
    width = max(len(row) for row in rows)
    # Build the array as int64 from the start instead of converting floats.
    padded = np.full((len(rows), width), pad_value, dtype=np.int64)
    for i, row in enumerate(rows):
        padded[i, :len(row)] = row
    return torch.from_numpy(padded)


def data_loader(dataset, word2idx, tag2idx, batch_size=64, shuffle=True):
    """Yield padded ``(x, y)`` id batches built from tokenised sentences.

    Every sentence is wrapped in ``<START>`` / ``<FINISH>`` markers, mapped to
    integer ids and right-padded to the longest sentence of its batch.

    Args:
        dataset: Pair ``(sentences, tag_sequences)``; ``dataset[0][i]`` is a
            sequence of tokens and ``dataset[1][i]`` the matching sequence
            of tags.
        word2idx: Token -> id mapping. Must contain ``'<PAD>'``. Unknown
            tokens map to id 1; a missing ``<START>``/``<FINISH>`` maps to 0.
        tag2idx: Tag -> id mapping with the same fallbacks. Its ``'<PAD>'``
            id pads the targets; if absent, the word PAD id is used.
        batch_size: Maximum number of sentences per batch (the last batch
            may be smaller).
        shuffle: Visit the sentences in random order, drawn from NumPy's
            global random state.

    Yields:
        ``(x, y)``: two ``torch.LongTensor``s. ``x`` has shape
        ``(sentences_in_batch, longest_token_row)`` and ``y`` has shape
        ``(sentences_in_batch, longest_tag_row)``.
    """
    sentences, tag_sequences = dataset[0], dataset[1]

    pad_x = word2idx['<PAD>']
    # Targets hold tag ids, so they are padded with the tag vocabulary's PAD
    # id rather than the word vocabulary's one.
    pad_y = tag2idx.get('<PAD>', pad_x)

    n_samples = len(sentences)

    # Visiting order of the samples, optionally shuffled.
    order = np.arange(n_samples, dtype=np.int64)
    if shuffle:
        np.random.shuffle(order)

    # Marker ids do not change between batches, so look them up once.
    start_x = word2idx.get('<START>', 0)
    finish_x = word2idx.get('<FINISH>', 0)
    start_y = tag2idx.get('<START>', 0)
    finish_y = tag2idx.get('<FINISH>', 0)

    # Slicing clips at the end, so the last batch is shortened automatically.
    for begin in range(0, n_samples, batch_size):
        batch_ids = order[begin:begin + batch_size]

        x_rows = [
            [start_x]
            + [word2idx.get(word, 1) for word in sentences[i]]
            + [finish_x]
            for i in batch_ids
        ]
        y_rows = [
            [start_y]
            + [tag2idx.get(tag, 1) for tag in tag_sequences[i]]
            + [finish_y]
            for i in batch_ids
        ]

        yield _pad_rows(x_rows, pad_x), _pad_rows(y_rows, pad_y)


In [6]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Perform one optimisation step on a single batch.

    Args:
        model: Module exposing a ``device`` attribute; called on the inputs.
        batch_of_x: Input tensor, moved to ``model.device`` before the call.
        batch_of_y: Target tensor, moved to ``model.device`` before the loss.
        optimizer: Optimizer whose ``step()`` applies the update.
        loss_function: Callable ``(prediction, target) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    # Switch to training mode and drop gradients left from the previous step.
    model.train()
    model.zero_grad()

    inputs = batch_of_x.to(model.device)
    # Swap axes 1 and 2 of the model output before handing it to the loss
    # (presumably to put the class axis second, as CrossEntropyLoss expects).
    prediction = model(inputs).transpose(1, 2)

    targets = batch_of_y.to(model.device)
    batch_loss = loss_function(prediction, targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()


In [7]:
def train_epoch(train_generator,
                model,
                loss_function,
                optimizer,
                callback = None):
    """Train the model for one pass over ``train_generator``.

    Args:
        train_generator: Iterable of ``(batch_of_x, batch_of_y)`` pairs. If
            it has a ``set_postfix`` method (e.g. a tqdm bar), the current
            batch loss is reported through it.
        model: Model forwarded to ``train_on_batch``.
        loss_function: Loss forwarded to ``train_on_batch``.
        optimizer: Optimizer forwarded to ``train_on_batch``.
        callback: Optional callable invoked as ``callback(model, batch_loss)``
            after every batch.

    Returns:
        The batch losses averaged with batch sizes as weights, or NaN when
        the generator yields no batches.
    """
    # Progress reporting is optional so that plain iterables work as well.
    set_postfix = getattr(train_generator, 'set_postfix', None)

    epoch_loss = 0
    total = 0
    for batch_of_x, batch_of_y in train_generator:
        batch_loss = train_on_batch(model,
                                    batch_of_x,
                                    batch_of_y,
                                    optimizer,
                                    loss_function)

        if set_postfix is not None:
            set_postfix({'train batch loss': batch_loss})

        if callback is not None:
            callback(model, batch_loss)

        # Weight each batch by its size: the last batch may be smaller.
        n_in_batch = len(batch_of_x)
        epoch_loss += batch_loss*n_in_batch
        total += n_in_batch

    if total == 0:
        # No batches were seen: report "no loss" instead of dividing by zero.
        return float('nan')

    return epoch_loss/total


In [8]:
def trainer(count_of_epoch,
            batch_size,
            model,
            dataset,
            word2idx,
            tag2idx,
            loss_function,
            optimizer,
            callback):
    """Run ``count_of_epoch`` training epochs with tqdm progress bars.

    A fresh ``data_loader`` is created for every epoch and handed, wrapped
    in a tqdm bar, to ``train_epoch``. The returned epoch loss is shown in
    the postfix of the outer (epoch) bar.
    """
    epoch_bar = tqdm(range(count_of_epoch), desc='epoch')
    epoch_bar.set_postfix({'train epoch loss': np.nan})

    # Batches per epoch: full batches plus one for a non-empty remainder.
    full_batches, remainder = divmod(len(dataset[0]), batch_size)
    batches_per_epoch = full_batches + (1 if remainder > 0 else 0)

    for _ in epoch_bar:
        batches = data_loader(dataset, word2idx, tag2idx, batch_size)
        batch_bar = tqdm(batches, leave=False, total=batches_per_epoch)

        mean_loss = train_epoch(train_generator=batch_bar,
                                model=model,
                                loss_function=loss_function,
                                optimizer=optimizer,
                                callback=callback)

        epoch_bar.set_postfix({'train epoch loss': mean_loss})


## Отслеживание обучения модели

In [3]:
%load_ext tensorboard
%tensorboard --logdir experiment/


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 15463), started 0:00:22 ago. (Use '!kill 15463' to kill it.)

In [10]:
class callback():
    """Per-batch training hook that logs metrics to a TensorBoard writer.

    The training loss is logged on every call. Every ``delimeter`` calls the
    model is additionally evaluated on ``dataset`` and the evaluation loss
    plus a text classification report are logged.
    """

    def __init__(self, writer, dataset, loss_function,
                 word2idx, tag2idx, delimeter = 100, batch_size=64):
        """Store the logging target and everything needed for evaluation.

        Args:
            writer: Object with ``add_scalar`` and ``add_text`` methods.
            dataset: Evaluation data in the format ``data_loader`` accepts.
            loss_function: Callable ``(prediction, target) -> scalar tensor``.
            word2idx: Token -> id mapping passed to ``data_loader``.
            tag2idx: Tag -> id mapping passed to ``data_loader``.
            delimeter: Evaluate once per this many calls.
            batch_size: Batch size used during evaluation.
        """
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.batch_size = batch_size

        self.dataset = dataset

    def forward(self, model, loss):
        """Log ``loss`` and, on every ``delimeter``-th call, evaluate ``model``."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        n_samples = len(self.dataset[0])
        if n_samples == 0:
            # Nothing to evaluate; also avoids dividing by zero below.
            return

        # The evaluation order does not need to be random, and shuffle=False
        # leaves NumPy's global random state untouched.
        batch_generator = data_loader(dataset = self.dataset,
                                      batch_size = self.batch_size,
                                      word2idx = self.word2idx,
                                      tag2idx = self.tag2idx,
                                      shuffle = False)
        pred = []
        real = []
        test_loss = 0

        # Evaluate in eval mode and without autograd: dropout is disabled,
        # BatchNorm statistics are not updated from evaluation data and no
        # graph is kept. The previous mode is restored afterwards.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                for x_batch, y_batch in batch_generator:
                    x_batch = x_batch.to(model.device)
                    y_batch = y_batch.to(model.device)

                    output = model(x_batch)

                    # NOTE: every position of the batch, padding included,
                    # goes into the report.
                    pred.extend(torch.argmax(output, dim=2).cpu().numpy().flatten().tolist())
                    real.extend(y_batch.cpu().numpy().flatten().tolist())

                    output = output.transpose(1, 2)
                    # Weight the batch loss by the number of sentences.
                    test_loss += self.loss_function(output, y_batch).cpu().item()*len(x_batch)
        finally:
            model.train(was_training)

        test_loss /= n_samples

        self.writer.add_scalar('LOSS/test', test_loss, self.step)

        self.writer.add_text('REPORT/test', str(classification_report(real, pred)), self.step)

    def __call__(self, model, loss):
        """Make the instance usable as ``callback(model, loss)``."""
        return self.forward(model, loss)


## Модель нейросети

In [11]:
class LSTMTagger(torch.nn.Module):
    """Token tagger: embedding -> optional batch norm -> stacked LSTM -> linear.

    The forward pass maps a batch of token ids to one score vector of size
    ``tagger_dim`` per token.
    """

    def __init__(self,
                 vocab_dim,
                 tagger_dim,
                 is_batch_norm = False,
                 drop_out = 0,
                 emb_dim = 10,
                 hidden_dim = 10,
                 num_layers = 3):
        """Create the layers.

        Args:
            vocab_dim: Number of rows of the embedding table.
            tagger_dim: Size of the output score vector per token.
            is_batch_norm: Apply BatchNorm1d to the embeddings in ``forward``.
            drop_out: Dropout value passed to the LSTM.
            emb_dim: Embedding size (LSTM input size).
            hidden_dim: LSTM hidden size.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()

        # Hyper-parameters kept for later inspection.
        self.drop_out = drop_out
        self.is_batch_norm = is_batch_norm
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_dim,
                                            embedding_dim=emb_dim)

        # Always created, but only applied when is_batch_norm is set.
        self.batch_norm_inp = torch.nn.BatchNorm1d(num_features=emb_dim)

        self.lstm = torch.nn.LSTM(input_size=emb_dim,
                                  hidden_size=hidden_dim,
                                  num_layers=num_layers,
                                  dropout=drop_out,
                                  batch_first=True)

        self.linear = torch.nn.Linear(in_features=hidden_dim,
                                      out_features=tagger_dim)

    @property
    def device(self):
        """Device of the first parameter of the module."""
        first_parameter = next(self.parameters())
        return first_parameter.device

    def forward(self, input):
        """Return per-token scores for a batch of token ids."""
        features = self.embedding(input)

        if self.is_batch_norm:
            # BatchNorm1d works on axis 1, so move the embedding axis there
            # for the normalisation and back again for the LSTM.
            features = self.batch_norm_inp(features.transpose(1, 2))
            features = features.transpose(1, 2)

        hidden_states, _ = self.lstm(features)

        return self.linear(hidden_states)


## Выборка

In [12]:
# Stream documents from the Nerus corpus; the first `train_size` documents
# form the training part and the next `test_size` documents the test part.
docs = load_nerus('nerus_lenta.conllu.gz')

train_size = 4200
test_size = 800

train_tokens, train_tags = [], []
test_tokens, test_tags = [], []

# Each sentence becomes one list of token texts and one parallel list of tags.
for n_docs, token_store, tag_store in ((train_size, train_tokens, train_tags),
                                       (test_size, test_tokens, test_tags)):
    for _ in range(n_docs):
        doc = next(docs)
        for sent in doc.sents:
            sentence_tokens = []
            sentence_tags = []
            for nerus_token in sent.tokens:
                sentence_tokens.append(nerus_token.text)
                sentence_tags.append(nerus_token.tag)
            token_store.append(sentence_tokens)
            tag_store.append(sentence_tags)

dataset_train = [train_tokens, train_tags]
dataset_test = [test_tokens, test_tags]


In [13]:
# The special symbols occupy the first ids of both vocabularies.
word2idx = {'<PAD>': 0, '<UNK>': 1, '<START>': 2, '<FINISH>': 3}
tag2idx = {'<PAD>': 0, '<UNK>': 1, '<START>': 2, '<FINISH>': 3}

tokens = [token for sent in train_tokens for token in sent]
tags = [tag for sent in train_tags for tag in sent]

# Assign ids in sorted order: iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which would make the ids,
# and any model saved with them, irreproducible.
# setdefault keeps the id of a special symbol if the same string happens to
# occur in the data.
for item in sorted(set(tokens)):
    word2idx.setdefault(item, len(word2idx))

for item in sorted(set(tags)):
    tag2idx.setdefault(item, len(tag2idx))

# Reverse lookups, derived from the forward mappings so they cannot diverge.
idx2word = {index: word for word, index in word2idx.items()}
idx2tag = {index: tag for tag, index in tag2idx.items()}


## Обучение

In [14]:
# Hyper-parameter grid; vocabulary and tag-set sizes are fixed by the data.
grid = ParameterGrid({'hidden_dim': [100, 200],
                      'emb_dim': [100, 200],
                      'num_layers': [2, 3],
                      'is_batch_norm': [False, True],
                      'drop_out': [0, 0.5],
                      'vocab_dim': [len(word2idx)],
                      'tagger_dim': [len(tag2idx)]})

for item in tqdm(grid):
    model = LSTMTagger(**item)

    model.to(device)

    # One TensorBoard run directory per configuration.
    name = 'experiment/hidden{}_emb{}_layers{}_vocab{}'.format(
            item['hidden_dim'], item['emb_dim'], item['num_layers'], item['vocab_dim'])

    if item['is_batch_norm']:
        name += '_norm'

    if item['drop_out'] > 0:
        name += 'drop{}'.format(item['drop_out'])

    writer = SummaryWriter(log_dir = name)
    try:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        # The targets are tag ids, so the ignored index is the tag PAD id.
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=tag2idx['<PAD>'])

        call = callback(writer, dataset_test, loss_function, word2idx, tag2idx, delimeter = 10)

        trainer(count_of_epoch = 5,
                batch_size = 64,
                model = model,
                dataset = dataset_train,
                word2idx = word2idx,
                tag2idx = tag2idx,
                loss_function = loss_function,
                optimizer = optimizer,
                callback = call)
    finally:
        # Flush pending events and release the event file even if the
        # training of this configuration fails.
        writer.close()


  0%|          | 0/32 [00:00<?, ?it/s]

## Результаты

В данной работе была изучена работа RNN-сетей на примере LSTM и датасета Nerus.

Размер словаря очень сильно определяет обучающую способность модели, поэтому в экспериментах он не менялся: результат такого изменения был предсказуем.

Наихудшие результаты показали следующие модели: hidden100_emb100_layers2_vocab91394, hidden200_emb200_layers2_vocab91394, hidden100_emb200_layers2_vocab91394. Заметно, что эти модели стали переобучаться.

Лучший результат показала следующая модель: hidden200_emb200_layers2_vocab91394_norm.

Наличие нормировки сказалось не только на быстроте сходимости модели, но и на отсутствии переобучения. Выбрасывание нейронов (dropout) также приводило к улучшению результата. Для лучшего результата размерность скрытых слоёв должна быть не больше размерности embedding. Количество слоёв LSTM почти не отражалось на результате.