# Анализ модели автокодировщика

## Предварительная работа

### Библиотеки

In [6]:
!pip install nltk




In [7]:
import io
import requests
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from copy                    import deepcopy
from matplotlib.image        import imread
from mpl_toolkits            import mplot3d
from matplotlib              import gridspec
from nerus                   import load_nerus
from nltk.tokenize           import RegexpTokenizer
from skimage.segmentation    import mark_boundaries
from sklearn.metrics         import classification_report
from sklearn.model_selection import ParameterGrid
from torch.utils             import data
from torch.utils.tensorboard import SummaryWriter
from torchvision             import datasets, transforms
from tqdm.autonotebook       import tqdm
from PIL                     import Image
from urllib.request          import urlopen


In [8]:
import warnings
warnings.filterwarnings("ignore")


### Установка вычислительного устройства

In [9]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device


device(type='cuda')

## Обучение

In [10]:
def data_loader(dataset, word2idx, batch_size=64, shuffle=True):
    """Yield padded id batches built from a collection of sentences.

    Each sentence is wrapped as ``[CLS] ... [SEP]``, its items are looked up
    in ``word2idx`` (items missing from the mapping become ``[UNK]``) and
    every batch is right-padded with ``[PAD]`` to the longest sentence of
    that batch.

    Args:
        dataset: indexable, sized collection of sentences; every sentence is
            an iterable whose items are used as ``word2idx`` keys.
        word2idx: mapping from token to id. ``'[PAD]'`` and ``'[UNK]'`` are
            required; ``'[CLS]'`` and ``'[SEP]'`` default to 0 when absent.
        batch_size: number of sentences per batch (the last batch may be
            smaller).
        shuffle: when true, the sentence order is shuffled with the global
            NumPy RNG before batching.

    Yields:
        ``(x, x)``: the same ``torch.LongTensor`` twice (input and
        reconstruction target), one row per sentence of the batch.
    """
    pad_id = word2idx['[PAD]']
    n_samples = len(dataset)

    # Visit order of the samples; shuffled in place when requested.
    order = np.arange(n_samples, dtype=np.int64)
    if shuffle:
        np.random.shuffle(order)
    sentences = [dataset[position] for position in order]

    # Ceiling division: a trailing partial batch still counts as a batch.
    n_batches = -(-n_samples // batch_size)

    cls_id = word2idx.get('[CLS]', 0)
    sep_id = word2idx.get('[SEP]', 0)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        # Slicing clamps at the end of the list, which trims the last batch.
        chunk = sentences[start:start + batch_size]

        encoded = []
        for sentence in chunk:
            ids = [cls_id]
            ids.extend(word2idx.get(token, word2idx['[UNK]']) for token in sentence)
            ids.append(sep_id)
            encoded.append(ids)

        widest = max(len(ids) for ids in encoded)

        # Float buffer pre-filled with the padding id; converted to int64 below.
        padded = np.full((len(chunk), widest), pad_id, dtype=np.float64)
        for row, ids in enumerate(encoded):
            padded[row, :len(ids)] = ids

        x = torch.LongTensor(padded)

        yield x, x


In [11]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Run a single optimisation step and return the batch loss.

    Args:
        model: module exposing a ``device`` attribute; it is switched to
            training mode before the forward pass.
        batch_of_x: input tensor fed to the model.
        batch_of_y: target tensor the model output is scored against.
        optimizer: optimizer whose gradients are reset and which is stepped.
        loss_function: callable ``loss_function(output, target)`` returning
            a scalar tensor.

    Returns:
        The loss of this batch as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    # Transfer each tensor to the model's device exactly once.
    inputs = batch_of_x.to(model.device)
    targets = batch_of_y.to(model.device)

    output = model(inputs)

    # Score against the supplied targets. Previously batch_of_y was ignored
    # and the input was reused as the target, which only worked because the
    # loader yields identical x and y.
    loss = loss_function(output, targets)
    loss.backward()

    optimizer.step()
    return loss.cpu().item()


In [12]:
def train_epoch(train_generator,
                model,
                loss_function,
                optimizer,
                callback = None):
    """Train on every batch of the generator and return the mean loss.

    Args:
        train_generator: iterable of ``(batch_of_x, batch_of_y)`` pairs.
        model: model passed on to ``train_on_batch``.
        loss_function: loss passed on to ``train_on_batch``.
        optimizer: optimizer passed on to ``train_on_batch``.
        callback: optional callable ``callback(model, batch_loss)`` invoked
            after each batch with gradient tracking disabled.

    Returns:
        The batch losses averaged with the batch sizes as weights.
    """
    weighted_loss_sum = 0
    samples_seen = 0

    for inputs, targets in train_generator:
        loss_value = train_on_batch(model, inputs, targets, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, loss_value)

        n_in_batch = len(inputs)
        weighted_loss_sum += loss_value * n_in_batch
        samples_seen += n_in_batch

    return weighted_loss_sum / samples_seen


In [13]:
def trainer(count_of_epoch,
            batch_size,
            model,
            dataset,
            loss_function,
            optimizer,
            word2idx,
            lr = 0.001,
            callback = None):
    """Train ``model`` for several epochs, showing tqdm progress bars.

    Args:
        count_of_epoch: number of passes over ``dataset``.
        batch_size: batch size handed to ``data_loader``.
        model: model to train.
        dataset: sized collection of sentences handed to ``data_loader``.
        loss_function: loss passed on to ``train_epoch``.
        optimizer: optimizer passed on to ``train_epoch``.
        word2idx: token-to-id mapping handed to ``data_loader``.
        lr: accepted for interface compatibility; not read by this function.
        callback: optional per-batch hook passed on to ``train_epoch``.
    """
    loss_key = 'train epoch loss'

    epochs = tqdm(range(count_of_epoch), desc='epoch')
    epochs.set_postfix({loss_key: np.nan})

    # Batches per epoch (a partial trailing batch counts), used as bar length.
    full_batches, leftover = divmod(len(dataset), batch_size)
    number_of_batch = full_batches + (leftover > 0)

    for _ in epochs:
        # A fresh, reshuffled generator is required for every epoch.
        batches = tqdm(
            data_loader(dataset=dataset,
                        word2idx=word2idx,
                        batch_size=batch_size,
                        shuffle=True),
            total=number_of_batch,
            leave=False)

        epoch_loss = train_epoch(train_generator=batches,
                                 model=model,
                                 loss_function=loss_function,
                                 optimizer=optimizer,
                                 callback=callback)

        epochs.set_postfix({loss_key: epoch_loss})


## Отслеживание обучения модели

In [2]:
%load_ext tensorboard
%tensorboard --logdir experiment/


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 17008), started 0:00:03 ago. (Use '!kill 17008' to kill it.)

In [15]:
class callback():
    """Per-batch logging hook that writes losses through ``writer.add_scalar``.

    The training loss is logged on every call. On every ``delimeter``-th
    call the model is switched to eval mode and the loss over ``dataset``
    is computed and logged as well.
    """

    def __init__(self, writer, dataset, word2idx,
                 loss_function, delimeter = 100, batch_size=64):
        self.writer = writer
        self.dataset = dataset
        self.word2idx = word2idx
        self.loss_function = loss_function
        self.delimeter = delimeter
        self.batch_size = batch_size
        # Number of calls received so far; used as the logging step.
        self.step = 0

    def __call__(self, model, loss):
        return self.forward(model, loss)

    def forward(self, model, loss):
        """Log ``loss`` and, every ``delimeter`` steps, the evaluation loss."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        # Nothing more to do unless this is an evaluation step.
        if self.step % self.delimeter != 0:
            return

        model.eval()

        weighted_loss_sum = 0
        batches = data_loader(dataset=self.dataset,
                              word2idx=self.word2idx,
                              batch_size=self.batch_size)
        for inputs, _ in batches:
            inputs = inputs.to(model.device)
            prediction = model(inputs)
            batch_loss = self.loss_function(prediction, inputs).cpu().item()
            # Weight by batch size so the final value is a per-sample mean.
            weighted_loss_sum += batch_loss * len(inputs)

        test_loss = weighted_loss_sum / len(self.dataset)
        self.writer.add_scalar('LOSS/test', test_loss, self.step)


## Модель автокодировщика

In [16]:
class Encoder(torch.nn.Module):
    """Embedding followed by a stack of Linear -> ReLU -> Dropout1d blocks.

    With one layer the embedding is projected straight to ``hidden_dim``.
    With more layers the first block expands to
    ``5 * (num_layers - 1) * hidden_dim`` and each following block shrinks
    the width by ``5 * hidden_dim`` until ``hidden_dim`` is reached.

    Args:
        vocab_dim: number of rows of the embedding table.
        emb_dim: embedding size.
        hidden_dim: size of the produced code.
        num_layers: number of blocks; must be at least 1.
        drop_out: probability passed to every ``Dropout1d``.
        is_batch_norm: when true, a ``BatchNorm1d`` over the feature axis
            precedes every block.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 emb_dim = 10,
                 hidden_dim = 10,
                 num_layers = 3,
                 drop_out = 0,
                 is_batch_norm = False):
        super().__init__()

        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')

        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.is_batch_norm = is_batch_norm

        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)

        # widths[i] -> widths[i + 1] is the shape of block i.
        widths = ([emb_dim]
                  + [5 * (num_layers - k) * hidden_dim for k in range(1, num_layers)]
                  + [hidden_dim])

        self.norms  = torch.nn.ModuleList([])
        self.layers = torch.nn.ModuleList([])

        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            # Norm layers are only created on request. The first one used to
            # be registered unconditionally for multi-layer encoders, leaving
            # unused parameters in the model when is_batch_norm was False.
            if is_batch_norm:
                self.norms.append(torch.nn.BatchNorm1d(in_dim))

            block = torch.nn.Sequential()
            block.add_module('Linear',  torch.nn.Linear(in_dim, out_dim))
            block.add_module('ReLU',    torch.nn.ReLU())
            # NOTE(review): Dropout1d zeroes whole slices along dim 1, which
            # is not the feature axis for the layout used in forward;
            # confirm element-wise Dropout was not intended.
            block.add_module('Dropout', torch.nn.Dropout1d(drop_out))
            self.layers.append(block)

    def forward(self, input):
        """Embed ``input`` and pass it through every block in order.

        Assumes ``input`` is a 2-D tensor of token ids (batch, sequence);
        TODO confirm against callers.
        """
        input = self.embedding(input)

        for i in range(self.num_layers):
            if self.is_batch_norm:
                # BatchNorm1d normalises dim 1, so swap the last two axes to
                # put the features there and swap them back afterwards.
                input = self.norms[i](torch.transpose(input, 1, 2))
                input = self.layers[i](torch.transpose(input, 1, 2))
            else:
                input = self.layers[i](input)

        return input


In [17]:
class Decoder(torch.nn.Module):
    """Stack of Linear -> ReLU -> Dropout1d blocks mapping codes to ``emb_dim``.

    With one layer the code is projected straight to ``emb_dim``. With more
    layers block ``i`` widens the features from ``5 * i * hidden_dim`` to
    ``5 * (i + 1) * hidden_dim`` (the first block starts at ``hidden_dim``)
    and the last block projects to ``emb_dim``.

    Args:
        emb_dim: size of the produced feature axis.
        hidden_dim: size of the incoming code.
        num_layers: number of blocks; must be at least 1.
        drop_out: probability passed to every ``Dropout1d``.
        is_batch_norm: when true, a ``BatchNorm1d`` over the feature axis
            precedes every block.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    def __init__(self,
                 emb_dim = 10,
                 hidden_dim = 10,
                 num_layers = 3,
                 drop_out = 0,
                 is_batch_norm = False):
        super().__init__()

        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')

        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.is_batch_norm = is_batch_norm

        # widths[i] -> widths[i + 1] is the shape of block i.
        widths = ([hidden_dim]
                  + [5 * k * hidden_dim for k in range(1, num_layers)]
                  + [emb_dim])

        self.norms  = torch.nn.ModuleList([])
        self.layers = torch.nn.ModuleList([])

        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            if is_batch_norm:
                self.norms.append(torch.nn.BatchNorm1d(in_dim))

            # add_module registers a named submodule. The single-layer branch
            # used Sequential.append(name, module), which raises TypeError
            # because append accepts only the module.
            block = torch.nn.Sequential()
            block.add_module('Linear',  torch.nn.Linear(in_dim, out_dim))
            block.add_module('ReLU',    torch.nn.ReLU())
            block.add_module('Dropout', torch.nn.Dropout1d(drop_out))
            self.layers.append(block)

    def forward(self, input):
        """Pass ``input`` through every block and swap the last two axes.

        Assumes ``input`` is 3-D (batch, sequence, hidden_dim); TODO confirm.
        NOTE(review): the returned dim 1 has size ``emb_dim``; if the result
        is scored against token ids, confirm that ``emb_dim`` classes are
        what the loss is expected to see.
        """
        for i in range(self.num_layers):
            if self.is_batch_norm:
                # BatchNorm1d normalises dim 1, so swap the last two axes to
                # put the features there and swap them back afterwards.
                input = self.norms[i](torch.transpose(input, 1, 2))
                input = self.layers[i](torch.transpose(input, 1, 2))
            else:
                input = self.layers[i](input)

        return input.transpose(1, 2)


In [18]:
class Autoencoder(torch.nn.Module):
    """Encoder/decoder pair sharing one set of hyper-parameters.

    Args:
        vocab_dim: number of rows of the encoder's embedding table.
        emb_dim: embedding size (encoder input width, decoder output width).
        hidden_dim: size of the code between encoder and decoder.
        num_layers: number of blocks in both the encoder and the decoder.
        drop_out: dropout probability used in both halves.
        is_batch_norm: whether both halves use batch normalisation.
    """

    @property
    def device(self):
        """Device of the first parameter of the module."""
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 emb_dim = 10,
                 hidden_dim = 10,
                 num_layers = 3,
                 drop_out = 0,
                 is_batch_norm = False):
        # Zero-argument super(): super(type(self), self) resolves to the
        # subclass itself when this class is inherited from, which makes
        # __init__ call itself forever.
        super().__init__()
        self.vocab_dim = vocab_dim
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.drop_out = drop_out
        self.is_batch_norm = is_batch_norm

        self.encoder = Encoder(vocab_dim = vocab_dim, emb_dim = emb_dim,
                               hidden_dim = hidden_dim, num_layers = num_layers,
                               drop_out = drop_out, is_batch_norm = is_batch_norm)

        self.decoder = Decoder(emb_dim = emb_dim, hidden_dim = hidden_dim, num_layers = num_layers,
                               drop_out = drop_out, is_batch_norm = is_batch_norm)

    def forward(self, x):
        """Encode ``x`` and decode the resulting code."""
        return self.decode(self.encode(x))

    def encode(self, x):
        """Return the encoder output for ``x``."""
        return self.encoder(x)

    def decode(self, z):
        """Return the decoder output for the code ``z``."""
        return self.decoder(z)


## Выборка

In [19]:
# Load the tweets, drop rows without a message and keep only the text column.
dataset = pd.read_csv('twitter.csv')
dataset = dataset.dropna(subset=['message'])[['message']]


In [20]:
# Work on a fixed random subset of 125000 tweets.
dataset = dataset.sample(125000, random_state=42)

# Roughly 80/20 train/test split. A locally seeded generator makes the split
# reproducible, matching the seeded sampling above; the global NumPy RNG used
# before produced a different split on every run.
train_mask = np.random.RandomState(42).rand(len(dataset)) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]


In [21]:
# Preview five training tweets; the fixed seed keeps the preview stable.
dataset_train.sample(5, random_state=42)


Unnamed: 0,message
996705,"Mavs &amp; Caps survive, Blackhawks advance. A..."
1026737,listening to feel in love with a girl by The w...
934360,I'm not lauryn hill but I'll clap you wit that...
1064388,@mileycyrus come to the uk! WORLD TOUR
493218,@socilover yeah! Monday mornin


In [22]:
class Tokenizer(object):
    """Turn raw sentences into a rectangular tensor of token ids.

    Args:
        word2idx: token-to-id mapping; must contain ``'[UNK]'`` and is
            expected to contain the special tokens used for framing.
        tokenizer: object providing ``tokenize_sents(sentences)`` that
            returns one list of tokens per sentence.
    """

    def __init__(self, word2idx, tokenizer):
        self.tokenizer = tokenizer
        self.word2idx = word2idx

    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        """Encode ``sentences`` as ids framed by ``[CLS]`` and ``[SEP]``.

        Sentences longer than ``max_length`` tokens are truncated, shorter
        ones are padded with ``[PAD]`` after ``[SEP]``, so every row has
        ``max_length + 2`` entries. Unless ``pad_to_max_length`` is set,
        ``max_length`` is first lowered to the longest sentence.

        Returns:
            ``torch.tensor`` with one row of ids per sentence.
        """
        split_sentences = self.tokenizer.tokenize_sents(sentences)

        if not pad_to_max_length:
            longest = max(len(words) for words in split_sentences)
            max_length = min(max_length, longest)

        framed = []
        for words in split_sentences:
            if len(words) < max_length:
                missing = max_length - len(words)
                row = ['[CLS]'] + words + ['[SEP]'] + ['[PAD]'] * missing
            else:
                row = ['[CLS]'] + words[:max_length] + ['[SEP]']
            framed.append(row)

        ids = []
        for row in framed:
            ids.append([self.word2idx.get(token, self.word2idx['[UNK]'])
                        for token in row])

        return torch.tensor(ids)


In [23]:

# Special tokens take the first four contiguous ids. They used to be
# 0, 1, 3, 4, so the first corpus word (assigned id len(word2idx) == 4)
# collided with '[SEP]' and id 2 was never used.
word2idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}

# Build the tokenizer once instead of once per tweet. The raw string keeps
# \w, \s and \d as regex escapes rather than invalid string escapes.
_vocab_tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+')

# Give every new token of the training tweets the next free id.
for sent in dataset_train.values[:, 0]:
    for word in _vocab_tokenizer.tokenize(sent):
        if word not in word2idx:
            word2idx[word] = len(word2idx)


In [24]:
# Vocabulary size (number of entries in the token-to-id mapping).
len(word2idx)


109422

In [25]:
# Raw string so that \w, \s and \d reach the regex engine as written instead
# of being parsed as (invalid) string escape sequences.
tokenizer = Tokenizer(word2idx, RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+'))

# Encode both splits as id tensors; pad_to_max_length=True gives every row
# the same length.
# NOTE(review): data_loader looks the items of each sentence up in word2idx
# as keys, i.e. it appears to expect token strings rather than id tensors -
# confirm which representation the training code should receive.
dataset_train = tokenizer(dataset_train.values[:, 0], pad_to_max_length=True)
dataset_test = tokenizer(dataset_test.values[:, 0], pad_to_max_length=True)


## Обучение

In [28]:
# Hyper-parameter grid; one model is trained per combination.
grid = ParameterGrid({'emb_dim': [200],
                      'hidden_dim': [300],
                      'num_layers': [2],
                      'is_batch_norm': [True],
                      'drop_out': [0.25],
                      'vocab_dim': [len(word2idx)]})

for item in tqdm(grid):
    model = Autoencoder(**item)
    model.to(device)

    # The TensorBoard run directory encodes the hyper-parameters.
    name = 'experiment/hidden{}_emb{}_layers{}_vocab{}'.format(
            item['hidden_dim'], item['emb_dim'], item['num_layers'], item['vocab_dim'])

    if item['is_batch_norm']:
        name += '_norm'

    if item['drop_out'] > 0:
        name += 'drop{}'.format(item['drop_out'])

    writer = SummaryWriter(log_dir = name)

    # Close the writer even if training fails so buffered events are flushed
    # to disk and the event file handle is released.
    try:
        optimizer = torch.optim.Adam(list(model.encoder.parameters()) +
                                     list(model.decoder.parameters()), lr = 1e-3)
        # Padding positions do not contribute to the loss.
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=word2idx['[PAD]'])

        # Log the test loss every 10 training steps.
        call = callback(writer, dataset_test, word2idx, loss_function, delimeter = 10)

        trainer(count_of_epoch = 3,
                batch_size = 64,
                model = model,
                dataset = dataset_train,
                word2idx = word2idx,
                loss_function = loss_function,
                optimizer = optimizer,
                callback = call)
    finally:
        writer.close()


  0%|          | 0/1 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1561 [00:00<?, ?it/s]

  0%|          | 0/1561 [00:00<?, ?it/s]

  0%|          | 0/1561 [00:00<?, ?it/s]

## Результаты

В данной работе была изучена работа автокодировщика на примере линейного автокодировщика и датасета, состоящего из твитов.

Размер словаря очень сильно определяет обучающую способность модели, поэтому он не менялся: результат его изменения был предсказуем.

Наихудшие результаты показали модели, у которых размерность скрытого слоя больше размерности embedding-слоя, а также модели с малой сложностью при выбрасывании нейронов (dropout).

Лучшие результаты показали модели с размерностью скрытого слоя меньшей размерности embedding слоя и нормировкой параметров.

Наличие нормировки сказалось не только на скорости сходимости модели, но и на качестве результатов. Выбрасывание нейронов также приводило к ухудшению результатов из-за простоты модели. Количество слоёв почти не отражалось на результате.