### Libraries

In [1]:
from sklearn.model_selection import ParameterGrid
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
import random

from torch.utils.tensorboard import SummaryWriter
import torch

In [2]:
# !unzip experiment.zip -d .
# %reload_ext tensorboard
%load_ext tensorboard
%tensorboard --logdir ./experiment/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 19962), started 0:00:14 ago. (Use '!kill 19962' to kill it.)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Dataset & general parameters

Dataset был скачан по ссылке https://disk.yandex.ru/d/bwUVH8hR1MRNrg.

В нём предствлена выборка Twitter (эмбединги предложений).

In [4]:
root = 'topicmodeling'
dataset_path = 'twitter.csv'

f = open(root + '/' + dataset_path, "r")
dataset = pd.read_csv(f)

dataset = dataset[dataset[['tag', 'message']].notnull().all(1)]

dataset = dataset.sample(10000, random_state=42)
train_mask = np.random.rand(len(dataset)) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]

f.close()

print(len(dataset_train), len(dataset_test))

8061 1939


Заполним словарь токенов на основе train-выборки, чтобы далее можно было преобразовывать предложения в векторы. Дополнительно удобно ввести абстракцию токенизатора, чтобы не зудумываться о ручных выравниваниях.

In [5]:
class Tokenizer(object):
    def __init__(self, word2idx, tokenizer):
        self.word2idx = word2idx
        self.tokenizer = tokenizer

    def __call__(self, sentences, max_length=10, pad_to_max_length=False):
        tokens = self.tokenizer.tokenize_sents(sentences)

        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))

        tokens = [['[CLS]'] + s + ['[SEP]'] + ['[PAD]'] * (max_length - len(s)) \
                     if len(s) < max_length \
                     else ['[CLS]'] + s[:max_length] + ['[SEP]'] \
                  for s in tokens ]

        ids = [[self.word2idx.get(w, self.word2idx['[UNK]']) for w in sent] for sent in tokens]

        return torch.tensor(ids).long()

In [6]:
word2idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 3, '[SEP]': 4}
idx2word = {0: '[PAD]', 1: '[UNK]', 3: '[CLS]', 4: '[SEP]'}
tokenizer = RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+')

for sent in tqdm(dataset_train.values[:, 1]):
    for word in tokenizer.tokenize(sent):
        if word not in word2idx:
            word2idx[word] = word2idx.__len__()
            idx2word[word2idx.__len__() - 1] = word

100%|██████████| 8061/8061 [00:00<00:00, 266201.75it/s]


In [7]:
tokenizer = Tokenizer(word2idx, tokenizer)
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [8]:
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, train_data_sent)
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, test_data_sent)

batch_size = 64

train_dataloader = torch.utils.data.DataLoader(dataset_train_pt, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(dataset_test_pt, batch_size=batch_size)

### General training code

Самый обычный код обучения моделей за исключениесм нескольких специфичных изменений.

In [9]:
def print_sentence(sent):
    sent = [idx2word[word] for word in sent]
    return ' '.join(sent)

In [10]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    model.train()
    model.zero_grad()

    input, target = batch_of_x.to(device), batch_of_y.to(device)
    output = model(src=input, target=target)

    loss = loss_function(output.transpose(1, 2), target)
    loss.backward()
    optimizer.step()

    return loss.cpu().item()

In [11]:
def train_epoch(train_generator, model, loss_function, optimizer):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        local_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)

        epoch_loss += local_loss * len(batch_of_x)
        total += len(batch_of_x)

    return epoch_loss / total

In [12]:
def test_model(model, dataloader):
    batch = next(iter(dataloader))
    input, truth = batch[0].to(device), batch[1].to(device)
    model.eval()

    with torch.no_grad():
        out = model(input, truth)
        out, truth = torch.argmax(out.detach().cpu(), 2).numpy(), truth.detach().cpu().numpy()
        result = ''
        for i in range(min(out.shape[0], 8)):
            result += 'Result: ' + print_sentence(out[i]) + ', Truth: ' + print_sentence(truth[i]) + '\n'

    return result

In [13]:
def quality_of_train(model, test_dataloader, loss_function):
    model.eval()
    with torch.no_grad():
        epoch_loss = 0
        total = 0
        for batch_of_x, batch_of_y in test_dataloader:
            input, target = batch_of_x.to(device), batch_of_y.to(device)
            output = model(src=input, target=target)

            loss = loss_function(output.transpose(1, 2), target).cpu().item()
            epoch_loss += loss * len(batch_of_x)
            total += len(batch_of_x)

    return epoch_loss / total

In [14]:
def trainer(count_of_epoch,
            model,
            train_dataloader,
            test_dataloader,
            loss_function,
            optimizer, scheduler=None, writer=None):

    iterations = range(count_of_epoch)
    if writer is not None:
        writer.add_scalar('Loss/test', quality_of_train(model, test_dataloader, loss_function), 0)
        writer.add_text('text', test_model(model, test_dataloader), 0)

    for it in iterations:
        optima = optimizer

        epoch_loss = train_epoch(
            train_generator=train_dataloader,
            model=model,
            loss_function=loss_function,
            optimizer=optima)

        if writer is not None:
            writer.add_scalar('Loss/train', epoch_loss, it + 1)
            writer.add_scalar('Loss/test', quality_of_train(model, test_dataloader, loss_function), it + 1)
            writer.add_text('text', test_model(model, test_dataloader), it + 1)

        if scheduler is not None:
            scheduler.step()

### Autoencoder model

In [15]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout, bidirectional):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        self.embedding = torch.nn.Embedding(input_dim, emb_dim)
        self.rnn = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))

        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [16]:
class Decoder(torch.nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, bidirectional):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        self.embedding = torch.nn.Embedding(output_dim, emb_dim)
        self.rnn = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        self.fc_out = torch.nn.Linear(hidden_dim * (2 if self.bidirectional else 1), output_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [17]:
class Autoencoder(torch.nn.Module):
    def __init__(self, vocabulary_size, emb_dim, hidden_dim, n_layers, dropout, bidirectional, device):
        super().__init__()

        self.device = device

        self.encoder = Encoder(input_dim=vocabulary_size, emb_dim=emb_dim,
                               hidden_dim=hidden_dim, n_layers=n_layers,
                               dropout=dropout, bidirectional=bidirectional).to(device)
        self.decoder = Decoder(output_dim=vocabulary_size, emb_dim=emb_dim,
                               hidden_dim=hidden_dim, n_layers=n_layers,
                               dropout=dropout, bidirectional=bidirectional).to(device)

    def forward(self, src, target, teacher_forcing_ratio=0.5):
        batch_size = target.shape[1]
        target_len = target.shape[0]
        outputs = torch.zeros(target_len, batch_size, self.decoder.output_dim).to(self.device)
        hidden, cell = self.encoder(src)
        input = target[0, :]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = target[t] if teacher_force else output.argmax(1)

        return outputs

### Autoencoder training

Выполним простой перебор параметров модели.

In [18]:
grid = ParameterGrid({
    'vocabulary_size': [len(word2idx)],
    'emb_dim': [64, 128],
    'hidden_dim': [128, 256],
    'n_layers': [1, 2],
    'dropout': [0.0, 0.2],
    'bidirectional': [True, False]
})

In [28]:
scores = dict()

for item in tqdm(grid, leave=True):
    print(str(item))

    writer = SummaryWriter('experiment/' + str(item))

    model = Autoencoder(**item, device=device)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=word2idx['[PAD]'])
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

    trainer(count_of_epoch=4,
            model=model,
            train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            loss_function=loss_function,
            optimizer=optimizer,
            scheduler=scheduler,
            writer=writer)

    loss = quality_of_train(model, test_dataloader, loss_function)
    scores[str(item)] = loss
    writer.add_hparams(item, {'hparam/Test loss': loss})
    writer.flush()

  0%|          | 0/32 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


  3%|▎         | 1/32 [00:59<30:56, 59.90s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


  6%|▋         | 2/32 [02:18<35:29, 70.98s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


  9%|▉         | 3/32 [03:42<37:02, 76.65s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 12%|█▎        | 4/32 [05:32<41:56, 89.89s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 16%|█▌        | 5/32 [06:39<36:45, 81.70s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 19%|█▉        | 6/32 [07:57<34:49, 80.38s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 22%|██▏       | 7/32 [09:19<33:48, 81.12s/it]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}




{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 28%|██▊       | 9/32 [12:14<31:21, 81.79s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 31%|███▏      | 10/32 [13:33<29:43, 81.07s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 34%|███▍      | 11/32 [15:00<28:58, 82.79s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 38%|███▊      | 12/32 [16:52<30:37, 91.89s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 41%|████      | 13/32 [17:55<26:18, 83.07s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 44%|████▍     | 14/32 [19:23<25:22, 84.58s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 47%|████▋     | 15/32 [20:45<23:44, 83.78s/it]

{'bidirectional': True, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 50%|█████     | 16/32 [22:40<24:48, 93.04s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 53%|█████▎    | 17/32 [23:28<19:52, 79.47s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 56%|█████▋    | 18/32 [24:21<16:42, 71.58s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 59%|█████▉    | 19/32 [25:16<14:24, 66.53s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 62%|██████▎   | 20/32 [26:34<14:02, 70.19s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 66%|██████▌   | 21/32 [27:23<11:40, 63.67s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 69%|██████▉   | 22/32 [28:19<10:15, 61.51s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 72%|███████▏  | 23/32 [31:25<14:49, 98.79s/it]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 75%|███████▌  | 24/32 [32:31<11:51, 88.91s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 78%|███████▊  | 25/32 [33:13<08:44, 74.86s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 81%|████████▏ | 26/32 [34:03<06:44, 67.34s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 84%|████████▍ | 27/32 [34:51<05:07, 61.54s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 64, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


 88%|████████▊ | 28/32 [35:58<04:12, 63.12s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 1, 'vocabulary_size': 18853}


 91%|█████████ | 29/32 [36:41<02:51, 57.09s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 128, 'n_layers': 2, 'vocabulary_size': 18853}


 94%|█████████▍| 30/32 [37:31<01:50, 55.02s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 1, 'vocabulary_size': 18853}


 97%|█████████▋| 31/32 [38:20<00:53, 53.38s/it]

{'bidirectional': False, 'dropout': 0.2, 'emb_dim': 128, 'hidden_dim': 256, 'n_layers': 2, 'vocabulary_size': 18853}


100%|██████████| 32/32 [39:28<00:00, 74.01s/it]


### Conclusions

Модель дала не очень хорошие результаты: loss равен около 6 и не сходится к нулю при проходе по эпохам. 
Предложения получаются не сильно осмысленные (не очень понятные или вообще не имеющие смысл).

Перебор параметров практически ни на что не влияет, их влиянение не очень значительно: dropout немного улучшает сходимость, количество и число слоев не сильно влияют на модель.
Увеличение размерности embedding'а уменьшает loss, но незначительно.