In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import numpy as np
from torchaudio.functional import edit_distance

# Notebook-wide device string: "cuda" when a CUDA device is available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
import pandas as pd

# Read the tweets and drop every row where the label or the text is missing.
data = pd.read_csv('twitter.csv')
data = data.dropna(subset=['tag', 'message'])
data.head()

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T


Автоэнкодер для языка логично делать с помощью рекуррентных сетей, поскольку они могут кодировать последовательности любой длины в представление заранее фиксированного размера. Так и поступим.

In [3]:
from transformers import AutoTokenizer, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
# LaBSE is not used because its vocabulary has more than 500,000 entries; take a tokenizer with a more reasonable vocabulary instead:

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# Pad with the EOS token id (presumably this tokenizer has no pad token of its own -- TODO confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for evaluation. The fixed seed makes the split (and
# therefore the test metric logged for every experiment) reproducible across
# kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.05, shuffle=True, random_state=42)

class MyDataset(Dataset):
    """Autoencoder dataset that yields each message as both input and target.

    Args:
        data: table-like object whose ``"message"`` column supports
            ``.to_numpy()`` (a pandas DataFrame in this notebook).
        max_length: value stored on the instance as ``self.max_length``.
            It used to be read from a notebook-level global that is only
            defined after this class; it is now an explicit argument with the
            same value as default, so the class no longer depends on cell
            execution order.
    """

    def __init__(self, data, max_length=100):
        self.messages = data["message"].to_numpy()
        self.max_length = max_length

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, idx):
        # Input and target are the same raw text: the model reconstructs its input.
        message = self.messages[idx]
        return message, message


# Token limit passed as `max_length` to every tokenizer call in this notebook.
max_length = 100
# Wrap both splits, train first, in the (input, target) text dataset.
train_dataset, test_dataset = (MyDataset(data=split) for split in (train_data, test_data))

Напишем классы для нашей модели

In [5]:
class Encoder(nn.Module):
    """Embed token ids, run an LSTM over them and return its final states.

    The final hidden and cell states of every LSTM layer are concatenated
    along the feature axis, so the output has shape
    ``(batch, n_layers, 2 * (output_dim // 2))``.

    Args:
        dict_size: number of rows in the embedding table.
        output_dim: size of the concatenated (hidden, cell) state per layer;
            the LSTM hidden size is ``output_dim // 2``.
        embedding_dim: size of the token embeddings.
        n_layers: number of stacked LSTM layers.
        batch_norm: when true, apply ``BatchNorm2d`` over the layer axis of
            the output.
        dropout: dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self,
                 dict_size,
                 output_dim=128,  # == 2 * LSTM hidden size
                 embedding_dim=128,
                 n_layers=1,
                 batch_norm=False,
                 dropout=0):
        super().__init__()

        self.embedding = nn.Embedding(dict_size, embedding_dim)
        self.n_layers = n_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=output_dim // 2,
                            num_layers=n_layers,
                            batch_first=True,
                            dropout=dropout)

        # `False` (not None) is kept as the "disabled" marker.
        if batch_norm:
            self.batch_norm = nn.BatchNorm2d(n_layers)
        else:
            self.batch_norm = False

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, c_n) = self.lstm(embedded)

        # Move the batch axis first, then glue hidden and cell states together.
        hidden = h_n.permute(1, 0, 2)
        cell = c_n.permute(1, 0, 2)
        out = torch.cat((hidden, cell), dim=-1)

        if self.batch_norm:
            # BatchNorm2d expects 4-D input: treat layers as channels.
            n_batch = embedded.shape[0]
            out = self.batch_norm(out.unsqueeze(2)).view(n_batch, self.n_layers, -1)

        return out


class Decoder(nn.Module):
    """Greedy LSTM decoder that unrolls a sequence from the encoder state.

    Decoding starts from the ``bos_token_id`` embedding; at every step the
    arg-max token of the previous step is embedded and fed back in.

    Args:
        dict_size: vocabulary size (embedding rows and output logits).
        input_dim: size of the encoder state per layer; the first
            ``input_dim // 2`` features initialise the LSTM hidden state and
            the rest initialise the cell state.
        embedding_dim: size of the token embeddings.
        n_layers: number of stacked LSTM layers.
        dropout: dropout value forwarded to ``nn.LSTM``.
        max_len: default number of decoding steps used by ``forward`` when it
            is called without an explicit ``max_len``.
        bos_token_id: id of the token that starts decoding.
    """

    def __init__(self,
                 dict_size,
                 input_dim=128,  # == 2 * LSTM hidden size
                 embedding_dim=128,
                 n_layers=1,
                 dropout=0,
                 max_len=100,
                 bos_token_id=1):
        super().__init__()

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size=input_dim // 2,
                            num_layers=n_layers,
                            batch_first=True,
                            dropout=dropout)
        self.linear = nn.Linear(input_dim // 2, dict_size)

        self.max_len = max_len
        self.embedding = nn.Embedding(dict_size, embedding_dim)
        self.bos_token_id = bos_token_id
        self.input_dim = input_dim

    def forward(self, encoder_output, max_len=None):
        """Decode ``max_len`` steps and return per-step log-probabilities.

        Args:
            encoder_output: tensor indexed as ``(batch, n_layers, input_dim)``.
            max_len: number of steps to generate; defaults to ``self.max_len``.

        Returns:
            Log-probabilities concatenated along dim 1, one slice per step.
        """
        if max_len is None:
            max_len = self.max_len

        half = self.input_dim // 2
        h = torch.permute(encoder_output[:, :, :half], (1, 0, 2)).contiguous()
        c = torch.permute(encoder_output[:, :, half:], (1, 0, 2)).contiguous()

        # Build the BOS column on the same device as the encoder state rather
        # than on a notebook-level global device.
        bos_tokens = torch.full((encoder_output.shape[0], 1),
                                self.bos_token_id,
                                dtype=torch.long,
                                device=encoder_output.device)
        cur_token_emb = self.embedding(bos_tokens)

        logps = []
        for _ in range(max_len):
            output, (h, c) = self.lstm(cur_token_emb, (h, c))
            next_logp = F.log_softmax(self.linear(output), dim=-1)
            logps.append(next_logp)

            # Greedy choice; detached so the token selection carries no gradient.
            cur_token_emb = self.embedding(torch.argmax(next_logp.detach(), dim=-1))

        return torch.cat(logps, dim=1)


class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder: an ``Encoder`` followed by a ``Decoder``.

    The vocabulary size is ``len(tokenizer)`` and decoding starts from
    ``tokenizer.bos_token_id``. The tokenizer is kept on the instance as
    ``self.tokenizer``.

    Args:
        tokenizer: object providing ``__len__`` and ``bos_token_id``.
        hidden_dim: encoder ``output_dim`` and decoder ``input_dim``.
        embedding_dim: embedding size used by both halves.
        n_layers: number of LSTM layers in both halves.
        batch_norm: forwarded to the encoder.
        dropout: forwarded to both LSTMs.
        max_len: forwarded to the decoder.
    """

    def __init__(self,
                 tokenizer,
                 hidden_dim=128,
                 embedding_dim=128,
                 n_layers=1,
                 batch_norm=False,
                 dropout=0,
                 max_len=100):
        super().__init__()

        self.tokenizer = tokenizer
        self.dict_size = len(tokenizer)
        self.bos_token_id = tokenizer.bos_token_id

        # Settings that both halves of the model have in common.
        shared = dict(dict_size=self.dict_size,
                      embedding_dim=embedding_dim,
                      n_layers=n_layers,
                      dropout=dropout)

        self.encoder = Encoder(output_dim=hidden_dim,
                               batch_norm=batch_norm,
                               **shared)

        self.decoder = Decoder(input_dim=hidden_dim,
                               max_len=max_len,
                               bos_token_id=tokenizer.bos_token_id,
                               **shared)

    def forward(self, inp_tokens):
        # Decode exactly as many steps as the input has positions.
        n_steps = inp_tokens.shape[-1]
        return self.decoder(self.encoder(inp_tokens), n_steps)

Следующая ячейка, как и в первой задаче, скопирована с семинара с небольшими изменениями.

В качестве метрики разумно использовать расстояние редактирования (расстояние Левенштейна) между предсказанной и исходной последовательностями токенов; попробуем так и сделать. Нужно только не забывать, что в данном случае меньше — лучше.

In [6]:
def train_epoch(train_loader, model, loss_function, optimizer, callback=None):
    """Run one pass over ``train_loader`` and return the mean per-sample loss.

    Each batch is a pair of text collections. Both are tokenized with
    ``model.tokenizer`` (padded, truncated to the notebook-level
    ``max_length``) and passed to ``train_on_batch``. If ``callback`` is
    given, it is called as ``callback(model, batch_loss)`` under
    ``torch.no_grad()`` after every batch.
    """
    def to_ids(texts):
        return model.tokenizer(texts, padding=True, truncation=True, max_length=max_length,
                               return_tensors="pt")["input_ids"]

    weighted_loss = 0
    n_seen = 0
    for texts_x, texts_y in tqdm(train_loader, leave=False):
        ids_x = to_ids(texts_x)
        ids_y = to_ids(texts_y)

        batch_loss = train_on_batch(model, ids_x, ids_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)

        # Weight by batch size so a shorter last batch does not skew the mean.
        n_in_batch = len(ids_x)
        weighted_loss += batch_loss * n_in_batch
        n_seen += n_in_batch

    return weighted_loss / n_seen


def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Perform one optimisation step and return the batch loss as a float.

    Args:
        model: module called as ``model(x_batch)``.
        x_batch: input tensor.
        y_batch: target tensor passed to ``loss_function(preds, y_batch)``.
        optimizer: optimizer instance stepping the model's parameters.
        loss_function: callable returning a scalar tensor.

    The batch is moved to the device of the model's first parameter rather
    than to a notebook-level global, so the function follows wherever the
    model actually lives. A model without parameters leaves the tensors
    where they are.
    """
    model.train()

    first_param = next(model.parameters(), None)
    if first_param is not None:
        x_batch = x_batch.to(first_param.device)
        y_batch = y_batch.to(first_param.device)

    optimizer.zero_grad()
    preds = model(x_batch)
    loss = loss_function(preds, y_batch)
    loss.backward()

    optimizer.step()
    return loss.detach().cpu().item()


def trainer(count_of_epoch,
            batch_size,
            loader,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    """Train ``model`` for ``count_of_epoch`` epochs with a progress bar.

    ``optimizer`` is an optimizer factory (for example ``torch.optim.Adam``);
    it is instantiated here with the model's parameters and ``lr``.
    ``batch_size`` is accepted but not used: batching comes from ``loader``.
    The latest epoch loss is shown as the progress-bar postfix.
    """
    optima = optimizer(model.parameters(), lr=lr)

    progress = tqdm(range(count_of_epoch), desc='epoch')
    progress.set_postfix({'train epoch loss': np.nan})
    for _ in progress:
        mean_loss = train_epoch(train_loader=loader,
                                model=model,
                                loss_function=loss_function,
                                optimizer=optima,
                                callback=callback)
        progress.set_postfix({'train epoch loss': mean_loss})


class Callback():
    """Per-batch training hook that logs to a TensorBoard-style writer.

    Every call logs the training loss under ``'LOSS/train'``. Every
    ``delimeter``-th call additionally runs the model over ``test_loader``
    and logs the mean edit distance between arg-max predictions and target
    token ids under ``'Edit_Disctance/test'``.

    Args:
        writer: object exposing ``add_scalar(tag, value, step)``.
        test_loader: iterable of ``(x_texts, y_texts)`` batches.
        loss_function: stored on the instance; not used by ``forward``.
        delimeter: evaluation period, in calls.
        batch_size: stored on the instance; not used by ``forward``.
    """

    def __init__(self, writer, test_loader, loss_function, delimeter=100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size

        self.loader = test_loader

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        def encode(texts):
            # `max_length` is the notebook-level token limit.
            return model.tokenizer(texts, padding=True, truncation=True, max_length=max_length,
                                   return_tensors="pt")["input_ids"]

        # Evaluate on the model's own device instead of a global one.
        first_param = next(model.parameters(), None)

        pred = []
        real = []
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                for x_batch, y_batch in tqdm(self.loader, leave=False):
                    x_batch, y_batch = encode(x_batch), encode(y_batch)

                    if first_param is not None:
                        x_batch = x_batch.to(first_param.device)

                    output = model(x_batch).detach()

                    pred.extend(torch.argmax(output, dim=-1).cpu().numpy().tolist())
                    real.extend(y_batch.numpy().tolist())
        finally:
            # Restore the previous mode so evaluation never leaves the model
            # in eval mode, even if it is interrupted.
            model.train(was_training)

        test_edit_disctance = np.mean([edit_distance(pred_sent, real_sent)
                                       for pred_sent, real_sent in zip(pred, real)])

        self.writer.add_scalar('Edit_Disctance/test', test_edit_disctance, self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

In [7]:
# Load the TensorBoard notebook extension and start it on port 6002 with the current directory as the log root.
%load_ext tensorboard
%tensorboard --logdir ./ --port=6002

Переберем параметры аналогично первой задаче, начнем с размера скрытого представления (он же -- размер скрытых представлений в lstm).
Обучать будем на всех данных, но только одну эпоху -- слишком долго работает.

In [8]:
class LSTM_loss():
    '''
    ``torch.nn.functional.nll_loss`` adapted to sequence outputs.

    ``pred`` holds log-probabilities of shape (batch_size, sequence_length,
    vocab_size) and ``target`` holds class indices of shape (batch_size,
    sequence_length). Both are flattened over the first two axes before the
    loss is computed.

    Args:
        vocab_size: size of the last axis of ``pred``.
        ignore_index: target value excluded from the loss. The default is the
            ``nll_loss`` default, so nothing is excluded unless it is set.
    '''
    def __init__(self, vocab_size, ignore_index=-100):
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index

    def __call__(self, pred, target):
        # `reshape` (not `view`) so non-contiguous inputs, e.g. transposed
        # tensors, are accepted as well.
        flat_pred = pred.reshape(-1, self.vocab_size)
        flat_target = target.reshape(-1)
        return F.nll_loss(flat_pred, flat_target, ignore_index=self.ignore_index)
        

loss_function = LSTM_loss(vocab_size=len(tokenizer))

# Shared training settings; `optimizer` is the optimizer class, instantiated inside `trainer`.
optimizer = torch.optim.Adam
lr = 1e-3
hidden_dims = [32,64,128,256,512]
batch_size = 50
test_step_size = 3000  # evaluate on the test split every N training batches
n_epochs=1

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


# Sweep over the size of the hidden representation, one TensorBoard run per value.
for hidden_dim in hidden_dims:

    model = LSTMAutoencoder(tokenizer, 
                            hidden_dim=hidden_dim,
                            embedding_dim=64,
                            n_layers=2, 
                            batch_norm=False,
                            dropout=0,
                            max_len=100).to(device)
    
    writer = SummaryWriter(log_dir=f'different_hidden_dims/{hidden_dim}')
    try:
        callback = Callback(writer, test_loader, loss_function, delimeter=test_step_size)

        trainer(count_of_epoch=n_epochs, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)
    finally:
        # Flush and release the event file even when the run is interrupted,
        # so the scalars logged so far are not lost.
        writer.close()

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/29994 [00:00<?, ?it/s]


KeyboardInterrupt



БОльшее скрытое представление закономерно дает лучший скор, поскольку данных довольно много и задача сложная.
Теперь попробуем добавить batchnorm между энкодером и декодером (а куда его еще пихать?)

In [8]:
batch_norms = [True, False]
batch_size = 40
test_step_size = 3600  # evaluate on the test split every N training batches
n_epochs=1

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


# Compare the model with and without batch norm between encoder and decoder.
for batch_norm in batch_norms:

    model = LSTMAutoencoder(tokenizer, 
                            hidden_dim=512,
                            embedding_dim=64,
                            n_layers=2, 
                            batch_norm=batch_norm,
                            dropout=0,
                            max_len=100).to(device)
    
    writer = SummaryWriter(log_dir=f'batch_norms/{batch_norm}')
    try:
        callback = Callback(writer, test_loader, loss_function, delimeter=test_step_size)

        trainer(count_of_epoch=n_epochs, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)
    finally:
        # Flush and release the event file even when the run is interrupted,
        # so the scalars logged so far are not lost.
        writer.close()

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/37492 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/37492 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

Картина не такая, как ожидалось: результат с batch norm заметно хуже. Причина неочевидна, но примем это как есть и далее использовать его не будем.

Теперь посмотрим, дают ли прирост два слоя LSTM, которые использовались всё это время, относительно одного. (Большее число слоёв будет учиться хуже и дольше, а эксперименты и так занимают довольно много времени.)

In [9]:
batch_size = 40
n_layers_options = [1, 2]
test_step_size = 3600  # evaluate on the test split every N training batches
n_epochs=1

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


# Compare one LSTM layer against two, one TensorBoard run per setting.
for n_layers in n_layers_options:

    model = LSTMAutoencoder(tokenizer, 
                            hidden_dim=512,
                            embedding_dim=64,
                            n_layers=n_layers, 
                            batch_norm=False,
                            dropout=0,
                            max_len=100).to(device)
    
    writer = SummaryWriter(log_dir=f'different_number_of_layers/{n_layers}')
    try:
        callback = Callback(writer, test_loader, loss_function, delimeter=test_step_size)

        trainer(count_of_epoch=n_epochs, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)
    finally:
        # Flush and release the event file even when the run is interrupted,
        # so the scalars logged so far are not lost.
        writer.close()

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/37492 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/37492 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

  0%|          | 0/1974 [00:00<?, ?it/s]

Оверфита выше не видно, поэтому едва ли добавление dropout-а чем-то поможет, но всё-таки проверим.

In [12]:
batch_size = 40
dropouts = [0, 0.15, 0.3]
test_step_size = 3600  # evaluate on the test split every N training batches
n_epochs=1

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


# Sweep over the LSTM dropout value, one TensorBoard run per setting.
for dropout in dropouts:

    model = LSTMAutoencoder(tokenizer, 
                            hidden_dim=512,
                            embedding_dim=64,
                            n_layers=2, 
                            batch_norm=False,
                            dropout=dropout,
                            max_len=100).to(device)
    
    writer = SummaryWriter(log_dir=f'dropouts/{dropout}')
    try:
        callback = Callback(writer, test_loader, loss_function, delimeter=test_step_size)

        trainer(count_of_epoch=n_epochs, 
                batch_size=batch_size, 
                loader=train_loader,
                model=model, 
                loss_function=loss_function,
                optimizer=optimizer,
                lr=lr,
                callback=callback)
    finally:
        # Flush and release the event file even when the run is interrupted,
        # so the scalars logged so far are not lost.
        writer.close()

Собственно, как и ожидалось, dropout делает только слегка хуже на первой эпохе. Есть ли улучшения дальше, увидеть не представляется возможным, поскольку этот ноутбук в сумме уже и так работал около 20 часов — хватит с этой задачи.