In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import numpy as np
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
import pandas as pd

# Load the table from the local file twitter.csv.
data = pd.read_csv('twitter.csv')
# Last expression of the cell: shows the first rows as the cell output.
data.head()

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T


Автоэнкодер для текста логично строить на рекуррентных сетях, поскольку они могут кодировать последовательность любой длины в представление заранее фиксированного размера. Так и поступим.

In [14]:
from transformers import AutoTokenizer, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")  # not used: its vocabulary has more than 500,000 entries

# Use the Mistral-7B tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# Reuse the end-of-sequence token id as the padding token id
# (presumably because this tokenizer has no pad token of its own - TODO confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. The fixed random_state makes the
# shuffled split reproducible across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.1, shuffle=True, random_state=42)

class MyDataset(Dataset):
    """Map-style dataset that tokenizes one message per item.

    Args:
        data: Table with a "message" column; ``data["message"]`` must provide
            ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable invoked with a one-element list of text plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments; must return a mapping with an ``"input_ids"``
            entry.
        max_length: Maximum number of tokens kept per message; longer
            messages are truncated by the tokenizer.
    """

    def __init__(self, data, tokenizer, max_length=100):
        self.messages = data["message"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of messages in the dataset."""
        # The dataset stores no labels, so its size is the number of messages.
        return len(self.messages)

    def __getitem__(self, idx):
        """Return the tokenizer's ``input_ids`` for message ``idx``."""
        # Use the instance's own limit rather than a module-level global, so
        # the dataset honours the max_length it was constructed with.
        # NOTE(review): only one text is tokenized per call, so padding=True
        # has nothing to pad against; items may differ in length and batching
        # presumably needs a padding collate_fn - confirm against the loader.
        encoded = self.tokenizer([self.messages[idx]], padding=True,
                                 truncation=True, max_length=self.max_length,
                                 return_tensors="pt")
        return encoded["input_ids"]


# Token limit shared by both splits.
max_length = 100
# Wrap the train and test frames in the same dataset type with the same settings.
train_dataset, test_dataset = (
    MyDataset(split, tokenizer, max_length=max_length)
    for split in (train_data, test_data)
)

In [23]:
class Encoder(nn.Module):
    """LSTM encoder that compresses a token sequence into a fixed-size code.

    The code is the concatenation of the final hidden state and the final
    cell state of every LSTM layer.

    Args:
        dict_size: Number of rows in the token embedding table.
        output_dim: Size of the code per layer; the LSTM hidden size is
            ``output_dim // 2`` because hidden and cell states are joined.
        embedding_dim: Size of the token embeddings.
        n_layers: Number of stacked LSTM layers.
        batch_norm: If true, apply ``BatchNorm1d`` to the code features.
        dropout: Dropout passed to ``nn.LSTM``.
    """

    def __init__(self,
                 dict_size,
                 output_dim=128,  # == 2 * layer_size
                 embedding_dim=128,
                 n_layers=1,
                 batch_norm=False,
                 dropout=0):
        super().__init__()

        hidden_size = output_dim // 2

        self.embedding = nn.Embedding(dict_size, embedding_dim)
        self.n_layers = n_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=n_layers,
                            batch_first=True,
                            dropout=dropout)

        # None (not False) marks "no normalization" so the check in forward()
        # does not rely on the truthiness of an nn.Module.
        self.batch_norm = nn.BatchNorm1d(2 * hidden_size) if batch_norm else None

    def forward(self, x):
        """Encode token ids ``x`` of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, n_layers, 2 * (output_dim // 2)): for each
            layer, the final hidden state followed by the final cell state.
        """
        x = self.embedding(x)

        _, (h_n, c_n) = self.lstm(x)

        # (n_layers, batch, hidden) -> (batch, n_layers, hidden), then join
        # hidden and cell states along the feature axis.
        out = torch.cat([h_n.permute(1, 0, 2), c_n.permute(1, 0, 2)], dim=-1)
        if self.batch_norm is not None:
            # BatchNorm1d treats dim 1 as channels, so move the feature axis
            # there for normalization and then restore the layout.
            out = self.batch_norm(out.transpose(1, 2)).transpose(1, 2)

        return out


class Decoder(nn.Module):
    """Greedy LSTM decoder that unrolls a fixed number of steps from a code.

    The code supplies the initial hidden and cell states. Decoding starts
    from the BOS token, and at every step the most likely token is fed back
    as the next input.

    Args:
        dict_size: Vocabulary size (embedding rows and output logits).
        input_dim: Per-layer size of the incoming code; its first half is used
            as the hidden state and its second half as the cell state, so the
            LSTM hidden size is ``input_dim // 2``.
        embedding_dim: Size of the token embeddings fed to the LSTM.
        n_layers: Number of stacked LSTM layers; must match the code's layer
            axis.
        dropout: Dropout passed to ``nn.LSTM``.
        max_len: Number of decoding steps.
        bos_token_id: Id of the token that starts decoding.
    """

    def __init__(self,
                 dict_size,
                 input_dim=128,  # == 2 * layer_size
                 embedding_dim=128,
                 n_layers=1,
                 dropout=0,
                 max_len=100,
                 bos_token_id=1):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_size = input_dim // 2

        # The LSTM consumes token embeddings, so its input size is
        # embedding_dim. Its state size is half of the code size.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size=self.hidden_size,
                            num_layers=n_layers,
                            batch_first=True,
                            dropout=dropout)
        # Vocabulary projection: nn.LSTM's proj_size must be smaller than
        # hidden_size, so it cannot be used to project onto the vocabulary.
        self.linear = nn.Linear(self.hidden_size, dict_size)

        self.max_len = max_len
        self.embedding = nn.Embedding(dict_size, embedding_dim)
        self.bos_token_id = bos_token_id

    def forward(self, encoder_output):
        """Decode ``max_len`` steps from ``encoder_output``.

        Args:
            encoder_output: Tensor of shape (batch, n_layers, 2 * hidden_size).

        Returns:
            Log-probabilities of shape (batch, max_len, dict_size).
        """
        # Split the code into hidden/cell halves and move the layer axis first,
        # as nn.LSTM expects states of shape (n_layers, batch, hidden_size).
        h = encoder_output[:, :, :self.hidden_size].permute(1, 0, 2).contiguous()
        c = encoder_output[:, :, self.hidden_size:].permute(1, 0, 2).contiguous()

        # Start every sequence from BOS, on the same device as the code.
        bos = torch.full((encoder_output.shape[0], 1), self.bos_token_id,
                         dtype=torch.long, device=encoder_output.device)
        cur_token_emb = self.embedding(bos)

        logps = []

        for _ in range(self.max_len):
            output, (h, c) = self.lstm(cur_token_emb, (h, c))
            next_logp = F.log_softmax(self.linear(output), dim=-1)
            logps.append(next_logp)

            # Greedy feedback; argmax is not differentiable, so detach first.
            cur_token_emb = self.embedding(torch.argmax(next_logp.detach(), dim=-1))

        return torch.cat(logps, dim=1)


class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder: an Encoder followed by a Decoder.

    The vocabulary size is ``len(tokenizer)`` and the decoder's start token
    is ``tokenizer.bos_token_id``. ``hidden_dim`` is used both as the
    encoder's ``output_dim`` and as the decoder's ``input_dim``.
    """

    def __init__(self,
                 tokenizer,
                 hidden_dim=128,
                 embedding_dim=128,
                 n_layers=1,
                 batch_norm=False,
                 dropout=0,
                 max_len=100):
        super().__init__()

        self.tokenizer = tokenizer
        self.dict_size = len(tokenizer)
        self.bos_token_id = tokenizer.bos_token_id

        # Settings that the encoder and the decoder have in common.
        shared = dict(dict_size=self.dict_size,
                      embedding_dim=embedding_dim,
                      n_layers=n_layers,
                      dropout=dropout)

        self.encoder = Encoder(output_dim=hidden_dim,
                               batch_norm=batch_norm,
                               **shared)

        self.decoder = Decoder(input_dim=hidden_dim,
                               max_len=max_len,
                               bos_token_id=tokenizer.bos_token_id,
                               **shared)

    def forward(self, inp_tokens):
        """Encode ``inp_tokens`` and return the decoder's output for the code."""
        return self.decoder(self.encoder(inp_tokens))

In [1]:
def train_epoch(train_loader, model, loss_function, optimizer, callback=None):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable of batches. A batch is either an ``(x, y)``
            list/tuple or a single tensor; a single tensor is used as both
            input and target (autoencoder setting).
        model: Model passed on to ``train_on_batch``.
        loss_function: Loss passed on to ``train_on_batch``.
        optimizer: Optimizer instance passed on to ``train_on_batch``.
        callback: Optional callable ``callback(model, batch_loss)`` invoked
            after every batch under ``torch.no_grad()``.

    Returns:
        The sample-weighted mean batch loss, or ``nan`` when the loader
        yields no samples.
    """
    epoch_loss = 0
    total = 0
    for batch in tqdm(train_loader, leave=False):
        if isinstance(batch, (list, tuple)):
            batch_of_x, batch_of_y = batch
        else:
            # A bare tensor must not be unpacked along its batch axis; treat
            # it as an input that is also its own reconstruction target.
            batch_of_x = batch_of_y = batch

        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)

        epoch_loss += batch_loss * len(batch_of_x)
        total += len(batch_of_x)

    if total == 0:
        # An empty loader has no mean loss; avoid a ZeroDivisionError.
        return float("nan")
    return epoch_loss / total


def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Run one optimization step on a single batch and return its loss.

    Puts the model in training mode, clears the gradients, moves both tensors
    to the module-level ``device``, back-propagates the loss and applies the
    optimizer step.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    predictions = model(x_batch.to(device))
    targets = y_batch.to(device)
    batch_loss = loss_function(predictions, targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()


def trainer(count_of_epoch,
            batch_size,
            loader,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    """Train ``model`` for ``count_of_epoch`` epochs with a progress bar.

    ``optimizer`` is an optimizer class/factory: it is called with the model
    parameters and ``lr`` to create the instance used for training.
    ``batch_size`` is accepted but not used here. The mean loss of the latest
    epoch is shown in the progress bar postfix.
    """
    optima = optimizer(model.parameters(), lr=lr)

    postfix_key = 'train epoch loss'
    progress = tqdm(range(count_of_epoch), desc='epoch')
    # Placeholder until the first epoch finishes.
    progress.set_postfix({postfix_key: np.nan})

    for _ in progress:
        mean_loss = train_epoch(loader, model, loss_function, optima, callback)
        progress.set_postfix({postfix_key: mean_loss})


class Callback():
    """Per-batch logging hook for a TensorBoard-style writer.

    Every call logs the training loss under 'LOSS/train'. On every
    ``delimeter``-th call it also runs the model over ``test_loader`` and
    logs the fraction of argmax predictions equal to the targets under
    'ACC/test'. ``loss_function`` and ``batch_size`` are stored but not used
    by this class.
    """

    def __init__(self, writer, test_loader, loss_function, delimeter=100, batch_size=64):
        self.writer = writer
        self.loader = test_loader
        self.loss_function = loss_function
        self.delimeter = delimeter
        self.batch_size = batch_size
        # Number of calls made so far; used as the global step for the writer.
        self.step = 0

    def __call__(self, model, loss):
        return self.forward(model, loss)

    def forward(self, model, loss):
        """Log ``loss`` and, on evaluation steps, the test accuracy."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        # Only evaluate on every `delimeter`-th step.
        if self.step % self.delimeter != 0:
            return

        model.eval()
        predicted, expected = [], []
        for x_batch, y_batch in self.loader:
            output = model(x_batch.to(device))
            predicted += torch.argmax(output, dim=-1).cpu().numpy().tolist()
            expected += y_batch.numpy().tolist()

        test_acc = np.mean(np.array(predicted) == np.array(expected))
        self.writer.add_scalar('ACC/test', test_acc, self.step)

In [24]:
# Load the TensorBoard notebook extension and start TensorBoard on the
# ./task2 log directory, port 6002.
# NOTE(review): the recorded output of this cell reports port 6001, so it
# looks stale relative to this command - re-run to confirm.
%load_ext tensorboard
%tensorboard --logdir ./task2 --port=6002

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


ERROR: Failed to launch TensorBoard (exited with 4294967295).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.
E0319 17:58:34.807382  9308 program.py:298] TensorBoard could not bind to port 6001, it was already in use
ERROR: TensorBoard could not bind to port 6001, it was already in use

In [None]:
class LSTM_loss():
    """Next-token negative log-likelihood for sequence log-probabilities.

    Step ``t`` of the prediction is scored against token ``t + 1`` of the
    target, i.e. the first target token is treated as the given start token.

    Args:
        vocab_size: Size of the last axis of ``pred``.
        ignore_index: Target id excluded from the loss (e.g. the padding id).
            ``None`` means "ignore nothing real" and is mapped to -100, the
            default of ``F.nll_loss``, which requires an integer.
    """

    def __init__(self, vocab_size, ignore_index=None):
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index

    def __call__(self, pred, target):
        """Return the mean NLL of ``target[:, 1:]`` under ``pred``.

        Args:
            pred: Log-probabilities of shape (batch, pred_len, vocab_size).
            target: Token ids of shape (batch, target_len).
        """
        # Score only the steps both tensors cover. For equal lengths this is
        # pred[:, :-1] against target[:, 1:].
        steps = max(0, min(pred.shape[1], target.shape[1] - 1))

        pred_shifted = pred[:, :steps].reshape(-1, self.vocab_size)
        target_shifted = target[:, 1:1 + steps].reshape(-1)

        ignore_index = -100 if self.ignore_index is None else self.ignore_index

        return F.nll_loss(pred_shifted, target_shifted,
                          ignore_index=ignore_index)