In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Sampler
import numpy as np
import matplotlib.pyplot as plt
import string
from datasets import load_dataset
from transformers import AutoTokenizer

In [6]:
seed = 1234
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
def filter_seq(e):
    seq = e['text']
    if len(seq) < 499 and len(seq) > 10:
        return True
    return False
dataset = load_dataset("large_spanish_corpus", "all_wikis", split='train').shuffle(seed=seed)
dataset = dataset.train_test_split(train_size=100000, test_size=1000)
dataset = dataset.filter(filter_seq)

Reusing dataset large_spanish_corpus (/home/rodrigo/.cache/huggingface/datasets/large_spanish_corpus/all_wikis/1.1.0/f71a935424f00d2356deff29366f4b499ce0e22957180f5420da5acbbb50e2ec)
Loading cached shuffled indices for dataset at /home/rodrigo/.cache/huggingface/datasets/large_spanish_corpus/all_wikis/1.1.0/f71a935424f00d2356deff29366f4b499ce0e22957180f5420da5acbbb50e2ec/cache-b90856c99c6bfc66.arrow


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [9]:
special_characters = ['<p>', '<s>', '</s>', '<uk>']
vocab = string.ascii_letters + '1234567890áéíóú' + '.,:;-<>"" '
vocab = special_characters + list(vocab)
max_length = 500

def tokenize(seq, pad=False, max_length=None):
    tokens = [1] #sos
    for char in seq:
        try:
            index = vocab.index(char)
            tokens.append(index)
        except:
            tokens.append(3) #ukn
    tokens.append(2) #eos
    lengths = [1 for _ in tokens]
    if pad:
        pads = max_length - len(tokens)
        tokens += [0 for _ in range(pads)]
        lengths += [0 for _ in range(pads)]
    return {'input_ids': torch.tensor(tokens)}

def encode(e):
    return tokenize(e['text'], pad=True, max_length=500)

In [10]:
vocab_size = len(vocab)
batch_size = 32
dataset = dataset.map(encode)
dataset.set_format(type='torch', columns=['input_ids'])

HBox(children=(FloatProgress(value=0.0, max=91239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=901.0), HTML(value='')))




In [11]:
dataloader = torch.utils.data.DataLoader(dataset['train'], 
                                         batch_size=batch_size, shuffle=True)
batch = next(iter(dataloader))
batch

  return torch.tensor(x, **format_kwargs)


{'input_ids': tensor([[ 1, 30, 15,  ...,  0,  0,  0],
         [ 1, 44,  6,  ...,  0,  0,  0],
         [ 1, 48, 12,  ...,  0,  0,  0],
         ...,
         [ 1, 34, 17,  ...,  0,  0,  0],
         [ 1, 30, 80,  ...,  0,  0,  0],
         [ 1, 34, 15,  ...,  0,  0,  0]])}

In [12]:
class LMGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, dropout=0.1):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, x, last_hidden=None):
        x = self.embeddings(x)
        #x = nn.utils.rnn.pack_padded_sequence(x, lengths,
                                            # enforce_sorted=False)
        out, h = self.gru(x, last_hidden)
        #out, _ = nn.utils.rnn.pad_packed_sequence(out)
        out = self.out(out)
        out = self.softmax(out)
        return out, h

In [13]:
lstm = LMGRU(vocab_size, 512, 128, 8, 0.1)
o, h = lstm(batch['input_ids'].transpose(0,1),)

In [1]:
@torch.no_grad()
def validate(model, criterion, loader):
    loss_sum = 0
    total_loss = 0
    for i, example in enumerate(loader):
        x, target = example['input_ids'].transpose(0,1)[:-1], example['input_ids'].transpose(0,1)[1:]
        x, target = x.to(device), target.to(device)
        hidden = None
        loss = 0
        topk = torch.ones(1, x.size(1), 1).to(device)
        for t in range(0, x.size(0)):
            output, hidden = model(x[t,:].unsqueeze(0),
                               hidden)
            l = criterion(output.squeeze(0), target[t, :])
            loss += l  
            topk_v, topk_i = output.topk(1, dim=2)
            topk = torch.cat((topk,topk_i), dim=0)
        loss_sum += loss/x.size(0)
        total_loss += 1
    #calculate metrics
    final_loss = loss_sum / total_loss
    perplexity = torch.exp(final_loss)
    
    #detokenize some sentence
    batch_example = random.randint(0,x.size(1)-1)
    input_sentence = detokenize(x.transpose(0,1)[0,:])
    output_sentence = detokenize(topk.transpose(0,1)[0, :])
    
    #print everything
    validation_info = """-----------------------------------------------------
    Validation:
    loss: %.4f, perplexity: %.4f
    input sentence: %s
    output sentence: %s
    ------------------------------------------------
    """ % (final_loss, perplexity, input_sentence, output_sentence)
    print(validation_info)
    return final_loss, perplexit

NameError: name 'torch' is not defined

In [14]:
def train_step(model, criterion, optimizer, inputs):
    x, target = inputs
    x, target = x.to(device), target.to(device)
    #loss_lengths = lengths.to(device)
    hidden = None
    
    model.zero_grad()
    loss = 0
    output = x[0:1,:]
    for t in range(0, x.size(0)):
        output, hidden = model(x[t,:].unsqueeze(0),
                               hidden)
        l = criterion(output.squeeze(0), target[t, :])
        loss += l
    loss.backward()
    optimizer.step()
    
    return output, loss/x.size(0)

In [15]:
def train(model, criterion, optimizer, loader, epochs, print_every=10, save_every=1000):
    for epoch in range(1,epochs+1):
        for i, example in enumerate(loader):
            inputs = example['input_ids'].transpose(0,1)[:-1], example['input_ids'].transpose(0,1)[1:]
            #lengths = example['lengths'].transpose(0,1)
            output, loss = train_step(model, criterion, optimizer, inputs)
            if i % print_every == 0:
                print('epoch: %.d, iter: %.d, loss: %.4f' % 
                     (epoch, i, loss))
            if i % save_every == 0:
                torch.save({
                    'iteration': i,
                    'epoch': epoch,
                    'model': model.state_dict()
                }, './models/small-test/{}_{}.tar'.format(epoch,i))
def one_batch(model, criterion, optimizer, batch, iters):
    inputs = batch['input_ids'].transpose(0,1)[:-1], batch['input_ids'].transpose(0,1)[1:]
    #lengths = batch['lengths'].transpose(0,1)
    for i in range(iters):
        output, loss = train_step(model, criterion, optimizer, inputs)
        if i % 1 == 0:
            print('iter: %.d, loss: %.4f' % 
                 (i, loss))
        

In [None]:
lr = 1e-3

model = LMGRU(vocab_size, 128, 64, 8, 0.1).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.NLLLoss()

def maskNLLLoss(output, target, lengths):
    print(lengths)
    print(target)
    total = lengths.sum()
    cross_entropy = -torch.log(torch.gather(output, 1, target.view(-1,1)).squeeze(1))
    print(cross_entropy)
    loss = (cross_entropy*lengths).sum() / total
    loss = loss.to(device)
    return loss

#one_batch(model, criterion, optimizer, batch, 500) # overfit one batch
train(model, criterion, optimizer, dataloader, 2, 10) # train the model

epoch: 1, iter: 0, loss: 3.4660
epoch: 1, iter: 10, loss: 3.3504
epoch: 1, iter: 20, loss: 3.3054
epoch: 1, iter: 30, loss: 3.2551
epoch: 1, iter: 40, loss: 3.2251
epoch: 1, iter: 50, loss: 3.1872
epoch: 1, iter: 60, loss: 3.2235
epoch: 1, iter: 70, loss: 3.1042
epoch: 1, iter: 80, loss: 3.1756
epoch: 1, iter: 90, loss: 3.2290
epoch: 1, iter: 100, loss: 3.2030
epoch: 1, iter: 110, loss: 3.1262
epoch: 1, iter: 120, loss: 3.1847
epoch: 1, iter: 130, loss: 3.2515
epoch: 1, iter: 140, loss: 3.1146
epoch: 1, iter: 150, loss: 3.1223
epoch: 1, iter: 160, loss: 3.1166
epoch: 1, iter: 170, loss: 3.1083
epoch: 1, iter: 180, loss: 3.0865
epoch: 1, iter: 190, loss: 3.0430
epoch: 1, iter: 200, loss: 3.1427
epoch: 1, iter: 210, loss: 3.0463
epoch: 1, iter: 220, loss: 3.1214
epoch: 1, iter: 230, loss: 3.0082
epoch: 1, iter: 240, loss: 2.9451
epoch: 1, iter: 250, loss: 3.0754
epoch: 1, iter: 260, loss: 3.0035
epoch: 1, iter: 270, loss: 3.0120
epoch: 1, iter: 280, loss: 2.9697
epoch: 1, iter: 290, loss

In [13]:
tokenizer = AutoTokenizer.from_pretrained('xlnet-large-cased', keep_accents=True, bos_token='<s>', eos_token='</s>',unk_token='<u>', pad_token='<p>', padding_side='left')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





In [14]:
def encode(e):
    return tokenizer(e['text'], add_special_tokens=True,  padding='longest')
dataset = dataset.map(encode)

HBox(children=(FloatProgress(value=0.0, max=911021.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=901.0), HTML(value='')))




In [15]:
dataset['train'][0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids': [2073,
  1182,
  1284,
  46,
  12806,
  22184,
  2483,
  17,
  6884,
  556,
  3879,
  17,
  12,
  874,
  4287,
  12,
  19,
  17,
  7522,
  4304,
  17,
  254,
  2483,
  2605,
  101,
  321,
  13953,
  3423,
  202,
  17,
  117,
  4425,
  13792,
  5074,
  11760,
  772,
  5894,
  1868,
  150,
  9,
  4,
  3],
 'text': 'En 2011 protagonizó la película "Hick", basada en la novela de Andrea Portes y dirigida por Derick Martini.',
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2]}

In [18]:
dataset['train'][1]['input_ids'].size()

torch.Size([18])