In [1]:
import pandas as pd
from dataset_loaders import WikipediaDataset, WikipediaTokenizerDataset 
from torch.utils.data import DataLoader
from modeling import LSTMLM, Trainer, TrainerConfig
from tokenizers import ByteLevelBPETokenizer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
import torch.nn as nn

In [2]:
dataset = WikipediaTokenizerDataset('./data/wikipedia_articles_en/data.parquet')
loader = DataLoader(dataset, shuffle=True, batch_size=1)

In [3]:
dataset.frame.head(20)

Unnamed: 0,id,subject,previous_blocks,text
0,0,? Nycticebus linglom<sep>Taxonomy,,<s>? Nycticebus linglom was described in 1997 ...
1,1,? Nycticebus linglom<sep>Description,,"<s>The single known tooth, a third upper molar..."
2,2,? Nycticebus linglom<sep>Range and ecology,,"<s>Li Mae Long, the collection site of ? N. li..."
3,3,? Oryzomys pliocaenicus<sep>Discovery and context,,<s>The only known specimen of ? Oryzomys plioc...
4,4,? Oryzomys pliocaenicus<sep>Description,,<s>? Oryzomys pliocaenicus is known from a sin...
5,5,? Oryzomys pliocaenicus<sep>Interpretations,,<s>Hibbard wrote that the condition of the men...
6,6,.hack (video game series)<sep>Gameplay,,<s>.hack simulates an MMORPG; players assume t...
7,7,.hack (video game series)<sep>Plot,,<s></s>
8,8,.hack (video game series)<sep>Plot<sep>Setting,,<s>The .hack games are set in an alternate tim...
9,9,.hack (video game series)<sep>Plot<sep>Characters,,"<s>The main protagonist of .hack is Kite, a ne..."


In [4]:
len(dataset)

658824

In [None]:
special_tokens = ['<s>', '</s>', '<unk>', '<pad>', '<sep>']
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.normalizer = normalizers.NFKC()
tokenizer.decoders = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=50000,
    min_frequency=2,
    special_tokens=special_tokens
)
tokenizer.train_from_iterator(loader, trainer=trainer)
tokenizer.enable_padding(pad_token='<pad>', pad_id=4)
tokenizer.save("./custom_tokenizers/models/testing_tokenizer")

In [5]:
tokenizer = Tokenizer.from_file("./custom_tokenizers/models/testing_tokenizer")

In [6]:
batch_size = 8
train_dataset = WikipediaDataset('./data/wikipedia_articles_en/data.parquet')
dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

model = LSTMLM(50000, 512, 1024, 2, context_dim=512, residual=True, dropout=0.1)
criterion = nn.NLLLoss()
trainer = Trainer(model, tokenizer, criterion, train_dataset, None, TrainerConfig(batch_size=8, use_context=True))
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mrorroart[0m (use `wandb login --relogin` to force relogin)


  0%|          | 0/82353 [00:00<?, ?it/s]


TypeError: cannot unpack non-iterable NoneType object

In [None]:
torch.save_model()

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import string
from modeling import PlainLSTMLM

seed = 1234
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
def get_batch(inputs, tokenizer, device):
    encoded_inputs = tokenizer.encode_batch(inputs)
    inputs_ids = [encoded_input.ids for encoded_input in encoded_inputs]
    input_tensor = torch.LongTensor(inputs_ids).transpose(0,1)
    return input_tensor[:-1, :].to(device), input_tensor[1:, :].to(device)

In [45]:
get_batch(i, tokenizer, device)

(tensor([[    0,     0,     0,  ...,     0,     0,     0],
         [  543,  2062,   299,  ...,  1892,  6952,  9762],
         [14535, 21489,   260,  ...,   281,  8061,    16],
         ...,
         [    4,     4,   237,  ...,     4,     4,     4],
         [    4,     4,  2475,  ...,     4,     4,     4],
         [    4,     4,    18,  ...,     4,     4,     4]], device='cuda:0'),
 tensor([[  543,  2062,   299,  ...,  1892,  6952,  9762],
         [14535, 21489,   260,  ...,   281,  8061,    16],
         [   16, 42580,    17,  ...,  1273,  1514,  1720],
         ...,
         [    4,     4,  2475,  ...,     4,     4,     4],
         [    4,     4,    18,  ...,     4,     4,     4],
         [    4,     4,     1,  ...,     4,     4,     4]], device='cuda:0'))

In [56]:
@torch.no_grad()
def validate(model, criterion, loader):
    loss_sum = 0
    total_loss = 0
    for i, example in enumerate(loader):
        x, target = example['input_ids'].transpose(0,1)[:-1], example['input_ids'].transpose(0,1)[1:]
        x, target = x.to(device), target.to(device)
        hidden = None
        loss = 0
        topk = torch.ones(1, x.size(1), 1).to(device)
        for t in range(0, x.size(0)):
            output, hidden = model(x[t,:].unsqueeze(0),
                               hidden)
            l = criterion(output.squeeze(0), target[t, :])
            loss += l  
            topk_v, topk_i = output.topk(1, dim=2)
            topk = torch.cat((topk,topk_i), dim=0)
        loss_sum += loss/x.size(0)
        total_loss += 1
    #calculate metrics
    final_loss = loss_sum / total_loss
    perplexity = torch.exp(final_loss)
    
    #detokenize some sentence
    batch_example = random.randint(0,x.size(1)-1)
    input_sentence = detokenize(x.transpose(0,1)[0,:])
    output_sentence = detokenize(topk.transpose(0,1)[0, :])
    
    #print everything
    validation_info = """-----------------------------------------------------
    Validation:
    loss: %.4f, perplexity: %.4f
    input sentence: %s
    output sentence: %s
    ------------------------------------------------
    """ % (final_loss, perplexity, input_sentence, output_sentence)
    print(validation_info)
    return final_loss, perplexit

In [47]:
def train_step(model, criterion, optimizer, inputs):
    x, target = inputs
    x, target = x.to(device), target.to(device)
    #loss_lengths = lengths.to(device)
    hidden = None
    
    model.zero_grad()
    loss = 0
    output = x[0:1,:]
    for t in range(0, x.size(0)):
        output, hidden = model(x[t,:].unsqueeze(0),
                               hidden)
        l = criterion(output.squeeze(0), target[t, :])
        loss += l
    loss.backward()
    optimizer.step()
    
    return output, loss/x.size(0)

In [53]:
def train(model, tokenizer, criterion, optimizer, loader, epochs, print_every=10, save_every=1000):
    for epoch in range(1,epochs+1):
        for i, batch in enumerate(loader):
            input_batch = batch[2]
            inputs = get_batch(input_batch, tokenizer, device)
            #lengths = example['lengths'].transpose(0,1)
            output, loss = train_step(model, criterion, optimizer, inputs)
            if i % print_every == 0:
                print('epoch: %.d, iter: %.d, loss: %.4f' % 
                     (epoch, i, loss))
            if i % save_every == 0:
                torch.save({
                    'iteration': i,
                    'epoch': epoch,
                    'model': model.state_dict()
                }, './models/small-test/{}_{}.tar'.format(epoch,i))
def one_batch(model, tokenizer, criterion, optimizer, batch, iters):
    inputs = get_batch(batch[2], tokenizer, device)
    #lengths = batch['lengths'].transpose(0,1)
    for i in range(iters):
        output, loss = train_step(model, criterion, optimizer, inputs)
        if i % 1 == 0:
            print('iter: %.d, loss: %.4f' % 
                 (i, loss))
        

In [54]:
lr = 3e-4

model = PlainLSTMLM(50000, 512, 1024, 4, 0.1).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.NLLLoss()

def maskNLLLoss(output, target, lengths):
    print(lengths)
    print(target)
    total = lengths.sum()
    cross_entropy = -torch.log(torch.gather(output, 1, target.view(-1,1)).squeeze(1))
    print(cross_entropy)
    loss = (cross_entropy*lengths).sum() / total
    loss = loss.to(device)
    return loss
batch = next(iter(dataloader))
one_batch(model, tokenizer, criterion, optimizer, batch, 500) # overfit one batch
#train(model, criterion, optimizer, dataloader, 2, 10) # train the model

iter: 0, loss: 2.0796
iter: 1, loss: 2.0471
iter: 2, loss: 1.9400
iter: 3, loss: 1.8202
iter: 4, loss: 1.7865
iter: 5, loss: 1.7377
iter: 6, loss: 1.6900
iter: 7, loss: 1.6795
iter: 8, loss: 1.6270
iter: 9, loss: 1.6042
iter: 10, loss: 1.5851
iter: 11, loss: 1.5675
iter: 12, loss: 1.5511
iter: 13, loss: 1.5364
iter: 14, loss: 1.5270
iter: 15, loss: 1.5207
iter: 16, loss: 1.5150
iter: 17, loss: 1.5080
iter: 18, loss: 1.5053
iter: 19, loss: 1.5023
iter: 20, loss: 1.4992
iter: 21, loss: 1.4968
iter: 22, loss: 1.4940
iter: 23, loss: 1.4913
iter: 24, loss: 1.4889
iter: 25, loss: 1.4864
iter: 26, loss: 1.4835
iter: 27, loss: 1.4804
iter: 28, loss: 1.4770
iter: 29, loss: 1.4731
iter: 30, loss: 1.4673
iter: 31, loss: 1.4673
iter: 32, loss: 1.4646
iter: 33, loss: 1.4604
iter: 34, loss: 1.4563
iter: 35, loss: 1.4526
iter: 36, loss: 1.4467
iter: 37, loss: 1.4382
iter: 38, loss: 1.4304
iter: 39, loss: 1.4246
iter: 40, loss: 1.4115
iter: 41, loss: 1.4014
iter: 42, loss: 1.3919
iter: 43, loss: 1.379

In [77]:
with torch.no_grad():
    for i, example in enumerate(loader):

            hidden = None
            loss = 0
            topk = torch.ones(1, x.size(1), 1).to(device)
            token = ['<s>India']
            sentence = ''
            for t in range(0, 5):
                x, target = get_batch(token, tokenizer, device)
                x = x[0,:].unsqueeze(0)
                output, hidden = model(x,
                                   hidden)
                topk_v, topk_i = output.topk(1, dim=2)
                topk = torch.cat((topk,topk_i), dim=0)

KeyboardInterrupt: 

In [87]:
model.train(False)
output = model(torch.tensor([[0]], device=device))[0]
topk_v, topk_i = output.topk(2, dim=2)
topk_i

tensor([[[1, 0]]], device='cuda:0')

In [94]:
x, target = get_batch(['<s>India'], tokenizer, device)
x = tokenizer.encode('<s>')
out, hidden = model(torch.tensor([x.ids], device=device), None)
topk_v, topk_i = out.topk(1, dim=2)
topk_v

tensor([[[0.]]], device='cuda:0', grad_fn=<TopkBackward>)

In [5]:
l1 = [1,2,3,4]
l2 = [5,6,7,8]
for a, b in zip(l1, l2):
    print('{} and {}'.format(a, b))

1 and 5
2 and 6
3 and 7
4 and 8


In [98]:
x = tokenizer.encode('India')
out, hidden = model(torch.tensor([x.ids], device=device), hidden)
topk_v, topk_i = out.topk(1, dim=2)
topk_i

tensor([[[0]]], device='cuda:0')

In [10]:
torch.save(model, './models/test_overfit.tar')

NameError: name 'torch' is not defined