In [1]:
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
from tqdm.notebook import tqdm

In [4]:
with open("sherlock.txt", encoding='UTF-8') as fin:
    text = fin.read()
    tok = Tokenizer.from_pretrained("bert-base-cased")
    sherlock_tokens = tok.encode(text).tokens

In [5]:
len(sherlock_tokens)

136135

Having a class to handle the vocabulary will make it convenient for us!

In [6]:
class Vocab:
    def __init__(self, tokens):
        self.vocab = [tok for tok, count in Counter(tokens).most_common()]
        self.tok2idx = {tok: idx + 1 for idx, tok in enumerate(self.vocab)}
        self.tok2idx[0] = "[UNK]"
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
    
    def __len__(self):
        return len(self.tok2idx)
    
    def to_id(self, tok):
        return self.tok2idx.get(tok, 0)

    def to_tok(self, id):
        return self.idx2tok.get(id, "[UNK]")


In [7]:
vocab = Vocab(sherlock_tokens)

In [11]:

def ngrams(tokens, n):
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

# tokens = ngrams(sherlock_tokens, 3)

A Dataset class will let us easily batch the data with a DataLoader. You need to override two methods: `__len__` (for `len(data)`)and `__getitem__` (for `data[idx]`). We will make a bigram language model, so the input is one token and the output is also one token.

In [12]:
class LMDataset(Dataset):
    def __init__(self, tokens):
        self.data = []
        for gram in ngrams(tokens, 2):
            x = torch.LongTensor([vocab.to_id(gram[0])])  # (1)

            y = torch.LongTensor([vocab.to_id(gram[1])])  # (1)
            y = torch.nn.functional.one_hot(y, len(vocab))  # (1, V); need onehot to compute softmax
            y = y.float().squeeze(0)  # (V); float so we can compute loss (V); squeeze removes dimensions of size 1
            
            self.data.append((x, y))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [13]:
# split the data into train and dev
train_part = int(len(sherlock_tokens) * 0.8)
train_data = LMDataset(sherlock_tokens[:train_part])
dev_data = LMDataset(sherlock_tokens[train_part:])

In [14]:
train_data[0]

(tensor([5166]), tensor([0., 0., 0.,  ..., 0., 0., 0.]))

In [15]:
torch.nn.functional.one_hot(torch.LongTensor([5]), 10)

tensor([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [16]:
torch.argmax(train_data[1][1])

tensor(5167)

In [17]:
vocab.to_tok(5167)

'Adventures'

A DataLoader lets us easily create minibatches of the data. Computing gradients on a minibatch is usually better than
- a single point (can jump around randomly)
- the whole data (computationally intensive)

Notice that when iterating through a dataloader, the batch size is the first dimension.

In [18]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=16, shuffle=True)

In [19]:
for x, gold in train_data:
    print(x)
    print(x.shape)
    print(gold)
    print(gold.shape)
    break

tensor([5166])
torch.Size([1])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
torch.Size([7964])


**Your task**: try removing `.squeeze(0)` when creating the dataset, then run the above cell to check the shape of the tensors

In [None]:
class FFLLM(torch.nn.Module):
    def __init__(self, voc_size):
        super().__init__()
        self.emb = torch.nn.Embedding(voc_size, 300)
        self.linear1 = torch.nn.Linear(300, 300) 
        self.linear2 = torch.nn.Linear(300, voc_size)
        
    def forward(self, x):
        e = self.emb(x)
        e = e.squeeze()  # Your task: figure out why this is necessary
        h = self.linear1(e)
        y = self.linear2(h)
        return y #torch.softmax(y, dim=0)

In [21]:
x

tensor([5166])

In [22]:
emb = torch.nn.Embedding(len(vocab), 300)
linear1 = torch.nn.Linear(300, 300)
linear2 = torch.nn.Linear(300, len(vocab))

In [23]:
x.shape

torch.Size([1])

In [24]:
e = emb(x)
print(e.shape)
e = e.squeeze()
e.shape

torch.Size([1, 300])


torch.Size([300])

In [None]:
h = linear1(e)
h.shape

torch.Size([300])

In [28]:
y = linear2(h)
y.shape

torch.Size([7964])

In [29]:
y2 = torch.softmax(y, dim=0)
y2.shape

torch.Size([7964])

In [31]:
y

tensor([-0.1338,  0.2139, -0.3422,  ..., -0.3007,  0.0190, -0.1577],
       grad_fn=<ViewBackward0>)

In [30]:
model = FFLLM(len(vocab))
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters())

In [1]:
loss_func(gold, y2)

NameError: name 'loss_func' is not defined

In [33]:
with torch.no_grad():
    for x, y in train_data:
        print(model(x))
        print(y)
        break

tensor([ 0.2977,  0.3314, -0.7737,  ...,  0.0036, -0.0572,  0.2371])
tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [35]:
for epoch in range(1):
    print("Epoch", epoch)
    for x, y in tqdm(train_data):
        model.zero_grad()
        pred = model(x)
        loss = loss_func(pred, y)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        total_loss = 0
        for x, y in train_data:
            pred = model(x)
            loss = loss_func(pred, y)
            total_loss += loss
        print("train loss:", total_loss / len(train_loader))

        total_loss = 0
        for x, y in dev_data:
            pred = model(x)
            loss = loss_func(pred, y)
            total_loss += loss
        print("dev loss:", total_loss / len(dev_loader))

Epoch 0


  0%|          | 0/108907 [00:00<?, ?it/s]

train loss: tensor(88.0294)
dev loss: tensor(90.5074)


*Your tasks*:
- make this model a trigram language model (i.e. take two inputs). Look into the `torch.cat` function. You will have to modify several different parts of this notebook
- use a data loader, which handles batching
- you will need to modify the below function as well

In [36]:
with torch.no_grad():
    result = ["A"]

    for i in range(20):
        idx = vocab.to_id(result[-1])
        pred = model(torch.LongTensor([[idx]]))
        next_idx = torch.argmax(pred).item
        tok = vocab.to_tok(idx)
        result.append(tok)

In [37]:
result

['A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A']