In [1]:
# Load the raw training corpus into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('datasets/shakespeare.txt', encoding='utf-8') as file:
    text = file.read()

In [2]:
import torch

# Use the GPU when PyTorch reports one as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
from tokenizer.bpe_tokenizer import BytePairEncodingTokenizer
from pipelines.text_to_tensor import create_pipeline

# Keep only the first 1000 characters of the corpus for this experiment.
# NOTE(review): if the 0.9 passed to create_pipeline below is a train fraction,
# the resulting test split may be shorter than the context_length (64) that
# get_batch requires — confirm.
text = text[:1000]

# Fit the byte-pair-encoding tokenizer on the (truncated) corpus.
tokenizer = BytePairEncodingTokenizer(100)
fit_corpus = [text]
tokenizer.fit(fit_corpus)

# Report the vocabulary size obtained after fitting.
print(tokenizer.vocab_size)

# Build the text-to-tensor pipeline and produce the two data splits.
pipeline = create_pipeline(tokenizer, 0.9)
data_splits = pipeline.transform(text)
train_data, test_data = data_splits

195


In [4]:
from enumerations.data_splits import DataSplits

In [5]:
from typing import Tuple

import torch

# Number of windows sampled per batch.
batch_size = 64
# Number of tokens in each sampled window.
context_length = 64

def get_batch(split: DataSplits, batch_size: int, context_length: int, device: str = 'cpu') -> Tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``DataSplits.TRAIN`` reads from ``train_data``; any other value
            reads from ``test_data``.
        batch_size: Number of windows to sample.
        context_length: Length of each window.
        device: Device the returned tensors are moved to.

    Returns:
        A tuple ``(x, y)`` of stacked windows. ``y`` holds the same windows as
        ``x`` shifted one position to the right, i.e. the next-token targets.

    Raises:
        AssertionError: If the split does not contain at least
            ``context_length + 1`` items.
    """
    data = train_data if split == DataSplits.TRAIN else test_data

    # Each target window ends one position past its input window, so the split
    # needs at least context_length + 1 items. Equality used to pass the check
    # and then fail inside torch.randint with an empty sampling range.
    assert len(data) - context_length > 0, 'Length of data must be greater than context_length'

    # Start offsets are drawn from [0, len(data) - context_length), which keeps
    # the shifted target slice inside the data.
    starts = torch.randint(len(data) - context_length, (batch_size, )).tolist()
    x = torch.stack([data[i:i + context_length] for i in starts])
    y = torch.stack([data[i + 1:i + context_length + 1] for i in starts])
    return x.to(device), y.to(device)

In [7]:
from language_model.model import LanguageModel

# Model hyperparameters passed positionally to LanguageModel below.
embed_size = 512
num_layers = 6
num_heads = 8
forward_expansion = 4
dropout = 0

# Build the model; the tokenizer, the context length and the selected device
# are handed to the constructor alongside the hyperparameters above.
model = LanguageModel(
    tokenizer, embed_size, context_length,
    num_layers, num_heads, forward_expansion,
    dropout, device,
)

# Report the size of the model's encoder in millions of parameters.
encoder_param_count = sum(p.numel() for p in model.encoder.parameters())
print(encoder_param_count / 1e6, 'M parameters')

19.138755 M parameters


In [None]:
@torch.no_grad()
def estimate_loss(eval_iters: int = 50):
    """Estimate the mean loss of ``model`` on the train and test splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and the losses returned by the model are averaged. Gradients are disabled
    and the model is put in eval mode for the duration of the call.

    Args:
        eval_iters: Number of batches averaged per split (default 50).

    Returns:
        A dict mapping ``DataSplits.TRAIN`` and ``DataSplits.TEST`` to a scalar
        tensor holding the mean loss for that split.
    """
    out = {}
    model.eval()
    try:
        for split in [DataSplits.TRAIN, DataSplits.TEST]:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                # Batches go to the same `device` the model was constructed
                # with; get_batch would otherwise default to the CPU.
                x, y = get_batch(split, batch_size, context_length, device)
                _, loss = model(x, y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Switch back to training mode even if a batch fails, so a later
        # training step does not silently run in eval mode.
        model.train()
    return out

In [None]:
# Training schedule: total number of optimisation steps and how often the
# train/test loss is estimated and printed.
max_iters = 50
eval_interval = 5

# NOTE(review): lr=0.05 is kept from the original; it looks high for AdamW on a
# model of this size — confirm it is intentional.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.05)

# `step` is used instead of `iter` so the builtin iter() is not shadowed for
# the rest of the kernel session.
for step in range(max_iters):
    # Report losses every `eval_interval` steps and on the final step.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses[DataSplits.TRAIN]:.4f}, test loss {losses[DataSplits.TEST]:.4f}")

    # Place the batch on the same `device` the model was constructed with.
    x_batch, y_batch = get_batch(DataSplits.TRAIN, batch_size, context_length, device)

    logits, loss = model(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate 500 new tokens starting from a single token with id 0 and decode
# them back to text. No gradients are needed while sampling.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated = model.generate(context, max_new_tokens=500)
print(tokenizer.decode(generated[0].tolist()))

In [None]:
from language_model.model import LanguageModel

# NOTE(review): this cell repeats the earlier model-construction cell. Running
# it rebinds `model` to a newly constructed LanguageModel, so any training done
# on the previous instance is no longer reachable through `model`.
embed_size = 512
num_layers = 6
num_heads = 8
forward_expansion = 4
dropout = 0

# `device` is passed here as well, so this instance is built the same way as
# the first construction of the model in this notebook.
model = LanguageModel(
    tokenizer,
    embed_size,
    context_length,
    num_layers,
    num_heads,
    forward_expansion,
    dropout,
    device
)