- Add setuptools packaging and a requirements file
- Separate the model code from the generation code
- Provide an interface for generation/chat
- Handle EOS (end-of-sequence) tokens
- Follow the Google style guide

In [None]:
import torch
# Seed PyTorch's global random number generator.
torch.manual_seed(1337)

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hyperparameters; these are handed to generation.LanguageModel together
# with `device`.
context_length = 256
embed_size = 512
num_layers = 6
num_heads = 8
forward_expansion = 4
dropout = 0.2

In [None]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default (which differs between operating systems).
with open('datasets/shakespeare.txt', encoding='utf-8') as file:
    text = file.read()

In [None]:
from tokenizer import bpe_tokenizer
from pipelines import text_to_tensor

# Keep only the first 10,000 characters of the corpus.
max_chars = 10000
text = text[:max_chars]

# Load the tokenizer stored on disk.
# NOTE(review): read_pkl presumably unpickles this file; only load tokenizer
# files from a trusted source.
tokenizer_path = './tokenizer/trained_tokenizers/bpe.pkl'
tokenizer = bpe_tokenizer.BytePairEncodingTokenizer.read_pkl(tokenizer_path)

# NOTE(review): 0.9 looks like the train/test split fraction; confirm against
# text_to_tensor.create_pipeline.
pipeline = text_to_tensor.create_pipeline(tokenizer, 0.9)

# Run the raw text through the pipeline, which yields the train and test data.
train_data, test_data = pipeline.transform(text)

In [None]:
from language_model import generation

# Build the language model from the tokenizer and the hyperparameters set
# earlier in the notebook. Arguments are positional, so their order matters.
model = generation.LanguageModel(
    tokenizer,
    embed_size,
    context_length,
    num_layers,
    num_heads,
    forward_expansion,
    dropout,
    device,
)

# Report the size of the model's encoder in millions of parameters.
# Only `model.encoder` is counted here.
encoder_param_count = sum(p.numel() for p in model.encoder.parameters())
print(encoder_param_count / 1e6, 'M parameters')

In [None]:
# Generate 100 new tokens from a single-space prompt before the training
# cell has run, as a baseline to compare against the post-training sample.
prompt = ' '
model.predict(prompt, max_new_tokens=100)

In [None]:
from language_model import train

# Training configuration, constructed with no overrides.
args = train.TrainArgs()

# Bind the configuration, the model and both data splits to a trainer.
trainer = train.ModelTrainer(
    args,
    model,
    train_data,
    test_data,
)

# Run the training loop.
trainer.train()

In [None]:
# Generate 100 new tokens from a single-space prompt after training.
# The token budget is passed by keyword, matching the pre-training call, so
# the two samples are requested in exactly the same way.
# NOTE(review): assumes the second positional parameter of predict is
# max_new_tokens; confirm against LanguageModel.predict.
model.predict(' ', max_new_tokens=100)