In [None]:
# Re-import edited modules automatically before each cell executes, so changes
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from tokenizers import SentencePieceBPETokenizer, Tokenizer
from torch.utils.data import DataLoader

from llama.data_pipeline import gutenberg_data, dataset
from llama.model.tokenizer import CharacterTokenizer
from llama.model.custom_layers import *
from llama.model.custom_blocks import *
from llama.model import model, training
from llama.constants import *


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Seed the global NumPy / PyTorch RNGs up front so that a fresh
# "Restart Kernel -> Run All" draws the same random numbers (e.g. for the
# shuffling of the training DataLoader) on every run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Configuration object for model parameters (handed to model.Llama further down)
CONFIG = {
    'vocab_size': -1,        # placeholder; overwritten once the tokenizer has been fitted/loaded
    'batch_size': 128,       # number of sequences per batch (DataLoader batch_size)
    'epochs': 5,             # number of training epochs
    'context_window': 32,    # length of each input (x) / target (y) sequence given to TextDataset
                             # NOTE(review): presumably counted in tokens, not characters - confirm in TextDataset
    'd_model': 128,          # model width
    'n_heads': 8,            # number of attention heads
    'n_layers': 4,           # number of layers
}

# All artefacts of this experiment (tokenizer, model checkpoint) are stored in one folder.
experiment_dir = os.path.join(EXPERIMENTS_DIR, 'sentencepiece_llama_gutenberg')
os.makedirs(experiment_dir, exist_ok=True)

# Load dataset and fit tokenizer

In [None]:
# Ask the project's Gutenberg helper for the data directory; the result is passed
# straight to load_gutenberg_data in the next cell.
# NOTE(review): the helper's name suggests it may mainly explain how to obtain the
# subset - confirm that it really returns a usable path.
data_dir = gutenberg_data.how_to_get_gutenberg_subset()

In [None]:
# Load the corpus. The cells below use `gb_data` as a mapping whose values are the
# book texts (looked up by the ids in df_metadata['id']) and `df_metadata` as a
# DataFrame with at least the columns 'id' and 'author'.
gb_data, df_metadata = gutenberg_data.load_gutenberg_data(data_dir)

In [None]:
# Glue every book together into one string to report the corpus size in characters.
all_text = "".join(gb_data.values())
print(f"Number of characters in all text: {len(all_text):,}")

In [None]:
# Split the books into train / val / test and keep only a subset (the full corpus is quite big).

# Train: one randomly drawn book per author.
train_ids = df_metadata.groupby('author')['id'].sample(1, random_state=42)

# Val / test: up to `n_eval_books` books each, drawn only from books that are not
# already part of another split. Sampling from *all* ids (as done previously) lets
# training books leak into val/test and makes the evaluation losses look better
# than they are.
n_eval_books = 20
val_pool = df_metadata.loc[~df_metadata['id'].isin(train_ids), 'id']
val_ids = val_pool.sample(min(n_eval_books, len(val_pool)), random_state=42)
test_pool = val_pool[~val_pool.isin(val_ids)]
test_ids = test_pool.sample(min(n_eval_books, len(test_pool)), random_state=43)

train_split = [gb_data[fn] for fn in train_ids]
val_split = [gb_data[fn] for fn in val_ids]
test_split = [gb_data[fn] for fn in test_ids]

# The book counts make it visible when a hold-out pool was too small to provide n_eval_books.
print(f"Number of books in train/val/test: {len(train_split):,} / {len(val_split):,} / {len(test_split):,}")
print(f"Number of characters in all train: {sum([len(text) for text in train_split]):,}")
print(f"Number of characters in all val/test: {sum([len(text) for text in val_split]):,} / {sum([len(text) for text in test_split]):,}")

In [None]:
%%time

tokenizer_path = os.path.join(experiment_dir, 'sentencepiece_tokenizer.json')

if os.path.isfile(tokenizer_path):
    # load the tokenizer from disk
    tokenizer = Tokenizer.from_file(tokenizer_path)
    print(f"Loaded tokenizer from disk ('{tokenizer_path}')")
else:
    # train the tokenizer from scratch with a subset of the text data for speed
    unk_token = '<unk>'
    tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
    tokenizer.train_from_iterator(
        train_split,
        vocab_size=3000,
        min_frequency=10,
        show_progress=True,
        special_tokens=[unk_token, '\n']
    )
    tokenizer.save(tokenizer_path)

In [None]:
# Replace the placeholder vocab_size with the tokenizer's actual vocabulary size
# (CONFIG is later passed to model.Llama).
CONFIG.update(vocab_size=tokenizer.get_vocab_size())

# Report how many distinct tokens the tokenizer knows
print(f'Total number of tokens our tokenizer supports: {CONFIG["vocab_size"]}')

In [None]:
# Sanity check: push a short multi-line sample through encode -> decode and
# show both the reconstructed text and the token ids.
sample_text = 'Hello world, how are you doing.\nWhat are you doing?\nWho are you with?'
encoded = tokenizer.encode(sample_text)
token_ids = encoded.ids
print(tokenizer.decode(token_ids, skip_special_tokens=False))
print(token_ids)

In [None]:
# Show the components the tokenizer applies when mapping text to indices,
# in the order: normalizer, pre-tokenizer, model, post-processor.
for stage_name in ('normalizer', 'pre_tokenizer', 'model', 'post_processor'):
    print(getattr(tokenizer, stage_name))

# Create pytorch dataset and dataloader

In [None]:
# Wrap every split in a TextDataset (same tokenizer, context window, device and dtype for all three)
dtype = torch.int16
train_dataset, val_dataset, test_dataset = (
    dataset.TextDataset(split, tokenizer, CONFIG['context_window'], device, dtype)
    for split in (train_split, val_split, test_split)
)

# One DataLoader per split: only the training loader shuffles and drops the
# incomplete last batch, the evaluation loaders keep every sample in order.
bs = CONFIG['batch_size']
train_dataloader = DataLoader(train_dataset, batch_size=bs, shuffle=True, drop_last=True)
val_dataloader, test_dataloader = (
    DataLoader(eval_dataset, batch_size=bs, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f"Steps train: {len(train_dataloader)}, val: {len(val_dataloader)}, test: {len(test_dataloader)}")

# Create model and train

In [None]:
# Build the Llama model from the configuration, move it to the target device
# and print an overview of its parameters.
llama = model.Llama(CONFIG).to(device)
model.print_model_parameters(llama)

# Adam over all model parameters
optimizer = torch.optim.Adam(
    llama.parameters(),
    lr=1e-3,
)

# Step schedule: the learning rate is multiplied by `gamma` after every
# `step_size` calls to lr_scheduler.step().
# NOTE(review): presumably training.train steps the scheduler once per epoch - confirm.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=2,
    gamma=0.1,
)

In [None]:
%%time

# Train the model with the project's training loop, passing the train/val loaders,
# CONFIG['epochs'] and the LR scheduler. The returned frame is plotted in the next
# cell via its 'train', 'val' and 'lr' columns.
df_losses = training.train(llama, optimizer, train_dataloader, val_dataloader, CONFIG['epochs'], lr_scheduler)

In [None]:
# Two panels side by side: train/val loss on the left, learning rate on the right.
# Titles and y-labels are set so the figure can be read on its own.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(14, 4))

df_losses[['train', 'val']].plot(ax=ax1)
ax1.set(title='Training vs. validation loss', ylabel='loss')

df_losses[['lr']].plot(ax=ax2)
ax2.set(title='Learning rate', ylabel='learning rate')

# trailing semicolon suppresses the textual repr of the last expression
fig.tight_layout();

In [None]:
%%time

# check loss on test split
# (last expression of the cell, so the value returned by evaluate_loss is shown as the cell output)
training.evaluate_loss(llama, test_dataloader)

In [None]:
# Let the trained model generate text, capped at 500 new tokens.
# NOTE(review): the dict is presumably forwarded to tokenizer.decode - confirm in Llama.generate.
tokenizer_kwargs = {'skip_special_tokens': False}
generated_text = llama.generate(device, tokenizer, tokenizer_kwargs, max_new_tokens=500)

# The tokenizer puts a whitespace in front of every word, including the first
# word of a line; strip that leading blank after each newline before printing.
print(generated_text.replace('\n ', '\n'))

# Save the model

In [None]:
# Persist the complete model object (not just its weights) in the experiment folder
model_path = os.path.join(experiment_dir, 'llama.pth')
torch.save(llama, model_path)

# Alternative: store only the model parameters
# torch.save(llama.state_dict(), os.path.join(experiment_dir, 'llama_model_parameters.pth'))

In [None]:
# check loaded model
# 'llama.pth' holds the complete pickled model object (it was written with
# torch.save(llama, ...)), therefore:
#   * weights_only=False is required: since PyTorch 2.6 torch.load defaults to
#     weights_only=True, which refuses to unpickle a full module.
#   * map_location=device lets a checkpoint saved on a GPU be loaded on a
#     CPU-only machine (and vice versa).
# NOTE: unpickling can execute arbitrary code - only load checkpoints created by this notebook.
llama_loaded = torch.load(
    os.path.join(experiment_dir, 'llama.pth'),
    map_location=device,
    weights_only=False,
)

print(llama_loaded.generate(device, tokenizer, {'skip_special_tokens': False}, max_new_tokens=500).replace('\n ', '\n'))