# Pretraining model
Right now our model generates random tokens, so we need to pretrain it on a corpus of text. Let's start from the model we built:

In [10]:
import sys
sys.path.append('../')
from gpt_model.model import GPT
import torch

# Hyperparameters handed to the project's GPT class.
GPT_CONFIG = {
    "vocab_size": 50304,  # rows of the token embedding table; presumably GPT-2's 50257-entry vocab padded to a multiple of 64 — confirm
    "n_embd": 768,  # embedding width (also the in/out size of the attention projections)
    "n_heads": 12,  # presumably the number of attention heads per block — confirm in gpt_model.model
    "n_layers": 12,  # presumably the number of stacked transformer blocks — confirm in gpt_model.model
    "dropout": 0.1,  # dropout probability used by the model's Dropout layers
    "context_length": 256  # rows of the positional embedding table; also reused below as the generation/loader window
}

# Seed before construction so the randomly initialised weights are reproducible.
torch.manual_seed(42)
model = GPT(GPT_CONFIG)
# Evaluation mode for the generation demo below (assuming GPT is an nn.Module, this
# turns dropout off). As the last expression of the cell, the returned model is displayed.
model.eval()


GPT(
  (tok_emb): Embedding(50304, 768)
  (pos_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (query): Linear(in_features=768, out_features=768, bias=False)
        (key): Linear(in_features=768, out_features=768, bias=False)
        (value): Linear(in_features=768, out_features=768, bias=False)
        (output): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm()
      (ln2): LayerNorm()
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (query): Linear(in_features=768, out_features=7

In [15]:
def generate_text_simple(model, input_tensor, max_new_tokens, context_size):
    """Greedily extend a batch of token sequences, one token per step.

    At every step the model sees at most the last ``context_size`` tokens and
    the highest-scoring vocabulary entry for the final position is appended.

    Args:
        model: Callable mapping a (batch, n_tokens) tensor of token ids to
            logits of shape (batch, n_tokens, vocab_size).
        input_tensor: (batch, n_tokens) tensor of token ids used as the prompt.
            It is not modified.
        max_new_tokens: Number of tokens to append to every sequence.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        A (batch, n_tokens + max_new_tokens) tensor holding the prompt
        followed by the generated token ids.

    Raises:
        ValueError: If ``context_size`` is not a positive number.
    """
    # A non-positive size would make the `-context_size:` slice below keep the
    # whole sequence (0) or drop leading tokens (negative) instead of cropping.
    if context_size < 1:
        raise ValueError(
            f"context_size must be a positive integer, got {context_size!r}"
        )

    # idx is the (batch, n_tokens) array of indices in the current context
    idx = input_tensor

    # Inference only: no autograd bookkeeping is needed for any step
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop the context if it exceeds the supported size, e.g. with a
            # limit of 5 and 10 tokens so far only the last 5 are used
            idx_cond = idx[:, -context_size:]

            # Keep only the last time step:
            # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
            logits = model(idx_cond)[:, -1, :]

            # Greedy choice. Softmax is monotonic, so taking the argmax of the
            # raw logits selects the same entry without the extra pass over
            # the vocabulary and without rounding near-equal scores into ties.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

            # Append the chosen index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [16]:
import tiktoken

def text_to_tokens(text, tokenizer):
    """Encode ``text`` into a batch of one token-id sequence.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids.

    Returns:
        An int64 tensor of shape (1, n_tokens).
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Pin the dtype: with an empty list torch.tensor() would infer float32,
    # which is not a valid index type for an embedding lookup.
    ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    # Add the leading batch dimension
    return ids_tensor.unsqueeze(0)

def tokens_to_text(tokens, tokenizer):
    """Decode a tensor of token ids back into a string.

    A leading dimension of size one (the batch dimension added by
    ``text_to_tokens``) is squeezed away before the ids are handed to
    ``tokenizer.decode`` as a plain Python list.
    """
    flat_ids = tokens.squeeze(0).tolist()
    return tokenizer.decode(flat_ids)

# GPT-2 byte-pair encoding; the same tokenizer is reused by later cells
tokenizer = tiktoken.get_encoding("gpt2")

# Generate ten tokens from the still-untrained model as a baseline
text = "Every efforte moves you"
prompt_ids = text_to_tokens(text, tokenizer)
token_ids = generate_text_simple(
    model,
    prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG["context_length"],
)

print(tokens_to_text(token_ids, tokenizer))


Every efforte moves youatech inevitableLate arrestedAPTER stay Joy bountyateur likes


Let's try to train the model on a corpus of text. We'll use the 'La La Land' script.

In [17]:
# Read the whole screenplay into memory as one string
with open("../data/la_la_land.txt", "r", encoding="utf-8") as script_file:
    text = script_file.read()

# Show the first 99 characters as a quick check that the file loaded
preview = text[:99]
print(preview)

LA LA LAND
by
Damien Chazelle
FADE IN...
A sun-blasted sky. We HEAR radios -- one piece of music af


In [18]:
# Corpus size in characters and in tokenizer tokens
n_characters = len(text)
print("Total characters: ", n_characters)

# total_tokens is reused by the sanity check further down
total_tokens = len(tokenizer.encode(text))
print("Total tokens: ", total_tokens)

Total characters:  102179
Total tokens:  32921


In [19]:
from dataloader import create_dataloader_v1

# Train/validation ratio
train_ratio = 0.90

# The split is made on characters of the raw text, not on tokens
split_idx = int(len(text) * train_ratio)
train_data, val_data = text[:split_idx], text[split_idx:]

# Seed before building the loaders so the shuffled order is reproducible
torch.manual_seed(123)

# Settings common to both loaders; stride equals max_length, so consecutive
# windows presumably do not overlap (confirm in create_dataloader_v1)
ctx_len = GPT_CONFIG["context_length"]
shared_loader_kwargs = dict(
    batch_size=2,
    max_length=ctx_len,
    stride=ctx_len,
    num_workers=0,
)

# Training: shuffled, incomplete final batch dropped
train_loader = create_dataloader_v1(
    train_data,
    drop_last=True,
    shuffle=True,
    **shared_loader_kwargs,
)

# Validation: fixed order, every batch kept
val_loader = create_dataloader_v1(
    val_data,
    drop_last=False,
    shuffle=False,
    **shared_loader_kwargs,
)

In [21]:
# Sanity check: each split should hold at least one full context window.
# The per-split token counts are estimates derived from the overall token
# count and `train_ratio`; the text itself is split on characters.

if total_tokens * (train_ratio) < GPT_CONFIG["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG['context_length']` or "
          "increase the `train_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG['context_length']` or "
          "decrease the `train_ratio`")