### Install and import libraries

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [None]:
import pandas as pd
import json
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import GPT2Model
import torch.nn as nn

### Preprocessing

In [None]:
def preprocess(file_csv, tokenizer):
    """Read a recipe CSV and turn every row into a 1-D tensor of token ids.

    The ``ingredients`` and ``directions`` cells are parsed with ``json.loads``
    and their items are joined with single spaces. Each recipe becomes the text
    ``<ingredients>\\nIntructions: <directions>`` before being encoded.

    Args:
        file_csv: Anything ``pd.read_csv`` accepts; the file must contain the
            columns ``ingredients`` and ``directions``.
        tokenizer: Object exposing ``encode(text)`` that returns token ids.

    Returns:
        A list with one ``torch.Tensor`` of token ids per CSV row, in row order.
    """
    frame = pd.read_csv(file_csv)

    def flatten(cell):
        # Each cell is a JSON document whose items are joined into one string.
        return ' '.join(json.loads(cell))

    ingredient_texts = [flatten(cell) for cell in frame['ingredients']]
    direction_texts = [flatten(cell) for cell in frame['directions']]

    # NOTE: the separator spelling ("Intructions") is part of the training
    # format; any prompt used at inference time has to reproduce it exactly.
    separator = '\nIntructions: '
    recipes = [
        ingredients + separator + directions
        for ingredients, directions in zip(ingredient_texts, direction_texts)
    ]

    return [torch.tensor(tokenizer.encode(recipe)) for recipe in recipes]


def pad_and_truncate_sequences(sequences, max_length, padding_value=50256):
    """Force every sequence to exactly ``max_length`` items and stack them.

    Sequences shorter than ``max_length`` are right-padded with
    ``padding_value``; all others are cut down to their first ``max_length``
    items.

    Args:
        sequences: Iterable of 1-D tensors.
        max_length: Length of every row in the result.
        padding_value: Value appended to short sequences (default 50256).

    Returns:
        A tensor of shape ``(len(sequences), max_length)``.
    """
    def fit(seq):
        missing = max_length - len(seq)
        if missing <= 0:
            # Already long enough: keep only the leading max_length items.
            return seq[:max_length]
        filler = torch.tensor([padding_value] * missing)
        return torch.cat((seq, filler))

    return torch.stack([fit(seq) for seq in sequences])

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer.encode`` and return the ids as a tensor
    with a leading batch dimension of size 1."""
    token_ids = torch.tensor(tokenizer.encode(text))
    return torch.unsqueeze(token_ids, dim=0)

def token_ids_to_text(token_ids, tokenizer):
    """Drop a size-1 leading batch dimension from ``token_ids`` (if present) and
    decode the remaining ids with ``tokenizer.decode``."""
    id_list = torch.squeeze(token_ids, 0).tolist()
    return tokenizer.decode(id_list)

In [None]:
class RecipeDataset(Dataset):
    """Dataset pairing each input sequence with its target sequence.

    ``inp`` and ``out`` are indexed with the same position; the dataset length
    is taken from ``inp``.
    """

    def __init__(self, inp, out):
        # Stored by reference; nothing is copied.
        self.inp, self.tar = inp, out

    def __len__(self):
        n_samples = len(self.inp)
        return n_samples

    def __getitem__(self, idx):
        source = self.inp[idx]
        target = self.tar[idx]
        return source, target

### Multi-head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of width ``d_out // num_heads``, attended
    with an upper-triangular (future-masking) mask, re-joined and passed
    through an output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total width of all heads together; must be divisible by
            ``num_heads``.
        block_size: Side length of the stored causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        # Creation order is kept (query, key, value, out) so that seeded
        # initialisation yields the same weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions a token may not see.
        future_positions = torch.triu(torch.ones(block_size, block_size), diagonal=1)
        self.register_buffer('mask', future_positions)

    def _split_heads(self, projected, b, num_tokens):
        """(b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1))

        # Crop the stored mask to the current length and blank out the future.
        blocked = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(blocked, -torch.inf)

        scale = self.head_dim ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        context_vec = torch.matmul(attn_weights, values).transpose(1, 2)
        merged = context_vec.reshape(b, num_tokens, self.d_out)

        return self.out_proj(merged)

### Layer normalization + Feed-forward (MLP)

In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Each vector along the last axis is shifted to zero mean and divided by
    ``sqrt(variance + eps)`` (population variance, ``eps = 1e-5``), then
    multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # keeps the division finite for zero-variance inputs
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        centre = torch.mean(x, dim=-1, keepdim=True)
        spread = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        normalised = (x - centre) / torch.sqrt(spread + self.eps)
        return normalised * self.scale + self.shift


class GELU(nn.Module):
    """Tanh-based approximation of the GELU activation:

    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    """

    def forward(self, x):
        coefficient = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coefficient * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Position-wise MLP: expand to four times the embedding width, apply
    GELU, project back to the embedding width, then apply dropout.

    Args:
        cfg: Mapping providing ``"emb_dim"`` and ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width

        # The positions inside the Sequential are relied upon elsewhere in this
        # notebook (layers[0] and layers[2] are the two linear layers).
        expand = nn.Linear(width, hidden)
        activation = GELU()
        contract = nn.Linear(hidden, width)
        regularise = nn.Dropout(cfg["drop_rate"])
        self.layers = nn.Sequential(expand, activation, contract, regularise)

    def forward(self, x):
        out = self.layers(x)
        return out

### GPT2 Block

In [None]:
class TransformerBlock(nn.Module):
    """One pre-norm transformer block: attention and feed-forward sub-layers,
    each preceded by layer normalisation and followed by dropout and a
    residual connection.

    Args:
        cfg: Mapping providing ``"emb_dim"``, ``"seq_len"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        attention_kwargs = dict(
            d_in=emb_dim,
            d_out=emb_dim,
            block_size=cfg["seq_len"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single dropout module serves both residual branches.
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Attention sub-layer: [batch_size, num_tokens, emb_size] in and out.
        attended = self.att(self.norm1(x))
        x = self.drop_resid(attended) + x

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.ff(self.norm2(x))
        x = self.drop_resid(transformed) + x

        return x

### GPT2 Model

In [None]:
class GPTModel(nn.Module):
    """GPT-style decoder: token + learned position embeddings, a stack of
    ``TransformerBlock`` modules, a final layer norm and a bias-free linear
    head producing one logit per vocabulary entry.

    Args:
        cfg: Mapping providing ``"vocab_size"``, ``"emb_dim"``, ``"seq_len"``,
            ``"drop_rate"``, ``"n_layers"`` plus the keys ``TransformerBlock``
            reads.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["seq_len"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = (TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        # in_idx must be 2-D: (batch_size, seq_len).
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # The position embeddings broadcast over the batch dimension.
        embedded = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.trf_blocks(self.drop_emb(embedded))

        return self.out_head(self.final_norm(hidden))

### Load GPT2 weights

In [None]:
def assign_check(left, right):
    """Return ``right`` wrapped as a new ``torch.nn.Parameter`` after verifying
    that its shape equals the shape of ``left``.

    Args:
        left: Existing parameter/tensor whose shape is the expected one.
        right: Replacement values; a ``torch.Tensor`` or anything
            ``torch.tensor`` accepts (e.g. a NumPy array).

    Returns:
        A ``torch.nn.Parameter`` holding an independent copy of ``right``.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape != right.shape:
        raise ValueError(
            f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(<Tensor>) emits a UserWarning on every call;
        # clone().detach() is the copy PyTorch recommends instead.
        values = right.clone().detach()
    else:
        values = torch.tensor(right)
    return torch.nn.Parameter(values)

import numpy as np


def load_weights(gpt, gpt_hf):
    """Copy the weights of a Hugging Face GPT-2 model into ``gpt`` in place.

    Every tensor is passed through ``assign_check``, so a shape mismatch
    between the two models raises ``ValueError``.

    Args:
        gpt: Target model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        gpt_hf: Source model whose ``state_dict()`` uses the GPT-2 key names
            (``wte``, ``wpe``, ``h.<i>.attn.c_attn`` ...).

    Returns:
        None. ``gpt`` is modified in place.
    """
    d = gpt_hf.state_dict()

    gpt.pos_emb.weight = assign_check(gpt.pos_emb.weight, d["wpe.weight"])
    gpt.tok_emb.weight = assign_check(gpt.tok_emb.weight, d["wte.weight"])

    # Walk the blocks the target model actually has instead of reading the
    # layer count from a module-level config that may be out of sync with it.
    for b, block in enumerate(gpt.trf_blocks):
        prefix = f"h.{b}"

        # The checkpoint keeps query, key and value fused along the last axis.
        # It stores linear weights as (in_features, out_features), i.e.
        # transposed relative to nn.Linear, hence the `.T` on every weight.
        q_w, k_w, v_w = torch.chunk(d[f"{prefix}.attn.c_attn.weight"], 3, dim=-1)
        block.att.W_query.weight = assign_check(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign_check(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign_check(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = torch.chunk(d[f"{prefix}.attn.c_attn.bias"], 3, dim=-1)
        block.att.W_query.bias = assign_check(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign_check(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign_check(block.att.W_value.bias, v_b)

        # Feed-forward: layers[0] is the expanding linear, layers[2] the
        # contracting one.
        block.ff.layers[0].weight = assign_check(
            block.ff.layers[0].weight, d[f"{prefix}.mlp.c_fc.weight"].T)
        block.ff.layers[0].bias = assign_check(
            block.ff.layers[0].bias, d[f"{prefix}.mlp.c_fc.bias"])
        block.ff.layers[2].weight = assign_check(
            block.ff.layers[2].weight, d[f"{prefix}.mlp.c_proj.weight"].T)
        block.ff.layers[2].bias = assign_check(
            block.ff.layers[2].bias, d[f"{prefix}.mlp.c_proj.bias"])

        block.norm1.scale = assign_check(block.norm1.scale, d[f"{prefix}.ln_1.weight"])
        block.norm1.shift = assign_check(block.norm1.shift, d[f"{prefix}.ln_1.bias"])
        block.norm2.scale = assign_check(block.norm2.scale, d[f"{prefix}.ln_2.weight"])
        block.norm2.shift = assign_check(block.norm2.shift, d[f"{prefix}.ln_2.bias"])

        block.att.out_proj.weight = assign_check(
            block.att.out_proj.weight, d[f"{prefix}.attn.c_proj.weight"].T)
        block.att.out_proj.bias = assign_check(
            block.att.out_proj.bias, d[f"{prefix}.attn.c_proj.bias"])

    gpt.final_norm.scale = assign_check(gpt.final_norm.scale, d["ln_f.weight"])
    gpt.final_norm.shift = assign_check(gpt.final_norm.shift, d["ln_f.bias"])
    # The output head starts from the token-embedding matrix; assign_check
    # copies it, so the head and the embedding are separate parameters.
    gpt.out_head.weight = assign_check(gpt.out_head.weight, d["wte.weight"])

### Loss

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device, pad_token_id=50256):
    """Mean cross-entropy of one batch, ignoring positions whose *input* token
    is the padding token.

    Args:
        input_batch: Integer tensor of token ids, shape (batch, seq).
        target_batch: Integer tensor of next-token ids, same shape.
        model: Callable mapping ``input_batch`` to logits of shape
            (batch, seq, vocab).
        device: Device both batches are moved to before the forward pass.
        pad_token_id: Input token id excluded from the loss (default 50256).

    Returns:
        A scalar tensor: the summed per-token loss over non-padding positions
        divided by their count. A batch made only of padding yields 0 rather
        than NaN.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)

    # Boolean mask built on the target device directly; wrapping an existing
    # tensor in torch.tensor(...) would only copy it and emit a UserWarning.
    keep = input_batch.reshape(-1) != pad_token_id

    logits = model(input_batch)
    # reshape (not view) so that non-contiguous slices are accepted as well.
    per_token_loss = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        target_batch.reshape(-1),
        reduction='none',
    )

    # clamp avoids 0/0 when every position of the batch is padding.
    n_kept = keep.sum().clamp(min=1)
    return (per_token_loss * keep).sum() / n_kept

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs; must
            support ``len()`` when ``num_batches`` is None.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate; ``None`` means
            ``len(data_loader)``.

    Returns:
        The mean batch loss as a Python float, or ``nan`` when no batch was
        evaluated (empty loader or ``num_batches == 0``).
    """
    if num_batches is None:
        num_batches = len(data_loader)

    total_loss, batches_seen = 0.0, 0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
        batches_seen += 1

    if batches_seen == 0:
        # Nothing was evaluated: report "no value" instead of dividing by zero.
        return float("nan")
    return total_loss / batches_seen

### Train model

In [None]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer=None):
    """Train ``model`` and periodically report train/validation loss.

    After every optimizer step whose global index is a multiple of
    ``eval_freq`` the losses are evaluated on ``eval_iter`` batches and
    printed. After every epoch a text sample continuing ``start_context`` is
    generated and printed.

    Args:
        model: Model to optimise (put into training mode each epoch).
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Same, used for validation only.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` optimizer steps.
        eval_iter: Number of batches per evaluation.
        start_context: Prompt used for the per-epoch sample.
        tokenizer: Tokenizer used for the per-epoch sample. When omitted, the
            module-level ``tokenizer`` variable is used, as before.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation.

    Raises:
        NameError: If no tokenizer is passed and no module-level ``tokenizer``
            exists.
    """
    if tokenizer is None:
        # Backward compatibility: this function used to read a global
        # `tokenizer`. Resolve it up front so a missing tokenizer fails before
        # any training time is spent.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError(
                "name 'tokenizer' is not defined: pass tokenizer=... or "
                "define a module-level `tokenizer`")

    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = -1  # becomes 0 on the first step, so step 0 is evaluated

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset gradients left over from the previous step
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:08d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

        # torch.save(model.state_dict(), "weights_model-gpt-medium-_best_ver2-ep{}.pth".format(epoch+1))
        # print('complete save model - epoch {}'.format(epoch + 1))
    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute the loss on ``eval_iter`` batches of the training loader and of
    the validation loader, with the model in eval mode and gradients disabled.

    The model is switched back to training mode before returning.

    Returns:
        ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation loader.
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a (batch, n_tokens) tensor of ids to logits of
            shape (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor holding the starting ids.
        max_new_tokens: Number of tokens appended to every row.
        context_size: Only the last ``context_size`` ids are fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    sequence = idx
    for _ in range(max_new_tokens):
        # Feed at most the trailing `context_size` tokens to the model.
        window = sequence[:, -context_size:]

        with torch.no_grad():
            logits = model(window)

        # Greedy choice from the last position only: (batch, vocab) -> (batch, 1)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)

        sequence = torch.cat([sequence, next_token], dim=1)

    return sequence

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a greedy 60-token continuation of ``start_context`` on one line.

    The model is put into eval mode for generation and back into training mode
    afterwards. The context window equals the number of rows of the model's
    position-embedding matrix.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=60,
            context_size=max_context,
        )
        sample = token_ids_to_text(generated, tokenizer)
        # Newlines are flattened so the sample occupies a single log line.
        print(sample.replace("\n", " "))
    model.train()

In [None]:
file_csv = '/content/mini_RecipeNLG_2k5.csv'
tokenizer = tiktoken.get_encoding('gpt2')
# One token more than the 1024-token model inputs: every row is split below
# into inputs row[:-1] and next-token targets row[1:].
max_length = 1025
datasets = preprocess(file_csv, tokenizer)
padded_and_truncated = pad_and_truncate_sequences(datasets, max_length)
assert padded_and_truncated.shape == (len(datasets), max_length)

# Split the recipes into training and validation data (first 90% / last 10%)
train_ratio = 0.9
indices = int(train_ratio*len(datasets))

train_data, val_data = padded_and_truncated[:indices], padded_and_truncated[indices:]

# Targets are the inputs shifted left by one position.
train_input, train_output = train_data[:, :-1], train_data[:, 1: ]
val_input, val_output = val_data[:, :-1], val_data[:, 1: ]

trainData = RecipeDataset(train_input, train_output)
valData = RecipeDataset(val_input, val_output)

trainloader = DataLoader(trainData, batch_size=4, shuffle=True, drop_last=False)
# Validation is not shuffled: evaluation only looks at the first few batches,
# and a fixed order makes successive validation losses comparable.
valloader = DataLoader(valData, batch_size=4, shuffle=False, drop_last=False)

In [None]:
# Which pretrained GPT-2 checkpoint to fine-tune.
CHOOSE_MODEL = "gpt2-small"

# Hugging Face Hub identifiers of the available checkpoints.
model_names = {
    "gpt2-small": "openai-community/gpt2",         # 124M
    "gpt2-medium": "openai-community/gpt2-medium",  # 355M
    "gpt2-large": "openai-community/gpt2-large",   # 774M
    "gpt2-xl": "openai-community/gpt2-xl"          # 1558M
}

# Size-dependent hyper-parameters of each checkpoint.
model_configs = {
    "gpt2-small": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings shared by all sizes, followed by those of the chosen checkpoint.
BASE_CONFIG = {
    "vocab_size": 50257,  # Vocabulary size
    "seq_len": 1024,      # Context length
    "drop_rate": 0.2,     # Dropout rate
    "qkv_bias": True,     # Query-key-value bias
    **model_configs[CHOOSE_MODEL],
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained source weights (downloaded into ./checkpoints on first use).
gpt_hf = GPT2Model.from_pretrained(
    model_names[CHOOSE_MODEL], cache_dir="checkpoints")
gpt_hf.eval()

# Build the local model and copy the pretrained weights into it.
model = GPTModel(BASE_CONFIG)
load_weights(model, gpt_hf)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

  return torch.nn.Parameter(torch.tensor(right))


In [None]:
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)

num_epochs = 250

# The prompt has to end with exactly the separator that preprocess() inserts
# between ingredients and directions ('\nIntructions: ', spelled as in the
# training texts). The previous ending '\nInstruction: ' never occurs in the
# fine-tuning data, so the model was being prompted with an unseen format.
text_test = "1 c. firmly packed brown sugar 1/2 c. evaporated milk 1/2 tsp. vanilla 1/2 c. broken nuts (pecans) 2 Tbsp. butter or margarine 3 1/2 c. bite size shredded rice biscuits\nIntructions: "
train_losses, val_losses, tokens_seen = train_model_simple(
    model, trainloader, valloader, optimizer, device,
    num_epochs=num_epochs, eval_freq=1, eval_iter=2,
    start_context= text_test,
)

  mask = torch.tensor(mask)


Ep 1 (Step 00000000): Train loss 4.816, Val loss 4.487
Ep 1 (Step 00000001): Train loss 4.886, Val loss 5.053
Ep 1 (Step 00000002): Train loss 3.746, Val loss 3.698
Ep 1 (Step 00000003): Train loss 3.907, Val loss 3.938
Ep 1 (Step 00000004): Train loss 3.818, Val loss 3.548
Ep 1 (Step 00000005): Train loss 3.693, Val loss 3.915
Ep 1 (Step 00000006): Train loss 3.430, Val loss 3.675
Ep 1 (Step 00000007): Train loss 3.356, Val loss 3.522
Ep 1 (Step 00000008): Train loss 3.179, Val loss 3.500
Ep 1 (Step 00000009): Train loss 3.293, Val loss 3.455
Ep 1 (Step 00000010): Train loss 3.050, Val loss 3.628
Ep 1 (Step 00000011): Train loss 3.506, Val loss 3.475
Ep 1 (Step 00000012): Train loss 3.502, Val loss 3.118
Ep 1 (Step 00000013): Train loss 3.284, Val loss 3.160
Ep 1 (Step 00000014): Train loss 3.117, Val loss 3.179
Ep 1 (Step 00000015): Train loss 3.327, Val loss 3.321
Ep 1 (Step 00000016): Train loss 3.183, Val loss 3.118
Ep 1 (Step 00000017): Train loss 2.967, Val loss 3.030
Ep 1 (Step

NameError: name 'generate_text_simple' is not defined