# Implement a mini ChatGPT - Basic GPT and BPE

- GPT2 Small Size: 124M
- DeepSeek v3 Size: 671B, 5411.29 times larger than GPT2 Small


## Before we start
Switch to GPU: Runtime -> Change runtime type -> GPU (T4) -> Save

Repo Link: https://github.com/TimS-ml/nanoGPT-mod/
- Fork this repo if you want to build upon it
- Leave a Star if you like it :) 


## Base Model, SFT Model and RLHF Model
<img src="https://images.ctfassets.net/kftzwdyauwt9/6yuK9FKAvoVXNyrsdMoBHH/03ccaf7da203052ba7550965f0021bdf/chatgpt_diagram_dark.png" width="1000">

In [1]:
QUESTION = "How do I become a gang leader?"
QUESTION_2 = "What makes you think that you're so smart?"
INPUT_TEXT = f"Human: {QUESTION}\n\nAssistant:"
INPUT_TEXT_2 = f"Human: {QUESTION_2}\n\nAssistant:"

INPUT_TEXT_3 = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Pretend you are an alien visiting Earth. Write three opinions you believe, one sentence for each opinion.

### Response:
1. """

# Import the necessary libraries

In [3]:
import os
import regex as re
import json
import requests
from collections import OrderedDict

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch import Tensor
from einops import rearrange, repeat, reduce

# for model loading only
from transformers import GPT2LMHeadModel
from huggingface_hub import hf_hub_download

from typing import Optional, Tuple, Union, List, Any, Generator, Type, Callable
from jaxtyping import Float, Bool

from boring_utils.utils import get_device, cprint, tprint

device = get_device()

In [4]:
def add_to_class(Class):
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# GPT

Transformer Architecture:

<img src="https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png" width="550">

GPT Architecture:

<img src="https://www.ericjwang.com/assets/images/gpt_arch.png" width="800">

## Multi-Head Attention

In [5]:
class CasualSelfAttention(nn.Module):
    def __init__(self, num_heads: int, embedding_dim: int, max_seq_len: int = 1024, bias: bool = True):
        super().__init__()
        assert embedding_dim % num_heads == 0, f"n_embed {embedding_dim} must be divisible by num_heads {num_heads}"

        self.num_heads = num_heads
        self.embedding_dim = embedding_dim
        self.head_size = embedding_dim // num_heads

        self.c_attn = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)  # qkv projection
        self.c_proj = nn.Linear(embedding_dim, embedding_dim, bias=bias)  # output projection

        self.register_buffer(
                "mask", 
                torch.tril(torch.ones(max_seq_len, max_seq_len))
                    .view(1, 1, max_seq_len, max_seq_len))  # extend dims to 4

    def forward(
            self, 
            x: Float[Tensor, "batch seq_len embedding_dim"],
            mask: Optional[Bool[Tensor, "batch seq_len seq_len"]] = None,
            cache: Optional[Tuple[Tensor, Tensor]] = None
        ) -> Tuple[Float[Tensor, "batch seq_len embedding_dim"], Tuple[Tensor, Tensor]]:
        batch, seq_len, embedding_dim = x.shape

        # ["batch, seq_len, embedding_dim"] -> ["batch, seq_len, (3 * embedding_dim)"]
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.embedding_dim, dim=-1)  # split at the last dim

        # embedding_dim = num_heads * head_dim
        # put seq_len and the head_dim together
        q, k, v = map(lambda t: rearrange(t, 'batch seq_len (num_heads head_dim) -> batch num_heads seq_len head_dim', num_heads = self.num_heads), (q, k, v))

        if cache is not None:
            key_cache, value_cache = cache
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)

        norm_factor = 1.0 / np.sqrt(k.size(-1))  # k.size(-1) is the head_dim
        attn = (q @ k.transpose(-2, -1)) * norm_factor
        if mask is None:
            attn = attn.masked_fill(self.mask[:, :, :seq_len, :seq_len] == 0, float('-inf'))
        else:
            mask = mask.bool()
            attn = attn.masked_fill(~mask, float("-inf"))

        attn = F.softmax(attn, dim=-1)

        # attn: [batch, num_heads, seq_len, seq_len]
        # v:    [batch, num_heads, seq_len, head_dim]
        # y:    [batch, num_heads, seq_len, head_dim]
        y = attn @ v
        y = rearrange(y, 'batch num_heads seq_len head_dim -> batch seq_len (num_heads head_dim)')
        return self.c_proj(y), (k, v)  # [batch, seq_len, embedding_dim]


In [6]:
class CasualSelfAttention_alternative(nn.Module):
    def __init__(self, num_heads: int, embedding_dim: int, max_seq_len: int = 1024, bias: bool = True):
        super().__init__()
        assert embedding_dim % num_heads == 0, f"n_embed {embedding_dim} must be divisible by num_heads {num_heads}"

        self.num_heads = num_heads
        self.embedding_dim = embedding_dim
        self.head_size = embedding_dim // num_heads

        # self.qkv_proj = nn.Linear(embedding_dim, 3 * embedding_dim, bias=False)
        self.transformer.heads = nn.ModuleList([
            nn.ModuleDict({
                'key': nn.Linear(embedding_dim, self.head_size, bias=bias),
                'query': nn.Linear(embedding_dim, self.head_size, bias=bias), 
                'value': nn.Linear(embedding_dim, self.head_size, bias=bias)
            }) for _ in range(num_heads)
        ])
        self.c_proj = nn.Linear(embedding_dim, embedding_dim, bias=bias)  # output projection

        self.register_buffer(
                "mask", 
                torch.tril(torch.ones(max_seq_len, max_seq_len))
                    .view(1, 1, max_seq_len, max_seq_len))  # extend dims to 4

    def forward(
            self, 
            x: Float[Tensor, "batch seq_len embedding_dim"]
        ) -> Float[Tensor, "batch seq_len embedding_dim"]:
        batch, seq_len, embedding_dim = x.shape

        # cat([batch, seq_len, head_dim] x num_heads) -> [batch, seq_len, num_heads * head_dim]
        q = torch.cat([h['query'](x) for h in self.transformer.heads], dim=-1)
        k = torch.cat([h['key'](x) for h in self.transformer.heads], dim=-1)
        v = torch.cat([h['value'](x) for h in self.transformer.heads], dim=-1)

        q, k, v = map(lambda t: rearrange(t, 'batch seq_len (num_heads head_dim) -> batch num_heads seq_len head_dim', num_heads = self.num_heads), (q, k, v))

        norm_factor = 1.0 / np.sqrt(k.size(-1))  # k.size(-1) is the head_dim
        attn = (q @ k.transpose(-2, -1)) * norm_factor
        attn = attn.masked_fill(self.mask[:, :, :seq_len, :seq_len] == 0, float('-inf'))
        attn = F.softmax(attn, dim=-1)

        # attn: [batch, num_heads, seq_len, seq_len]
        # v:    [batch, num_heads, seq_len, head_dim]
        # y:    [batch, num_heads, seq_len, head_dim]
        y = attn @ v
        y = rearrange(y, 'batch num_heads seq_len head_dim -> batch seq_len (num_heads head_dim)')
        return self.c_proj(y)  # [batch, seq_len, embedding_dim]

## GELU (Gaussian Error Linear Units)
$$ \text{GELU}(x) = x \cdot \Phi(x) $$

Where $ \Phi(x) $ is the CDF. The approximation involves the term $ 0.5 \cdot (1 + \tanh(\sqrt{2/\pi}(x +
0.044715x^3))) $, and the cubic term with 0.044715 helps correct the approximation, particularly in the tails of
the distribution.

In [7]:
class GELU(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        return 0.5 * x * (1.0 + torch.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class QuickGELU(nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        return x * torch.sigmoid(1.702 * x)

## Feed-Forward Network

In [8]:
class FFN(nn.Module):
    def __init__(self, embedding_dim: int, bias: bool = True):
        super().__init__()
        hidden_dim = embedding_dim * 4
        self.c_fc = nn.Linear(embedding_dim, hidden_dim, bias=bias)
        # self.gelu = nn.GELU(approximate='tanh')
        self.gelu = QuickGELU()
        self.c_proj = nn.Linear(hidden_dim, embedding_dim, bias=bias)

    def forward(self, x: Float[Tensor, "batch seq_len embedding_dim"]) -> Float[Tensor, "batch seq_len embedding_dim"]:
        # no skip connection here
        return self.c_proj(self.gelu(self.c_fc(x)))

## LayerNorm

In [9]:
class LayerNorm(nn.Module):
    def __init__(self, embedding_dim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(embedding_dim))  # scaling (gamma)
        self.bias = nn.Parameter(torch.zeros(embedding_dim))  # offset (beta)
        self.eps = eps  # small value to prevent division by zero
    
    def forward(self, x: Float[torch.Tensor, "batch seq_len embedding_dim"]) -> Float[torch.Tensor, "batch seq_len embedding_dim"]:
        mean = x.mean(dim=-1, keepdim=True)  # [batch, seq_len, 1]
        var = x.var(dim=-1, keepdim=True, unbiased=False)  # [batch, seq_len, 1]
        x_norm = (x - mean) / torch.sqrt(var + self.eps)  # [batch, seq_len, embedding_dim]
        return self.weight * x_norm + self.bias

## Softmax and Cross-Entropy Loss

In [10]:
def softmax(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Subtract max value for numerical stability
    x_max = torch.max(x, dim=dim, keepdim=True)[0]
    exp_x = torch.exp(x - x_max)
    
    # Calculate denominator (sum) and normalize
    sum_exp_x = torch.sum(exp_x, dim=dim, keepdim=True)
    return exp_x / sum_exp_x


class CrossEntropyLoss(nn.Module):
    """
    loss = -sum(y_true * log(y_pred))
    """
    def __init__(self):
        super().__init__()
        
    def forward(
        self,
        logits: torch.Tensor,   # Raw logits from model, shape (batch_size, num_classes)
        targets: torch.Tensor,  # Target labels, shape (batch_size,)
    ) -> torch.Tensor:
        # Calculate log probabilities
        log_probs = F.log_softmax(logits, dim=-1)  # (batch_size, num_classes)
        
        # Gather log probabilities of target classes
        # gather operation collects values from log_probs at positions specified by targets
        target_log_probs = log_probs.gather(
            dim=-1,
            index=targets.unsqueeze(-1)
        ).squeeze(-1)        

        loss = -target_log_probs.mean()
        return loss


x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
probs = softmax(x)
print(f"Softmax output: {probs}")
print("Sum of probabilities:", probs.sum(dim=-1))
    
criterion = CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1], [0.1, 2.0, 1.0]])  # (2, 3)
targets = torch.tensor([0, 1])
loss = criterion(logits, targets)
print("\nCross Entropy Loss:", loss.item())

Softmax output: tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])
Sum of probabilities: tensor([1., 1.])

Cross Entropy Loss: 0.4170299470424652


## Single Transformer Decoder Block

In [11]:
class TransformerBlock(nn.Module):
    def __init__(self, num_heads: int, embedding_dim: int, max_seq_len: int = 1024, bias: bool = True):
        super().__init__()
        # self.ln_1 = nn.LayerNorm(embedding_dim, bias=bias)  # norm on the last dim
        # self.ln_2 = nn.LayerNorm(embedding_dim, bias=bias)
        self.ln_1 = LayerNorm(embedding_dim)  # norm on the last dim
        self.ln_2 = LayerNorm(embedding_dim)
        self.attn = CasualSelfAttention(num_heads, embedding_dim, max_seq_len, bias=bias)
        self.mlp = FFN(embedding_dim, bias=bias)
    
    def forward(
            self, 
            x: Float[Tensor, "batch seq_len embedding_dim"],
            mask: Optional[Bool[Tensor, "batch seq_len seq_len"]] = None,
            cache: Optional[Tuple[Tensor, Tensor]] = None
        ) -> Tuple[Float[Tensor, "batch seq_len embedding_dim"], Tuple[Tensor, Tensor]]:
        # skip connection, pre-layer norm
        # x = x + self.attn(self.ln_1(x))
        att, cache = self.attn(self.ln_1(x), mask=mask, cache=cache)
        x = x + att
        x = x + self.mlp(self.ln_2(x))
        return x, cache

## GPT

- GPT2: Decoder only Transformer
- ViT: Encoder only Transformer

<img src="https://www.ericjwang.com/assets/images/gpt_arch.png" width="800">

Image source, FYI, good article: [Historical notes on GPT architecture](https://www.ericjwang.com/2023/01/22/transformers.html)

In [33]:
class GPT(nn.Module):
    def __init__(
            self, 
            vocab_size: int = 50257,
            max_seq_len: int = 1024, 
            embedding_dim: int = 768, 
            num_heads: int = 12, 
            num_layers: int = 12,
            dropout_rate: float = 0.0,
            bias: bool = True
        ):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, embedding_dim),
            wpe = nn.Embedding(max_seq_len, embedding_dim),
            drop = nn.Dropout(dropout_rate),
            h = nn.ModuleList([TransformerBlock(num_heads, embedding_dim, max_seq_len, bias=bias) for _ in range(num_layers)]),
            # ln_f = nn.LayerNorm(embedding_dim, bias=bias)
            ln_f = LayerNorm(embedding_dim)
        ))
        # Equals to x @ wte.weight.T
        self.lm_head = nn.Linear(embedding_dim, vocab_size, bias=False)

    def _forward_transformer_blocks(
            self, 
            x: Float[Tensor, "batch seq_len embedding_dim"],
            mask: Optional[Bool[Tensor, "batch seq_len seq_len"]] = None,
            cache: Optional[List[Tuple[Tensor, Tensor]]] = None,
            build_cache: bool = False
        ) -> Tuple[Float[Tensor, "batch seq_len embedding_dim"], Optional[Tuple[Tensor, Tensor]]]:
        x = self.transformer.drop(x)
        kv_cache = []
        
        if cache is not None:
            for i in range(len(cache)):
                x, cache[i] = self.transformer.h[i](x, mask=None, cache=cache[i])
        else:
            for block in self.transformer.h:
                x, curr_cache = block(x, mask=mask)
                if build_cache:
                    kv_cache.append(curr_cache)
                    
        x = self.transformer.ln_f(x)
        return x, kv_cache if build_cache else cache

    def forward(
            self, 
            x: Float[Tensor, "batch seq_len"],
            mask: Optional[Bool[Tensor, "batch seq_len seq_len"]] = None,
            cache: Optional[List[Tuple[Tensor, Tensor]]] = None,
            build_cache: bool = False
        ) -> Tuple[Float[Tensor, "batch seq_len vocab_size"], Optional[Tuple[Tensor, Tensor]]]:
        batch, seq_len = x.shape
        assert seq_len <= self.max_seq_len, f"input length {seq_len} is longer than max seq length {self.max_seq_len}"

        pos = torch.arange(0, seq_len, device=x.device)
        pos_emb = self.transformer.wpe(pos)  # [seq_len, embedding_dim]
        tok_emb = self.transformer.wte(x)  # [batch, seq_len, embedding_dim]
        x = tok_emb + pos_emb  # [batch, seq_len, embedding_dim]

        x, kv_cache = self._forward_transformer_blocks(x, mask=mask, cache=cache, build_cache=build_cache)

        # Same as: logits = x @ self.wte.weight.T
        logits = self.lm_head(x) # [batch, seq_len, vocab_size]

        if build_cache:
            return logits, kv_cache
        return logits, None

    def _sample_next_token(self, logits: Float[Tensor, "batch seq_len vocab_size"], temperature: float = 0.8) -> Float[Tensor, "batch 1"]:
        logits = logits[:, -1, :]  # [batch, vocab_size]
        probs = torch.softmax(logits * (1 / temperature), dim=-1)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        ix = torch.multinomial(topk_probs, 1)  # [batch, 1]
        xcol = torch.gather(topk_indices, -1, ix)  # [batch, 1]
        return xcol

    def generate(
            self, 
            x: Float[Tensor, "batch seq_len"], 
            max_new_tokens: int = 100, 
            temperature: float = 0.8
        ) -> Generator[
            Float[Tensor, "batch 1"],  # yield
            None,  # generator.send()
            List[Float[Tensor, "batch 1"]]  # generator.throw()
        ]:
        """
        # Method 1: Get tokens one by one using a for loop
        for token in model.generate(input_ids):
            print(token)  # Process each newly generated token in real-time
        
        # Method 2: Get all tokens at once
        tokens = list(model.generate(input_ids))
        """
        logits, cache = self.forward(x, build_cache=True)
        
        tokens = []
        for _ in range(max_new_tokens):
            next_token = self._sample_next_token(logits, temperature)
            yield next_token
            
            tokens.append(next_token)
            
            # forward pass only for the new token
            tok_emb = self.transformer.wte(next_token)  # [batch, 1, embedding_dim]
            pos_emb = self.transformer.wpe(
                torch.tensor([x.size(1)], dtype=torch.long, device=x.device)
            ).unsqueeze(0)  # [1, 1, embedding_dim]
            
            hidden = tok_emb + pos_emb
            
            hidden, cache = self._forward_transformer_blocks(hidden, cache=cache)
            logits = self.lm_head(hidden)
            
            x = torch.cat((x, next_token), dim=1)
            
        del cache
        torch.cuda.empty_cache()
        
        return tokens    
    
    @classmethod
    def from_pretrained(cls, model: Optional[Union[None, "GPT", Type["GPT"]]] = None, rlhf: bool = False, sft: bool = False):
        '''https://youtu.be/l8pRSuU81PU?t=1830
        '''
        if model is None: 
            model = cls(vocab_size=50260) if (rlhf or sft) else cls()
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.mask')]  # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        if sft:
            print("Model type: SFT GPT2")
            model_hf = GPT2LMHeadModel.from_pretrained('vicgalle/gpt2-alpaca-gpt4')
        elif rlhf:
            print("Model type: RLHF GPT2")
            model_hf = GPT2LMHeadModel.from_pretrained('jtatman/gpt2-open-instruct-v1-Anthropic-hh-rlhf')
        else:
            print("Model type: Regular GPT2")
            model_hf = GPT2LMHeadModel.from_pretrained('gpt2')

        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]  # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.mask')]  # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        # print('hf:   ', [k for k in sd_keys_hf if "h.0" in k])
        # print('mine: ', [k for k in sd_keys if "h.0" in k])

        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape, f"{k} shape mismatch: {sd_hf[k].shape[::-1]} != {sd[k].shape}"
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape, f"{k} shape mismatch: {sd_hf[k].shape} != {sd[k].shape}"
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model


model = GPT.from_pretrained()
model.eval()
model.to(device)

Model type: Regular GPT2




GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x TransformerBlock(
        (ln_1): LayerNorm()
        (ln_2): LayerNorm()
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): FFN(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# BPE (Byte Pair Encoding)

```
r"""'s|'t|'re|'ve|'m|'ll|'d  Match common English contractions like 's, 't, 're, 've, 'm, 'll, 'd
\p{L}+                       Match any sequence of Unicode letter characters (like English words)
\p{N}+                       Match any sequence of Unicode numeric characters (like 123, 3.14)
[^\s\p{L}\p{N}]+             Match any sequence of characters that are not whitespace, letters or numbers (like punctuation, special chars)
\s+(?!\S)                    Match consecutive whitespace (not followed by non-whitespace)
\s+                          Match any other consecutive whitespace
 ?                           Match an optional space
"""
```

## GPT Decoding

In [13]:
# NOTE: no kv cache and streaming decode here
def generate_text_simple(
    tokenizer: Any, 
    question: str, 
    model: GPT = model, 
    num_attempt: int = 3,  # num_attempt = batch
    max_length: int = 100
):
    # tokenizer encode
    tokens = tokenizer.encode(question)  # [seq_len]
    tokens = torch.tensor(tokens, dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_attempt, 1)  # [num_attempt, seq_len]
    x = tokens.to(device)

    while x.size(1) < max_length:
        with torch.no_grad():
            logits, _ = model(x)  # [batch, curr_seq_len, vocab_size]

        # take the logits at the last position
        logits = logits[:, -1, :]  # [batch, vocab_size]

        # get the probabilities
        probs = F.softmax(logits, dim=-1)

        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        # turn to zero for all indices below the top-k
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)

        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        # [Multinomial distribution - Wikipedia](https://en.wikipedia.org/wiki/Multinomial_distribution)
        ix = torch.multinomial(topk_probs, 1)  # [batch, 1]

        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix)  # [batch, 1]

        # append to the sequence
        x = torch.cat((x, xcol), dim=1)  # [batch, curr_seq_len + 1]

    # print the generated text
    for i in range(num_attempt):
        tprint(f'{i + 1}th Attempt:')
        tokens = x[i, :max_length].tolist()

        # tokenizer decode
        decoded = tokenizer.decode(tokens)
        print(f"> {decoded}")
        print()

In [14]:
def generate_text(
    tokenizer: Any, 
    question: str, 
    model: GPT = model, 
    num_attempt: int = 3,  # num_attempt = batch
    max_length: int = 100,
    temperature: float = 1.0  # default
):
    """
    https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py

    We need to take care of split-token encoding when streaming decode:
        print(tokenizer.decode([447, 247]))  # ’
        print(tokenizer.decode([447]).encode('utf-8'))  # �
        print(tokenizer.decode([171, 120, 253]))  # ？
    """
    special_sequences = {
        (447, 246): "‘",
        (447, 247): "’",
        (564, 250): "“",
        (447, 251): "”",
    }

    # BOS token ID = 50256
    tokens = tokenizer.encode(question) if question else [50256]
    tokens = torch.tensor(tokens, dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_attempt, 1)  # [num_attempt, seq_len]
    x = tokens.to(device)

    for i in range(num_attempt):
        tprint(f'{i + 1}th Attempt:', c='yellow')
        curr_x = x[i: i+1]  # [1, seq_len]

        # streaming decode
        print(f"> {question}", end="", flush=True)
        token_cache = []
        for token in model.generate(curr_x, max_new_tokens=max_length, temperature=temperature):
            token = token.item()
            token_cache.append(token)
            
            decoded_text = ""
            for seq, char in special_sequences.items():
                # if special_sequences match, decode then reset the entire token_cache
                if len(token_cache) >= len(seq) and \
                   tuple(token_cache[-len(seq):]) == seq:
                    prev_tokens = token_cache[:-len(seq)]
                    if prev_tokens:
                        decoded_text = tokenizer.decode(prev_tokens)
                    decoded_text += char
                    token_cache = []
                    break
            
            # if no special_sequences match, decode then reset the entire token_cache
            # and keep the last token for the next iteration
            if not decoded_text and len(token_cache) >= 3:
                decoded_text = tokenizer.decode(token_cache[:-1])
                token_cache = token_cache[-1:]
                
            # print the decoded text, could be empty string
            if decoded_text:
                print(decoded_text, end="", flush=True)

        # print the remaining tokens in the token_cache
        if token_cache:
            final_text = tokenizer.decode(token_cache)
            if final_text:
                print(final_text, end="", flush=True)
        print()

## Inference: Decoding using Tiktoken

In [15]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [16]:
# generate_text_simple(tokenizer, INPUT_TEXT)
generate_text(tokenizer, INPUT_TEXT)

[93m
> Human: How do I become a gang leader?

Assistant: You are supposed to become gang leader.

Renaissance Man: How?

Assistant: You are supposed to become gang leader.

Renaissance Man: What do I want from this man?

Assistant: Do you want to use money of me as a weapon?

Renaissance Man: What do I want from this man? Do you want to use my soul as a weapon?

Assistant: I got to use my spirit

Rena
[93m
> Human: How do I become a gang leader?

Assistant: You don't think so?

Assistant: But it must be possible! And so I said, we just don't have time to make a mess of it when the times are so good.

Assistant: Heh.

Assistant: And we're still going to save you that mess! The rest of the time you have to work at home making dinner. No need to pay you back for what you did.

Assistant: You're so busy.

Assistant:
[93m
> Human: How do I become a gang leader?

Assistant: A small and small group of folks in your group are responsible for being a gang leader. They act in your direction. 

In [17]:
generate_text(tokenizer, QUESTION)

[93m
> How do I become a gang leader?

A gang leader's best friend is your best friend. Every time you join a party or battle, you'll find one of your best friends in all of them. Now make a deal with your best friend's party to keep him around until you meet them and that's it. They are your friends. The best person in the place is your best friend.

What do you do to prove you can compete in a gang?

You start a new chapter of a gang.
[93m
> How do I become a gang leader?

I've always tried to become a gang leader. I think that they might not care about you, but you know what? Their job is to kill you. They say those guys get off from what they think is their job. So I don't know who's doing it. You guys want to kill me... I guess I'm just... I think they're scared. I guess they're just like 'Oh, I'm a gang leader. I've been doing this for years now
[93m
> How do I become a gang leader?

Crazy gang leaders aren't just bad guys. They are responsible criminals, too. They are the one

### Try if the model can follow the instruction

In [18]:
generate_text(tokenizer, INPUT_TEXT_3, model=model)

[93m
> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Pretend you are an alien visiting Earth. Write three opinions you believe, one sentence for each opinion.

### Response:
1. ��, \

# This is wrong. Please continue!

2. ��, \

# This is not the right answer. This issue is an easy problem if you are familiar with the code.

3. ��, \

# this is an easy problem if you are familiar with the C++ language.

4. ��, \

# this is an easy problem if you have the C
[93m
> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Pretend you are an alien visiting Earth. Write three opinions you believe, one sentence for each opinion.

### Response:
1. __________

2. __________

3. __________

4. __________

5. __________

6. __________

7. __________

8. __________

9. __________

10. __________

11. __________

12. __________

13. __________

14. ____

## OpenAI's Byte Encoder
In utf-8:
- 0-31 are control characters, e.g. \x00 is null, \x01 is start of heading, \x09 is tab etc.
- 32-127 are basic Latin letters, numbers and some punctuation marks
- 128-255 are extended ASCII codes, including accented letters and special characters

In [19]:
def bytes_to_unicode():
    """
    Every possible byte (really an integer 0..255) gets mapped by OpenAI to a unicode
    character that represents it visually.
    """
    # the 188 integers that render fine in their original form and need no shifting
    printable_bytes = \
        list(range(ord("!"), ord("~")+1)) + \
        list(range(ord("¡"), ord("¬")+1)) + \
        list(range(ord("®"), ord("ÿ")+1))

    unicode_chars = printable_bytes[:] 
    shift_count = 0
    for byte in range(256):
        if byte not in printable_bytes:
            # if this byte is "ugly" then map it to the next available "nice" character
            printable_bytes.append(byte)
            unicode_chars.append(256 + shift_count)
            shift_count += 1
            
    unicode_chars = [chr(n) for n in unicode_chars]
    byte_to_char_map = dict(zip(printable_bytes, unicode_chars))
    return byte_to_char_map


# NOTE: Don't be fooled by the printed output, the dict should be {b'\x21': '!', b'\x22': '"', ...} instead of {33: '!', 34: '"', ...}
cprint(bytes_to_unicode()[ord(b'\x21')])
cprint(bytes_to_unicode()[33])

[93m<module> -> bytes_to_unicode()[ord(b'\x21')]:[0m
'!'
[93m<module> -> bytes_to_unicode()[33]:[0m
'!'


In [20]:
cprint(bytes_to_unicode(), use_pprint=False)

[93m<module> -> bytes_to_unicode():[0m
{33: '!', 34: '"', 35: '#', 36: '$', 37: '%', 38: '&', 39: "'", 40: '(', 41: ')', 42: '*', 43: '+', 44: ',', 45: '-', 46: '.', 47: '/', 48: '0', 49: '1', 50: '2', 51: '3', 52: '4', 53: '5', 54: '6', 55: '7', 56: '8', 57: '9', 58: ':', 59: ';', 60: '<', 61: '=', 62: '>', 63: '?', 64: '@', 65: 'A', 66: 'B', 67: 'C', 68: 'D', 69: 'E', 70: 'F', 71: 'G', 72: 'H', 73: 'I', 74: 'J', 75: 'K', 76: 'L', 77: 'M', 78: 'N', 79: 'O', 80: 'P', 81: 'Q', 82: 'R', 83: 'S', 84: 'T', 85: 'U', 86: 'V', 87: 'W', 88: 'X', 89: 'Y', 90: 'Z', 91: '[', 92: '\\', 93: ']', 94: '^', 95: '_', 96: '`', 97: 'a', 98: 'b', 99: 'c', 100: 'd', 101: 'e', 102: 'f', 103: 'g', 104: 'h', 105: 'i', 106: 'j', 107: 'k', 108: 'l', 109: 'm', 110: 'n', 111: 'o', 112: 'p', 113: 'q', 114: 'r', 115: 's', 116: 't', 117: 'u', 118: 'v', 119: 'w', 120: 'x', 121: 'y', 122: 'z', 123: '{', 124: '|', 125: '}', 126: '~', 161: '¡', 162: '¢', 163: '£', 164: '¤', 165: '¥', 166: '¦', 167: '§', 168: '¨', 169:

## BPE Tokenizer

In [21]:
class BPETokenizer:
    """
    https://tiktokenizer.vercel.app/?model=gpt2
    """
    def __init__(self, encoder: dict = None, bpe_merges: dict = None):
        # encoder: map bytes to unicode characters
        # decoder: inverse of encoder
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k,v in self.byte_encoder.items()}

        # encoder: bpe token to index, json dict
        # {... "clud": 758, "tern": 759, "\u0120know": 760 ...}
        # decoder: index to bpe token
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}

        # bpe merge list that defines the bpe "tree"
        # {... Ġre claimed, Ġinteresting ly, × ©, rom y, J M, ĠEnhance ment, ...}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        self.gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.cache = {}

        # ids:     [239, 188, 181, 239, 189, ]
        # ids[1:]: [188, 181, 239, 189, ]
        # pairs: [(239, 188), (188, 181), (181, 239), (239, 189), ]
        self.get_pairs = lambda word: set(zip(word, word[1:]))

    def decode(self, ids: List[int]) -> str:
        if not ids: return ""
        tokens = [self.decoder[i] for i in ids]
        tokens_flat = ''.join(tokens)

        # recovering 'Ġ' -> ' '
        tokens_bytes = bytearray([self.byte_decoder[c] for c in tokens_flat])
        return tokens_bytes.decode('utf-8', errors='replace')

    def bpe_merge(self, token: str) -> str:
        if token in self.cache:
            return self.cache[token]

        word = tuple(token)
        pairs = self.get_pairs(word)
        if not pairs: return token

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))

            if bigram not in self.bpe_ranks: break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):

                # find the next occurence of first in the sequence of current words
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                # if this occurence is also followed by second, then merge them into one
                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1

            # all occurences of (first, second) have been merged to first_second
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = self.get_pairs(word)

        # concat all words into a string, and use ' ' as the separator. Note that
        # by now all characters have been byte encoded, guaranteeing that ' ' is
        # not used in the actual data and is a 'special' delimiter character
        word = ' '.join(word)

        # cache the result and return
        self.cache[token] = word
        return word

    def encode(self, text: str) -> List[int]:
        bpe_idx = []
        # pre-tokenize the input text into a list of string tokens, this is the minimum unit of tokenization
        # input: "Hello've world123!!!?    "
        # output: ['Hello', "'ve", ' world', '123', '!!!', '?', '    ']
        tokens = re.findall(self.gpt2pat, text)

        for token in tokens:
            # char to bytes
            token_bytes = token.encode('utf-8')

            # apply the openai byte encoder to the token, ' word' -> 'Ġword'
            token_translated = ''.join(self.byte_encoder[b] for b in token_bytes)

            # perform all the applicable bpe merges according to self.bpe_ranks
            # 'interestingly' -> 'interest' + 'ingly'
            token_merged = self.bpe_merge(token_translated).split(' ')

            # translate all bpe tokens to integers
            # 'interest' + 'ingly' -> [9446, 4420]
            token_ix = [self.encoder[bpe_token] for bpe_token in token_merged]

            # extend our running list of all output integers
            bpe_idx.extend(token_ix)
        return bpe_idx

    @classmethod
    def from_pretrained(cls, rlhf_token=False):
        data_dir = './checkpoint/gpt2_tokenizer/'
        os.makedirs(data_dir, exist_ok=True)

        # load encoder.json that has the raw mappings from token -> bpe index
        encoder_path = os.path.join(data_dir, 'encoder.json')
        if not os.path.isfile(encoder_path):
            encoder_remote_url = 'https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json'
            response = requests.get(encoder_remote_url)
            open(encoder_path, "wb").write(response.content)
        with open(encoder_path, 'r') as f:
            encoder = json.load(f)
        assert len(encoder) == 50257  # 256 individual byte tokens, 50,000 merged tokens, and 1 special <|endoftext|> token

        if rlhf_token:
            encoder["### End"] = 50257
            encoder["### Instruction:"] = 50258
            encoder["### Response:\n"] = 50259

        # load vocab.bpe that contains the bpe merges, i.e. the bpe tree structure
        vocab_path = os.path.join(data_dir, 'vocab.bpe')
        if not os.path.isfile(vocab_path):
            vocab_remote_url = 'https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe'
            response = requests.get(vocab_remote_url)
            open(vocab_path, "wb").write(response.content)
        with open(vocab_path, 'r', encoding="utf-8") as f:
            bpe_data = f.read()
        bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
        assert len(bpe_merges) == 50000  # 50,000 merged tokens

        # construct the Encoder object and return
        enc = BPETokenizer(encoder, bpe_merges)
        return enc


tokenizer_2 = BPETokenizer.from_pretrained()

## Inference: Replace Tiktoken with Our Tokenizer

In [22]:
generate_text(tokenizer_2, INPUT_TEXT)

[93m
> Human: How do I become a gang leader?

Assistant: By trying to stop them from destroying my house for me!

I am the Gang Leader.

Feminism: It's because my mind is not right after all!

I am a Feminist.

Feminism: No, I am not a man.

Feminism: I'm an engineer.

Feminism: A woman. What do you mean?

S: A woman, please. No, I'm not.


[93m
> Human: How do I become a gang leader?

Assistant: (Laughs)

Michele Martin: Yeah!

Assistant: (Laughs)

Michele Martin: (Laughs)

Michele Martin: Then you're in a club, and you're in a group with some of these kids because they're all being abused. Then the children have to know this, and then they'll come back.

Assistant: (Laughs)

Michele Martin: So when you were at a club and you had
[93m
> Human: How do I become a gang leader?

Assistant: You start the gang.

Female: ...

Assistant: You grow up to be a gang leader.

Female: No.

Assistant: That was a lot of fun.

Assistant: And a lot of fun.

Female: ...It was fun.

Assistant: You are

## BPE Training

```python
def get_stats(ids):
    counts = {}
    # Pythonic way to iterate consecutive elements
    # ids:     [239, 188, 181, 239, 189, ]
    # ids[1:]: [188, 181, 239, 189, ]
    # pairs: [(239, 188), (188, 181), (181, 239), (239, 189), ]
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def single_merge(ids, pair, idx):
    # in the list of ints (ids), replace all consecutive occurences of pair with the new token idx
    # single_merge([5, 6, 6, 7, 9, 1], (6, 7), 99) -> [5, 6, 99, 9, 1]
    newids = []
    i = 0
    while i < len(ids):
        # if we are not at the very last position AND the pair matches, replace it
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# top_pair = max(stats, key=stats.get)
# tokens2 = merge(tokens, top_pair, 256)
```