### BPE

Tokenizer (including GPT-4o): https://tiktokenizer.vercel.app/?model=gpt2

In [1]:
!pip install tiktoken -q

In [2]:
sentence = "Is the distance between Bengaluru and Delhi more than 2000 kms?"

In [3]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)
decoded_tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
print("Tokens:", decoded_tokens)

Token IDs: [3792, 262, 5253, 1022, 28630, 14717, 290, 12517, 517, 621, 4751, 479, 907, 30]
Tokens: ['Is', ' the', ' distance', ' between', ' Bengal', 'uru', ' and', ' Delhi', ' more', ' than', ' 2000', ' k', 'ms', '?']


In [4]:
import tiktoken
tokenizer = tiktoken.get_encoding("o200k_base") # GPT-4o mini
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)
decoded_tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
print("Tokens:", decoded_tokens)

Token IDs: [3031, 290, 9324, 2870, 174589, 326, 30076, 945, 1572, 220, 1179, 15, 109434, 30]
Tokens: ['Is', ' the', ' distance', ' between', ' Bengaluru', ' and', ' Delhi', ' more', ' than', ' ', '200', '0', ' kms', '?']


In [5]:
tokenizer.decode([64])

'a'

In [6]:
import unicodedata

def get_stats(ids, counts=None):
    counts = {} if counts is None else counts
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    newids = []
    i = 0
    while i < len(ids):
        if ids[i] == pair[0] and i < len(ids) - 1 and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

def train(text, vocab_size, verbose=False):
    assert vocab_size >= 256
    num_merges = vocab_size - 256
    if verbose:
      print(f'Text: {list(text)}')
    text_bytes = text.encode("utf-8")
    ids = list(text_bytes)
    if verbose:
      print(f'ids: {ids}')
    merges = {}
    vocab = {idx: bytes([idx]) for idx in range(256)}
    for i in range(num_merges):
      if verbose:
        print(f'\nIteration {str(i+1)}:')
      stats = get_stats(ids)
      temp = [{(decode([int(k[0])],vocab),decode([int(k[1])],vocab)):v} for k,v in stats.items()]
      if verbose:
        print(f'Frequency of pairs: {temp}')
      pair = max(stats, key=stats.get)
      idx = 256 + i
      if verbose:
        print(f'Merging pairs: {(decode([int(pair[0])],vocab),decode([int(pair[1])],vocab))} ==> {idx}')
      ids = merge(ids, pair, idx)
      if verbose:
        print(f'Merged ids: {ids}')
      merges[pair] = idx
      vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
      toks = [decode([int(id)],vocab) for id in ids]
      if verbose:
        print(f'Compressed Text: {toks}')
    return merges, vocab

def encode(text, merges):
    text_bytes = text.encode("utf-8")
    ids = list(text_bytes)
    while len(ids) >= 2:
        stats = get_stats(ids)
        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
        if pair not in merges:
            break
        idx = merges[pair]
        ids = merge(ids, pair, idx)
    return ids

def decode(ids, vocab):
    text_bytes = b"".join(vocab[idx] for idx in ids)
    text = text_bytes.decode("utf-8", errors="replace")
    return text

In [7]:
mergesRequired=3
merges, vocab = train('pay papaya', vocab_size = 256+mergesRequired, verbose=True)
print(f'\nMerged ids to new id: {list(merges.items())[-mergesRequired:]}')
print(f'\nNew vocabulary (id, byte):{list(vocab.items())[-mergesRequired:]}')

Text: ['p', 'a', 'y', ' ', 'p', 'a', 'p', 'a', 'y', 'a']
ids: [112, 97, 121, 32, 112, 97, 112, 97, 121, 97]

Iteration 1:
Frequency of pairs: [{('p', 'a'): 3}, {('a', 'y'): 2}, {('y', ' '): 1}, {(' ', 'p'): 1}, {('a', 'p'): 1}, {('y', 'a'): 1}]
Merging pairs: ('p', 'a') ==> 256
Merged ids: [256, 121, 32, 256, 256, 121, 97]
Compressed Text: ['pa', 'y', ' ', 'pa', 'pa', 'y', 'a']

Iteration 2:
Frequency of pairs: [{('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', 'pa'): 1}, {('pa', 'pa'): 1}, {('y', 'a'): 1}]
Merging pairs: ('pa', 'y') ==> 257
Merged ids: [257, 32, 256, 257, 97]
Compressed Text: ['pay', ' ', 'pa', 'pay', 'a']

Iteration 3:
Frequency of pairs: [{('pay', ' '): 1}, {(' ', 'pa'): 1}, {('pa', 'pay'): 1}, {('pay', 'a'): 1}]
Merging pairs: ('pay', ' ') ==> 258
Merged ids: [258, 256, 257, 97]
Compressed Text: ['pay ', 'pa', 'pay', 'a']

Merged ids to new id: [((112, 97), 256), ((256, 121), 257), ((257, 32), 258)]

New vocabulary (id, byte):[(256, b'pa'), (257, b'pay'), (258, b'pay ')]


In [8]:
mergesRequired=3
merges, vocab = train('25 pay 5 papaya', vocab_size = 256+mergesRequired, verbose=True)
print(f'\nMerged ids to new id: {list(merges.items())[-mergesRequired:]}')
print(f'\nNew vocabulary (id, byte):{list(vocab.items())[-mergesRequired:]}')

Text: ['2', '5', ' ', 'p', 'a', 'y', ' ', '5', ' ', 'p', 'a', 'p', 'a', 'y', 'a']
ids: [50, 53, 32, 112, 97, 121, 32, 53, 32, 112, 97, 112, 97, 121, 97]

Iteration 1:
Frequency of pairs: [{('2', '5'): 1}, {('5', ' '): 2}, {(' ', 'p'): 2}, {('p', 'a'): 3}, {('a', 'y'): 2}, {('y', ' '): 1}, {(' ', '5'): 1}, {('a', 'p'): 1}, {('y', 'a'): 1}]
Merging pairs: ('p', 'a') ==> 256
Merged ids: [50, 53, 32, 256, 121, 32, 53, 32, 256, 256, 121, 97]
Compressed Text: ['2', '5', ' ', 'pa', 'y', ' ', '5', ' ', 'pa', 'pa', 'y', 'a']

Iteration 2:
Frequency of pairs: [{('2', '5'): 1}, {('5', ' '): 2}, {(' ', 'pa'): 2}, {('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', '5'): 1}, {('pa', 'pa'): 1}, {('y', 'a'): 1}]
Merging pairs: ('5', ' ') ==> 257
Merged ids: [50, 257, 256, 121, 32, 257, 256, 256, 121, 97]
Compressed Text: ['2', '5 ', 'pa', 'y', ' ', '5 ', 'pa', 'pa', 'y', 'a']

Iteration 3:
Frequency of pairs: [{('2', '5 '): 1}, {('5 ', 'pa'): 2}, {('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', '5 '): 1}, {('pa', 'pa

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Trained for Vocab of 512
text = open("/content/drive/MyDrive/india.txt", "r", encoding="utf-8").read()
merges, vocab = train(text, vocab_size=512, verbose=False)
token_ids = encode(sentence, merges)
tokens = [decode([tokenID], vocab) for tokenID in token_ids]
print('Tokens generated:', tokens)
print('Decoded sentence:', decode(token_ids, vocab))

Tokens generated: ['I', 's ', 'the ', 'di', 'st', 'anc', 'e ', 'b', 'et', 'w', 'e', 'en ', 'B', 'en', 'g', 'al', 'ur', 'u', ' and ', 'D', 'el', 'hi', ' m', 'or', 'e ', 'th', 'an ', '20', '00 ', 'k', 'm', 's', '?']
Decoded sentence: Is the distance between Bengaluru and Delhi more than 2000 kms?


In [14]:
# Trained for Vocab of 1024
text = open("/content/drive/MyDrive/india.txt", "r", encoding="utf-8").read()
merges, vocab = train(text, vocab_size=1024, verbose=False)
token_ids = encode(sentence, merges)
tokens = [decode([tokenID], vocab) for tokenID in token_ids]
print('Tokens generated:', tokens)
print('Decoded sentence:', decode(token_ids, vocab))

Tokens generated: ['I', 's ', 'the ', 'di', 'st', 'ance ', 'between ', 'Beng', 'al', 'ur', 'u', ' and ', 'Delhi', ' m', 'or', 'e ', 'than ', '20', '00 ', 'k', 'm', 's', '?']
Decoded sentence: Is the distance between Bengaluru and Delhi more than 2000 kms?


In [None]:
# Difference between RegexTokeniser - GPT-3

### WordPiece

In [15]:
# Likelihood score instead of frequency
# standard special characters like !?@~
# special tokens used for BERT, ex : [SEP], [CLS], [MASK], [UNK], [PAD], [EOS]

In [16]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# https://www.tensorflow.org/text/api_docs/python/text/BertTokenizer
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)
decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("Tokens:", decoded_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token IDs: [101, 2003, 1996, 3292, 2090, 8191, 14129, 1998, 6768, 2062, 2084, 2456, 2463, 2015, 1029, 102]
Tokens: ['[CLS]', 'is', 'the', 'distance', 'between', 'bengal', '##uru', 'and', 'delhi', 'more', 'than', '2000', 'km', '##s', '?', '[SEP]']


In [17]:
import math
from collections import defaultdict

def get_token_probabilities(vocab):
    """Calculates probabilities of tokens in the vocabulary.

    Args:
        vocab: A dictionary where keys are tokens (strings) and values are counts.

    Returns:
        A dictionary where keys are tokens (strings) and values are their probabilities.
    """
    total_count = sum(vocab.values())
    return {token: count / total_count if total_count > 0 else 0 for token, count in vocab.items()} # Added a check to handle an empty vocabulary

def get_stats_wordpiece(ids, vocab, vocab_id_reverse):
    """Calculates pair statistics for WordPiece, considering likelihood.

    Args:
        ids: A list of token IDs representing the text.
        vocab: A dictionary of token counts {token_string: count}.
        vocab_id_reverse: A dictionary for reverse lookup of id to token string {id: token_string}

    Returns:
        A dictionary where keys are pairs of token IDs and values are their likelihood scores.
    """
    token_probabilities = get_token_probabilities(vocab)

    pair_counts = defaultdict(int)
    pair_scores = {}

    for i in range(len(ids) - 1):
        pair = (ids[i], ids[i + 1])
        pair_counts[pair] += 1

    for pair, count in pair_counts.items():
        token1 = pair[0]
        token2 = pair[1]
        token1_str = vocab_id_reverse[token1] if isinstance(token1, int) else token1
        token2_str = vocab_id_reverse[token2] if isinstance(token2, int) else token2

        combined_token = token1_str + token2_str

        prob_combined = vocab.get(combined_token, 0) / sum(vocab.values()) if vocab.get(combined_token, 0) != 0 else 1e-7
        prob1 = token_probabilities.get(token1_str, 1e-7)
        prob2 = token_probabilities.get(token2_str, 1e-7)

        score = prob_combined / (prob1 * prob2) if prob1 != 0 and prob2 != 0 else 0

        pair_scores[pair] = score

    return pair_scores

def merge_wordpiece(ids, pair, new_token, vocab, vocab_id_reverse):
    """Merges a pair of tokens in the list of IDs and Updates vocabulary.

    Args:
        ids: A list of token IDs representing the text.
        pair: A tuple of two token IDs representing the pair to merge.
        new_token: The new token string representing the merged pair.
        vocab: The vocabulary dictionary {token_string: count}.
        vocab_id_reverse: The vocabulary id reverse dictionary {id : token_string}.

    Returns:
        A new list of token IDs with the pair merged, and updated vocab and vocab_id_reverse.
    """
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            new_ids.append(new_token)  # Append new token string
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    # Add the merged tokens to vocab and vocab_id_reverse
    if new_token not in vocab:
        new_id = max(vocab_id_reverse.keys()) + 1 if vocab_id_reverse else 0
        vocab[new_token] = 0 # Initialize count for new token
    for i in range(len(new_ids)):
        if(new_ids[i] == new_token):
            vocab[new_token] +=1
    if new_token not in vocab_id_reverse.values():
      new_id = max(vocab_id_reverse.keys()) + 1 if vocab_id_reverse else 0
      vocab_id_reverse[new_id] = new_token

    new_ids_int = []
    for i in range(len(new_ids)):
        if isinstance(new_ids[i], int):
            new_ids_int.append(new_ids[i])
        else:
            key = [key for key, value in vocab_id_reverse.items() if value == new_ids[i]]
            if(key):
                new_ids_int.append(key[0])
            else:
                print("Error")

    return new_ids_int, vocab, vocab_id_reverse

def train_wordpiece(text, vocab_size, verbose=False):
    """Trains a WordPiece tokenizer.

        Args:
            text: The input text string.
            vocab_size: The target vocabulary size.
            verbose: Whether to print progress information.

        Returns:
            A tuple containing:
              - merges: A dictionary where keys are pairs of token IDs and values
              are the new token string they are merged into
              - vocab: A dictionary of token counts {token_string: count}
               - vocab_id_reverse : A dictionary of token id reverse map id: token
        """

    # 1. Initial Vocabulary (with counts)
    vocab = defaultdict(int)
    for char in text:
        vocab[char] += 1

    # Initialize vocab_id_reverse here
    vocab_id_reverse = {idx: char for idx, char in enumerate(vocab.keys())}

    # 2. Initial IDs
    ids = [ [key for key, value in vocab_id_reverse.items() if value == char ][0]for char in text]

    merges = {}
    num_merges = vocab_size - len(vocab)

    for i in range(num_merges):
        # 3. Get Pair Statistics (using likelihood)
        pair_scores = get_stats_wordpiece(ids, vocab, vocab_id_reverse)
        if not pair_scores:
            break  # Stop if no more pairs can be merged

        # 4. Find Best Pair
        pair = max(pair_scores, key=pair_scores.get)

        token1 = pair[0]
        token2 = pair[1]
        token1_str = vocab_id_reverse[token1] if isinstance(token1, int) else token1
        token2_str = vocab_id_reverse[token2] if isinstance(token2, int) else token2

        # 5. Create New Token
        new_token = token1_str + token2_str

        # 6. Merge IDs and Update Vocabulary
        ids, vocab, vocab_id_reverse = merge_wordpiece(ids, pair, new_token, vocab, vocab_id_reverse)

        # 7. Store Merge
        merges[pair] = new_token

        if verbose:
            token1_for_print = vocab_id_reverse[pair[0]] if isinstance(pair[0],int) else pair[0]
            token2_for_print = vocab_id_reverse[pair[1]] if isinstance(pair[1],int) else pair[1]
            print(
                f"Merge {i + 1}/{num_merges}: ({token1_for_print}, {token2_for_print}) -> {new_token} (likelihood: {pair_scores[pair]:.4f})"
            )

    return merges, vocab, vocab_id_reverse

# Example Usage
text = "pay papaya pay papaya pay papaya pay papaya"
vocab_size = 5
merges, vocab, vocab_id_reverse = train_wordpiece(text, vocab_size, verbose=True)
print("Final Merges:", merges)
print("Final Vocab:", vocab)
print("Final Vocab reverse:", vocab_id_reverse)

Merge 1/1: (y,  ) -> y  (likelihood: 0.0000)
Final Merges: {(2, 3): 'y '}
Final Vocab: defaultdict(<class 'int'>, {'p': 12, 'a': 16, 'y': 8, ' ': 7, 'y ': 4})
Final Vocab reverse: {0: 'p', 1: 'a', 2: 'y', 3: ' ', 4: 'y '}


### Transformers

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

In [None]:
embedding_matrix = model.transformer.wte.weight
embedding_matrix[0]

In [None]:
len(embedding_matrix) # Total tokens in the vocabulary pf GPT-2

In [None]:
sentence = "BPE BPE & WordPiece are tokenization methods used in GPT & BERT respectively."
inputs = tokenizer(sentence, return_tensors='pt')
input_ids = inputs['input_ids']
input_ids

In [None]:
decoded_sentence = tokenizer.decode([11571], skip_special_tokens=True)
print("Decoded Sentence:\n", f'---{decoded_sentence}---')

In [None]:
# Token embeddings
token_embeddings = model.transformer.wte(input_ids)
print("Token Embeddings:\n", token_embeddings)

In [None]:
len(token_embeddings[0])

In [None]:
token_embeddings[0]

In [None]:
import torch.nn.functional as F
def decode_embedding_to_token(embedding, model, tokenizer):
    embedding_matrix = model.transformer.wte.weight
    similarities = F.cosine_similarity(embedding, embedding_matrix, dim=-1)
    token_id = torch.argmax(similarities).item()
    token = tokenizer.decode([token_id])
    return token, token_id

In [None]:
embedding = token_embeddings[0][1][:]
token, token_id = decode_embedding_to_token(embedding, model, tokenizer)

print(f"Decoded Token: {token}")
print(f"Token ID: {token_id}")

In [None]:
# Positional encodings
position_ids = torch.arange(0, input_ids.size(-1), dtype=torch.long).unsqueeze(0)
position_ids

In [None]:
positional_encodings = model.transformer.wpe(position_ids)
print("Positional Encodings:\n", positional_encodings)

In [None]:
len(positional_encodings[0][0])

In [None]:
def decode_positional_embedding(positional_embedding, model):
    positional_embedding_matrix = model.transformer.wpe.weight
    similarities = F.cosine_similarity(positional_embedding, positional_embedding_matrix, dim=-1)
    position = torch.argmax(similarities).item()
    return position

In [None]:
positional_embedding = positional_encodings[0][0][:]
position = decode_positional_embedding(positional_embedding, model)

print(f"Decoded Position: {position}")

In [None]:
import numpy as np
import torch

def get_positional_encoding(seq_len, d_model):
    positional_encoding = np.zeros((seq_len, d_model))
    for pos in range(seq_len):
        for i in range(0, d_model, 2):
            positional_encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i) / d_model)))
            if i + 1 < d_model:
                positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i) / d_model)))

    return torch.tensor(positional_encoding, dtype=torch.float32)

seq_len = 21
d_model = 768
positional_encoding = get_positional_encoding(seq_len, d_model)
print(positional_encoding)

In [None]:
# Combine token embeddings and positional encodings
combined_embeddings = token_embeddings + positional_encodings
print("Combined Embeddings:\n", combined_embeddings)