### Byte Pair Encoding (BPE)
• Example corpus:
old finest older finest old lowest finest finest finest old finest old older
finest old lowest older old finest old lowest finest
• Pre-tokenization (add a special symbol _ at the end to
identify word boundary)
• old_: 7
• older_: 3
• finest_: 9
• lowest_: 4

In [None]:
# from collections import Counter

# text = "old finest older finest old lowest finest finest finest old finest old older finest old lowest older old finest old lowest finest"


In [10]:
from collections import Counter, defaultdict

def get_stats(vocab):
    """
    Get frequency of adjacent character pairs in the vocabulary.
    """
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            pairs[pair] += freq
    return pairs

def merge_vocab(pair, vocab):
    """
    Merge the most frequent pair in the vocabulary.
    """
    new_vocab = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    for word in vocab:
        # Replace the bigram with the merged token
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = vocab[word]
    return new_vocab

# Initialize the vocabulary with character-level tokens
corpus = "banana_bandana_"
vocab = Counter([' '.join(corpus)])  # Split characters with space

print("Initial Vocabulary:", vocab)

num_merges = 6  # Number of merges to perform
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    # Get the most frequent pair
    most_frequent = max(pairs, key=pairs.get)
    print(f"Step {i + 1}: Most frequent pair: {most_frequent}")
    
    # Merge the pair in the vocabulary
    vocab = merge_vocab(most_frequent, vocab)
    print(f"Step {i + 1}: Vocabulary: {vocab}")

# Extract the final set of subword units
subwords = set(''.join(word.split()) for word in vocab)
print("\nFinal set of subword units:", subwords)


Initial Vocabulary: Counter({'b a n a n a _ b a n d a n a _': 1})
Step 1: Most frequent pair: ('a', 'n')
Step 1: Vocabulary: {'b an an a _ b an d an a _': 1}
Step 2: Most frequent pair: ('b', 'an')
Step 2: Vocabulary: {'ban an a _ ban d an a _': 1}
Step 3: Most frequent pair: ('an', 'a')
Step 3: Vocabulary: {'banan a _ ban d ana _': 1}
Step 4: Most frequent pair: ('banan', 'a')
Step 4: Vocabulary: {'banana _ ban d ana _': 1}
Step 5: Most frequent pair: ('banana', '_')
Step 5: Vocabulary: {'banana_ ban d ana _': 1}
Step 6: Most frequent pair: ('banana_', 'ban')
Step 6: Vocabulary: {'banana_ban d ana _': 1}

Final set of subword units: {'banana_bandana_'}
