# Byte-Pair Encoding (BPE) Tokenization



In [1]:
import collections

# Toy training corpus: five words
corpus = ["Betty", "Botter", "had", "some", "butter"]

# Represent every word as a list of single characters followed by the
# end-of-word marker '</w>'
corpus = [[*word, '</w>'] for word in corpus]

# Show the character-level starting point
print(corpus)


[['B', 'e', 't', 't', 'y', '</w>'], ['B', 'o', 't', 't', 'e', 'r', '</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 't', 't', 'e', 'r', '</w>']]


## Frequency Count

In [2]:
def get_stats(corpus):
    """Count adjacent symbol pairs across every word in the corpus.

    Args:
        corpus: Iterable of words, each a sequence of symbols (for example
            a list of strings).

    Returns:
        A ``collections.defaultdict(int)`` mapping each ``(left, right)``
        tuple of neighbouring symbols to the number of times it occurs.
        Keys appear in first-seen order; words with fewer than two symbols
        contribute nothing.
    """
    pair_counts = collections.defaultdict(int)
    for symbols in corpus:
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[(left, right)] += 1
    return pair_counts

# Count how often each adjacent symbol pair occurs in the current corpus
pairs = get_stats(corpus)
print(pairs)


defaultdict(<class 'int'>, {('B', 'e'): 1, ('e', 't'): 1, ('t', 't'): 3, ('t', 'y'): 1, ('y', '</w>'): 1, ('B', 'o'): 1, ('o', 't'): 1, ('t', 'e'): 2, ('e', 'r'): 2, ('r', '</w>'): 2, ('h', 'a'): 1, ('a', 'd'): 1, ('d', '</w>'): 1, ('s', 'o'): 1, ('o', 'm'): 1, ('m', 'e'): 1, ('e', '</w>'): 1, ('b', 'u'): 1, ('u', 't'): 1})


## Merging Pairs

In [3]:
def merge_pair(pair, corpus):
    """Return a new corpus in which each adjacent occurrence of ``pair`` is fused.

    Args:
        pair: Tuple ``(left, right)`` of the two symbols to merge. It is
            compared against a tuple, so it must itself be a tuple.
        corpus: Iterable of words, each an indexable sequence of symbols.

    Returns:
        A new list of new word lists; the input is not modified. Each word is
        scanned left to right and matches are consumed greedily, so
        overlapping occurrences (e.g. ``a a a`` with pair ``(a, a)``) merge
        only the leftmost one.
    """
    merged_corpus = []
    for symbols in corpus:
        last = len(symbols) - 1
        merged = []
        pos = 0
        while pos <= last:
            # A match needs a right-hand neighbour, hence the bound check.
            if pos < last and (symbols[pos], symbols[pos + 1]) == pair:
                merged.append(symbols[pos] + symbols[pos + 1])
                pos += 2  # skip both symbols that were just fused
                continue
            merged.append(symbols[pos])
            pos += 1
        merged_corpus.append(merged)
    return merged_corpus

# Merge the most frequent pair. For this corpus that is ('t', 't'), which
# occurs 3 times; if several pairs tie, max() returns the first one in `pairs`.
# NOTE: this rebinds `corpus`, so `pairs` is stale until get_stats runs again.
most_freq_pair = max(pairs, key=pairs.get)
corpus = merge_pair(most_freq_pair, corpus)
print(corpus)


[['B', 'e', 'tt', 'y', '</w>'], ['B', 'o', 'tt', 'e', 'r', '</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tt', 'e', 'r', '</w>']]


## Final Step

In [4]:
# Repeatedly merge the currently most frequent adjacent pair. The loop ends
# after `num_merges` merges or as soon as get_stats reports no pairs; there is
# no frequency threshold, so pairs that occur only once are merged as well.
num_merges = 10  # Upper bound on the number of merge steps
for i in range(num_merges):
    # Recount on the current corpus: the previous merge changed the pairs
    pairs = get_stats(corpus)
    if not pairs:
        # Nothing left to merge
        break
    # On ties, max() keeps the first pair encountered while iterating `pairs`
    most_freq_pair = max(pairs, key=pairs.get)
    corpus = merge_pair(most_freq_pair, corpus)
    print(f"After merge {i+1}: {corpus}")


After merge 1: [['B', 'e', 'tt', 'y', '</w>'], ['B', 'o', 'tte', 'r', '</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tte', 'r', '</w>']]
After merge 2: [['B', 'e', 'tt', 'y', '</w>'], ['B', 'o', 'tter', '</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tter', '</w>']]
After merge 3: [['B', 'e', 'tt', 'y', '</w>'], ['B', 'o', 'tter</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tter</w>']]
After merge 4: [['Be', 'tt', 'y', '</w>'], ['B', 'o', 'tter</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tter</w>']]
After merge 5: [['Bett', 'y', '</w>'], ['B', 'o', 'tter</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tter</w>']]
After merge 6: [['Betty', '</w>'], ['B', 'o', 'tter</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e', '</w>'], ['b', 'u', 'tter</w>']]
After merge 7: [['Betty</w>'], ['B', 'o', 'tter</w>'], ['h', 'a', 'd', '</w>'], ['s', 'o', 'm', 'e',