In [43]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [44]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [45]:
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    word_offset_list = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    words = [word for word, offset in word_offset_list]
    for word in words:
        word_freqs[word] += 1
print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [46]:
base_vocab = []
for word, freqs in word_freqs.items():
    for char in word:
        if char not in base_vocab:
            base_vocab.append(char)

base_vocab.sort()

In [47]:
vocab = ["<|startoftext|>", "<|endoftext|>"] + base_vocab.copy()
splits = {word: [c for c in word] for word in word_freqs.keys()}

def compute_pair_freqs(splits):
    pairs = defaultdict(int)
    for word, split in splits.items():
        if len(word)==1:
            continue
        else:
            for i in range(len(split)-1):
                pairs[(split[i], split[i+1])] += 1
    return pairs

pair_freqs = compute_pair_freqs(splits)

In [48]:
max_freq = 0
best_pair = ""
for pair, freq in pair_freqs.items():
    if freq > max_freq:
        best_pair = pair[0]+pair[1]
        max_freq = freq
best_pair, max_freq

('Ġt', 7)

In [49]:
vocab.append("Ġt")

In [50]:
def merge_pair(a, b, split):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i=0
        while i<len(split)-1:
            if split[i] == a and split[i+1] == b:
                split = split[:i] + [a+b] + split[i+2:]
            else:
                i += 1
        splits[word] = split
    return splits

splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [65]:
path = "/Data/deeksha/disha/code_p/transformers/train.en"
with open(path, "r") as f:
    corpus = f.readlines()
corpus = [line.strip() for line in corpus]
corpus[:5]

['Two young, White males are outside near many bushes.',
 'Several men in hard hats are operating a giant pulley system.',
 'A little girl climbing into a wooden playhouse.',
 'A man in a blue shirt is standing on a ladder cleaning a window.',
 'Two men are at the stove preparing food.']

In [67]:
from transformers import AutoTokenizer
from collections import defaultdict


vocab_size = 10000
merges = defaultdict(list)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

word_freqs = defaultdict(int)
for text in corpus:
    word_offset_list = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    words = [word for word, offset in word_offset_list]
    for word in words:
        word_freqs[word] += 1
print(word_freqs)

base_vocab = []
for word, freqs in word_freqs.items():
    for char in word:
        if char not in base_vocab:
            base_vocab.append(char)

base_vocab.sort()

vocab = ["<|startoftext|>", "<|endoftext|>"] + base_vocab.copy()
splits = {word: [c for c in word] for word in word_freqs.keys()}

def compute_pair_freqs(splits):
    pairs = defaultdict(int)
    for word, split in splits.items():
        if len(word)==1:
            continue
        else:
            for i in range(len(split)-1):
                pairs[(split[i], split[i+1])] += 1
    return pairs

pair_freqs = compute_pair_freqs(splits)

def merge_pair(a, b, split):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i=0
        while i<len(split)-1:
            if split[i] == a and split[i+1] == b:
                split = split[:i] + [a+b] + split[i+2:]
            else:
                i += 1
        splits[word] = split
    return splits

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = 0
    for pair, freq in pair_freqs.items():
        if max_freq is not None and freq > max_freq:
            best_pair = (pair[0], pair[1])
            max_freq = freq
    splits = merge_pair(best_pair[0], best_pair[1], splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    best_pair = best_pair[0] + best_pair[1]
    vocab.append(best_pair)



In [68]:
merges

defaultdict(list,
            {('i', 'n'): 'in',
             ('e', 'r'): 'er',
             ('Ġ', 's'): 'Ġs',
             ('in', 'g'): 'ing',
             ('e', 's'): 'es',
             ('Ġ', 'c'): 'Ġc',
             ('e', 'd'): 'ed',
             ('a', 'r'): 'ar',
             ('a', 'n'): 'an',
             ('Ġ', 'p'): 'Ġp',
             ('o', 'n'): 'on',
             ('a', 't'): 'at',
             ('Ġ', 'b'): 'Ġb',
             ('e', 'n'): 'en',
             ('o', 'r'): 'or',
             ('Ġ', 't'): 'Ġt',
             ('a', 'l'): 'al',
             ('l', 'e'): 'le',
             ('Ġ', 'f'): 'Ġf',
             ('r', 'e'): 're',
             ('Ġ', 'd'): 'Ġd',
             ('Ġ', 'm'): 'Ġm',
             ('r', 'o'): 'ro',
             ('s', 't'): 'st',
             ('i', 'c'): 'ic',
             ('e', 'l'): 'el',
             ('Ġ', 'h'): 'Ġh',
             ('a', 'c'): 'ac',
             ('i', 't'): 'it',
             ('er', 's'): 'ers',
             ('u', 'n'): 'un',
             ('Ġ'

In [69]:
len(merges)

9918

In [70]:
def tokenize(text):
    tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    tokenized_text = [word for word, offset in tokenize_result]
    splits = [[c for c in word] for word in tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split)-1:
                if split[i] == pair[0] and split[i+1] == pair[1]:
                    split = split[:i] + [merge] + split[i+2:]
                else:
                    i += 1
            splits[idx] = split
    return sum(splits, [])

tokenize("This is not a token ϶.")

['This', 'Ġis', 'Ġnot', 'Ġa', 'Ġto', 'ken', 'Ġ', 'Ï', '¶', '.']