# Merging Rules
- https://youtu.be/zduSFxRajkE?t=3463
- https://github.com/openai/gpt-2/blob/master/src/encoder.py
- https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

In [2]:
# Move the working directory up one level; later cells open files via relative
# paths such as './data/encoder.json'.
# NOTE: re-running this cell moves up one more level each time.
import os; os.chdir('..')
# Third-party `regex` is aliased to `re` on purpose: the patterns below use the
# Unicode property classes \p{L} and \p{N}, which stdlib `re` does not support.
import regex as re
from boring_utils.utils import *  # presumably where `cprint` comes from — verify

import tiktoken

```
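r"""'s|'t|'re|'ve|'m|'ll|'d| ?  Match common English contraction suffixes ('s, 't, 're, 've, 'm, 'll, 'd); the trailing " ?" is an optional leading space that belongs to the next alternative
\p{L}+                          ...followed by a run of Unicode letters (e.g. an English word)
| ?                             or an optional leading space...
\p{N}+                          ...followed by a run of Unicode numeric characters (e.g. 123; note that "3.14" is split into 3 . 14)
| ?                             or an optional leading space...
[^\s\p{L}\p{N}]+                ...followed by a run of characters that are not whitespace, letters, or digits (punctuation, symbols, etc.)
|                               or
\s+(?!\S)                       a run of whitespace that is not followed by a non-whitespace character (trailing whitespace; before a word, everything except the last whitespace character)
|                               or
\s+                             any remaining run of whitespace
"""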
```


In [3]:
# GPT-2's pre-tokenization (text splitting) pattern, assembled from its
# alternatives; they are tried left to right at each position.
# NOTE: the contraction alternatives are literal lowercase strings, so an
# upper-case contraction such as "HOW'S" is not recognised and falls apart into
# "'" and "S". Later OpenAI tokenizers reportedly match contractions
# case-insensitively (see tiktoken's cl100k_base pattern) — TODO confirm.
gpt2pat = re.compile("|".join([
    "'s", "'t", "'re", "'ve", "'m", "'ll", "'d",  # contraction suffixes
    r" ?\p{L}+",              # optional leading space + run of letters
    r" ?\p{N}+",              # optional leading space + run of numeric chars
    r" ?[^\s\p{L}\p{N}]+",    # optional leading space + run of other non-whitespace
    r"\s+(?!\S)",             # whitespace not followed by a non-whitespace char
    r"\s+",                   # any remaining whitespace
]))

In [6]:
# Try the splitter on contractions, digits, punctuation and trailing spaces.
# Things to look for in the result:
#   - "Hello've" / "how's": the lowercase contraction suffix becomes its own piece.
#   - "HOW'S": the upper-case suffix is NOT matched as a contraction, so it is
#     split into "'" and "S".
#   - "world123": letters and digits never share a piece.
#   - the trailing run of spaces is kept together as one piece.
cprint(re.findall(gpt2pat, "Hello've world123 how's HOW'S are you!!!?    "))

[93m<module> -> re.findall(gpt2pat, "Hello've world123 how's HOW'S are you!!!?    "):[0m
['Hello',
 "'ve",
 ' world',
 '123',
 ' how',
 "'s",
 ' HOW',
 "'",
 'S',
 ' are',
 ' you',
 '!!!?',
 '    ']


In [7]:
# Run the splitter on a snippet of Python source to see how indentation is handled.
# `\s+(?!\S)` takes a newline plus all indentation spaces except the last one;
# that last space is then absorbed by the optional leading " ?" of the next
# piece (e.g. '\n   ' followed by ' if').
example = """
for i in range(1, 101):
    if i % 3 == 0 and i % 5 == 0:
        print("FizzBuzz")
    elif i % 3 == 0:
        print("Fizz")
    elif i % 5 == 0:
        print("Buzz")
    else:
        print(i)
"""
cprint(re.findall(gpt2pat, example))

[93m<module> -> re.findall(gpt2pat, example):[0m
['\n',
 'for',
 ' i',
 ' in',
 ' range',
 '(',
 '1',
 ',',
 ' 101',
 '):',
 '\n   ',
 ' if',
 ' i',
 ' %',
 ' 3',
 ' ==',
 ' 0',
 ' and',
 ' i',
 ' %',
 ' 5',
 ' ==',
 ' 0',
 ':',
 '\n       ',
 ' print',
 '("',
 'FizzBuzz',
 '")',
 '\n   ',
 ' elif',
 ' i',
 ' %',
 ' 3',
 ' ==',
 ' 0',
 ':',
 '\n       ',
 ' print',
 '("',
 'Fizz',
 '")',
 '\n   ',
 ' elif',
 ' i',
 ' %',
 ' 5',
 ' ==',
 ' 0',
 ':',
 '\n       ',
 ' print',
 '("',
 'Buzz',
 '")',
 '\n   ',
 ' else',
 ':',
 '\n       ',
 ' print',
 '(',
 'i',
 ')',
 '\n']


# EDA of GPT2's tokenizer

In [8]:
import os
import json

# Load GPT-2's two tokenizer files. Both are opened with an explicit UTF-8
# encoding so decoding does not depend on the platform's default locale.

# ~equivalent to our "vocab": maps token string -> integer id
with open('./data/encoder.json', 'r', encoding="utf-8") as f:
    encoder = json.load(f)

# ~equivalent to our "merges": whitespace-separated symbols, one merge per line
with open('./data/vocab.bpe', 'r', encoding="utf-8") as f:
    bpe_data = f.read()

# Skip the first line (presumably a version header — verify against the file)
# and ignore blank lines, so the final merge is kept whether or not the file
# ends with a trailing newline.
bpe_merges = [
    tuple(merge_str.split())
    for merge_str in bpe_data.split('\n')[1:]
    if merge_str.strip()
]

In [9]:
# 50257 = 256 raw byte tokens + 50,000 merges + 1 special token
# (<|endoftext|>, which gets the last id)
cprint(len(encoder), encoder['<|endoftext|>'])

[93m<module> -> len(encoder):[0m
50257
[93m<module> -> encoder['<|endoftext|>']:[0m
50256
