# 草稿1

In [4]:
chr(0)

'\x00'

In [5]:
print(chr(0))

 


In [6]:
chr(0).__repr__()

"'\\x00'"

In [7]:
'this is a test' + chr(0) + 'string'

'this is a test\x00string'

In [8]:
print('this is a test' + chr(0) + 'string')

this is a test string


In [4]:
test_string = "hello! こんにちは!"
utf8_encoded = test_string.encode("utf-8")
print(utf8_encoded)
print(utf8_encoded.decode("utf-8"))
print(type(utf8_encoded))

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'
hello! こんにちは!
<class 'bytes'>


In [7]:
# Walk the string by character position and print each character next to the
# byte stored at that same position in the UTF-8 encoding.
# NOTE: the positions only line up while every character encodes to a single
# byte; after the first multi-byte character the two columns no longer
# describe the same character.
for i, char in enumerate(test_string):
    print(char, utf8_encoded[i])

# The complete byte sequence as integers, for comparison.
print(list(utf8_encoded))

h 104
e 101
l 108
l 108
o 111
! 33
  32
こ 227
ん 129
に 147
ち 227
は 130
! 147
[104, 101, 108, 108, 111, 33, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 33]


In [24]:
print(len(test_string))
print(len(utf8_encoded))

13
23


In [103]:
max([("A", "B"), ("A", "C"), ("B", "ZZ"), ("BA", "A")])

('BA', 'A')

In [None]:
import regex as re
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
re.findall(PAT, "some text that i'll pre-tokenize")

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

In [None]:
# 下载数据并保存
from datasets import load_dataset
ds = load_dataset("roneneldan/TinyStories")
ds.save_to_disk('../data/TinyStories')

In [None]:
# 加载数据
from datasets import load_from_disk
dataset = load_from_disk('../data/TinyStories')

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

# 草稿2：课程资料上的代码

In [None]:
import os 
import sys
from abc import ABC
import regex as re
from collections import defaultdict
import psutil
from typing import BinaryIO

In [129]:
class Tokenizer(ABC):
    """Base class that fixes the tokenizer interface: `encode()` and `decode()`.

    Both methods raise NotImplementedError here, so a subclass has to
    override them to be usable.
    """
    def encode(self, string: str) -> list[int]:
        """Convert `string` into a list of integer token indices."""
        raise NotImplementedError

    def decode(self, indices: list[int]) -> str:
        """Convert a list of integer token indices back into a string."""
        raise NotImplementedError

class BPETokenizerParams:
    """Everything that is needed to build a BPE tokenizer.

    With `vocab` and `merges` in hand, a BPE tokenizer can be constructed.

    Attributes:
        vocab: Maps a token index to the bytes that token stands for.
        merges: Maps a pair of token indices (index1, index2) to the index of
            the token produced by merging them.
    """
    vocab: dict[int, bytes]
    merges: dict[tuple[int, int], int]

    def __init__(self, vocab: dict[int, bytes], merges: dict[tuple[int, int], int]):
        # Store both tables as given; no copy is made.
        self.vocab, self.merges = vocab, merges

def merge(indices: list[int], pair: tuple[int, int], new_index: int) -> list[int]:
    """Replace every occurrence of `pair` in `indices` with `new_index`.

    The sequence is scanned left to right and matches do not overlap: once two
    adjacent tokens have been merged, the second one cannot start another match.

    Args:
        indices: Token index sequence to rewrite (not modified).
        pair: The two adjacent token indices to look for.
        new_index: Token index that replaces each matched pair.

    Returns:
        A new list with the merged sequence.
    """
    merged = []
    skip_next = False
    for pos, current in enumerate(indices):
        if skip_next:
            # This token was already consumed as the second half of a match.
            skip_next = False
            continue
        # A match needs a following token, so the last position can never start one.
        has_next = pos + 1 < len(indices)
        if has_next and current == pair[0] and indices[pos + 1] == pair[1]:
            merged.append(new_index)
            skip_next = True
        else:
            # Not the start of the pair: carry the token over unchanged.
            merged.append(current)
    return merged


In [22]:
# Worked example for merge()
indices = [1, 2, 3, 2, 3, 4, 2, 3]
pair = (2, 3)
new_index = 99

result = merge(indices, pair, new_index)
print(result)  # [1, 99, 99, 4, 99] -- all three (2, 3) occurrences are replaced

[1, 99, 99, 4, 99]


In [112]:
class BPETokenizer(Tokenizer):
    """BPE tokenizer built from a `BPETokenizerParams`; implements `encode()` and `decode()`."""

    def __init__(self, params: BPETokenizerParams):
        self.params = params

    def encode(self, string: str) -> list[int]:
        """Encode `string` by replaying every learned merge over its UTF-8 bytes."""
        token_ids = [int(byte) for byte in string.encode("utf-8")]
        # Note: this is a very slow implementation -- each merge rescans the whole sequence.
        for pair, new_index in self.params.merges.items():
            token_ids = merge(token_ids, pair, new_index)
        return token_ids

    def decode(self, indices: list[int]) -> str:
        """Decode token indices by concatenating their vocabulary bytes and reading them as UTF-8."""
        lookup = self.params.vocab.get
        pieces = [lookup(index) for index in indices]
        return b"".join(pieces).decode("utf-8")

In [111]:
def train_bpe(string: str, num_merges: int) -> BPETokenizerParams:
    """Train a byte-level BPE tokenizer on a single string.

    Args:
        string: Training text. It is encoded to UTF-8 and merges operate on
            the resulting byte values.
        num_merges: Maximum number of merges to perform. Training stops early
            when the sequence has no adjacent pair left (fewer than two tokens).

    Returns:
        BPETokenizerParams with the vocabulary (index -> bytes) and the merges
        (pair of indices -> new index), inserted in the order they were learned.
    """
    # UTF-8 bytes of the text as a list of ints (iterating bytes yields ints).
    indices = list(string.encode("utf-8"))

    # (index1, index2) -> index of the token created by merging them.
    merges: dict[tuple[int, int], int] = {}

    # The vocabulary starts with one entry per possible byte value.
    vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}

    for i in range(num_merges):
        # Count how often each adjacent token pair occurs.
        counts = defaultdict(int)
        for index1, index2 in zip(indices, indices[1:]):
            counts[(index1, index2)] += 1

        # Nothing left to merge: max() on an empty mapping would raise ValueError.
        if not counts:
            break

        # Most frequent pair. On ties max() returns the first maximal key it
        # meets, i.e. the pair that appeared first in the sequence.
        pair = max(counts, key=counts.get)
        index1, index2 = pair

        # New ids are handed out right after the 256 byte tokens.
        new_index = 256 + i
        merges[pair] = new_index

        # Adding two bytes objects concatenates them (no numeric addition).
        vocab[new_index] = vocab[index1] + vocab[index2]

        # Rewrite the sequence with the pair replaced by the new token.
        indices = merge(indices, pair, new_index)

    return BPETokenizerParams(vocab=vocab, merges=merges)

# 草稿3：实现bpe

In [2]:
import os 
import sys
from abc import ABC
import regex as re
from collections import defaultdict
import psutil
from typing import BinaryIO

def memory():
    """Print the available memory in MB and the memory usage percentage reported by psutil."""
    stats = psutil.virtual_memory()
    # `available` is converted to MB by dividing by 1024 twice.
    available_mb = stats.available / 1024 / 1024
    print(f"可用内存: {available_mb:.2f} MB")
    print(f"内存使用率: {stats.percent}%")

In [72]:
def _merge_pair(word: tuple[int, ...], pair: tuple[int, int], new_id: int) -> tuple[int, ...]:
    """Return `word` with each left-to-right, non-overlapping occurrence of `pair` replaced by `new_id`."""
    merged = []
    pos = 0
    while pos < len(word):
        # A match needs a following token, so the last position can never start one.
        if pos + 1 < len(word) and word[pos] == pair[0] and word[pos + 1] == pair[1]:
            merged.append(new_id)
            pos += 2
        else:
            merged.append(word[pos])
            pos += 1
    return tuple(merged)


def run_train_bpe(
    input_path: str | os.PathLike,
    vocab_size: int,
    special_tokens: list[str],
    **kwargs,
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Given the path to an input corpus, train a BPE tokenizer and
    output its vocabulary and merges.

    Args:
        input_path (str | os.PathLike): Path to BPE tokenizer training data (read as UTF-8 text).
        vocab_size (int): Total number of items in the tokenizer's vocabulary (including special tokens).
            Fewer items are produced when the corpus runs out of pairs to merge.
        special_tokens (list[str]): A list of string special tokens to be added to the tokenizer vocabulary.
            Each one gets its own vocabulary entry. Occurrences in the corpus act as hard
            boundaries: they are removed before pre-tokenization, so their bytes never
            take part in a merge and no merge spans across them.
        **kwargs: Accepted for interface compatibility; ignored.

    Returns:
        tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
            vocab:
                The trained tokenizer vocabulary, a mapping from int (token ID in the vocabulary)
                to bytes (token bytes)
            merges:
                BPE merges. Each list item is a tuple of bytes (<token1>, <token2>),
                representing that <token1> was merged with <token2>.
                Merges are ordered by order of creation.
    """
    # 1. Vocabulary initialisation: the 256 single-byte tokens, then the special tokens.
    vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}
    next_token_id = 256
    for special_token in special_tokens:
        # Special tokens arrive as str; the vocabulary stores bytes.
        vocab[next_token_id] = special_token.encode("utf-8")
        next_token_id += 1

    # 2. Pre-tokenization.
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read()

    # GPT-2 pre-tokenization rule. There must be no space in front of the
    # contraction alternative, otherwise "'ll", "'s", ... only match after a space.
    word_pat = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    # Cut the corpus at special tokens first. Putting them into the same
    # alternation as the word rules is not enough: a preceding space lets the
    # punctuation rule swallow the start of the token (" <|"), and an empty
    # special-token list would produce a pattern that matches the empty string.
    # Longest first so that a token which is a prefix of another cannot win.
    delimiters = sorted((tok for tok in special_tokens if tok), key=len, reverse=True)
    if delimiters:
        segments = re.split("|".join(re.escape(tok) for tok in delimiters), content)
    else:
        segments = [content]

    # Pre-token (as a tuple of token ids) -> number of occurrences.
    word_counts: dict[tuple[int, ...], int] = defaultdict(int)
    for segment in segments:
        for match in re.finditer(word_pat, segment):
            # Iterating bytes yields ints, so this is the pre-token's byte ids.
            word_counts[tuple(match.group().encode("utf-8"))] += 1

    # 3. BPE merges: one new vocabulary entry per merge until vocab_size is reached.
    #    This recounts all pairs on every round, which is simple but slow on large corpora.
    num_merges = vocab_size - next_token_id
    merges: list[tuple[bytes, bytes]] = []

    for _ in range(num_merges):
        # Adjacent-pair frequencies, weighted by how often each pre-token occurs.
        pair_counts: dict[tuple[int, int], int] = defaultdict(int)
        for word, freq in word_counts.items():
            for left, right in zip(word, word[1:]):
                pair_counts[(left, right)] += freq

        # Every pre-token is down to a single token: nothing left to merge.
        if not pair_counts:
            break

        # Most frequent pair; ties go to the lexicographically greatest pair of
        # token *bytes* (ids of merged tokens say nothing about their content).
        best = max(pair_counts, key=lambda p: (pair_counts[p], vocab[p[0]], vocab[p[1]]))
        left, right = best

        # Record the merge as bytes, as promised by the return type.
        merges.append((vocab[left], vocab[right]))

        # Adding two bytes objects concatenates them (no numeric addition).
        vocab[next_token_id] = vocab[left] + vocab[right]

        # Rewrite every pre-token with the pair replaced by the new token id.
        merged_counts: dict[tuple[int, ...], int] = defaultdict(int)
        for word, freq in word_counts.items():
            merged_counts[_merge_pair(word, best, next_token_id)] += freq
        word_counts = merged_counts

        # Only advance after the rewrite above, which still needs the current id.
        next_token_id += 1

    return vocab, merges

In [77]:
vocab, merges = run_train_bpe('../data/TinyStoriesV2-GPT4-valid.txt', 300, ['<|endoftext|>'])

In [None]:
# Scratch: rebuild the training state step by step on a small sample.

special_tokens = ['<|endoftext|>']

merges: list[tuple[bytes, bytes]] = []
vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}
next_token_id = 256
for special_token in special_tokens:
    # Special tokens arrive as str; the vocabulary stores bytes.
    vocab[next_token_id] = special_token.encode("utf-8")
    next_token_id += 1

# Only the first 10000 characters of the sample file are used.
with open('../data/text_example.txt', "r", encoding="utf-8") as f:
    content = f.read()[:10000]

# Pre-token (as a tuple of byte values) -> number of occurrences.
pre_indices = defaultdict(int)

# GPT-2 pre-tokenization rule with the special tokens as the first alternative.
# There must be no space in front of the contraction alternative, otherwise
# "'ll", "'s", ... only match when preceded by a space.
special_pat = "|".join(re.escape(token) for token in special_tokens)
PAT = rf"""{special_pat}|'(?:[sdmt]|ll|ve|re)| ?\p{{L}}+| ?\p{{N}}+| ?[^\s\p{{L}}\p{{N}}]+|\s+(?!\S)|\s+"""
pre_token_matches = re.finditer(PAT, content)

# NOTE: every match, including a matched special token, is split into its
# bytes here, so special-token bytes are counted like any other pre-token.
for pre_token_matche in pre_token_matches:
    pre_counts_key = tuple(map(int, pre_token_matche.group().encode('utf-8')))
    pre_indices[pre_counts_key] += 1


# Work on a copy so pre_indices keeps the untouched pre-token counts.
indices = pre_indices.copy()

In [None]:

# Scratch: one BPE merge step, run against the notebook-level state
# (indices, vocab, merges, next_token_id). Re-running the cell performs the next merge.
counts = defaultdict(int)


# Each key of `indices` is one pre-token, stored as a tuple of token ids.
for indice in indices:
    # Walk the adjacent token pairs (index1, index2) of this pre-token.
    for index1, index2 in zip(indice, indice[1:]):
        # indices[indice] is how often the pre-token occurs, so each of its
        # pairs is counted that many times.
        counts[(index1, index2)] += indices[indice]
print('counts:', dict(counts))
# Pick the most frequent pair.
# pair = max(counts, key=counts.get)  # not suitable: ties go to the first-inserted key; use the two steps below
# 1. Find the highest count.
max_val = max(counts.values())
# 2. Among all pairs with that count, take the largest one. max() compares the
#    tuples element by element, i.e. by token id.
pair = max([k for k, v in counts.items() if v == max_val])

# Record the merge.
# NOTE(review): `pair` holds token ids (ints), while `merges` is annotated as
# list[tuple[bytes, bytes]] where it is created -- confirm which one is intended.
merges.append(pair)
index1, index2 = pair
print('本次需要合并的 token：', pair)
print('更新前next_token_id:', next_token_id)


# Vocabulary update: adding two bytes objects concatenates them (no numeric addition).
# print(vocab[next_token_id])
vocab[next_token_id] = vocab[index1] + vocab[index2]

# NOTE: next_token_id is only incremented at the end of the cell, so this
# prints the same value as the print above.
print('更新后next_token_id:', next_token_id)
print('词表的大小：', len(vocab))


# Rebuild the pre-token table with the chosen pair replaced by the new token id.
new_indices = defaultdict(int)
for index in indices:
    index_value = indices[index]
    new_index = []
    i = 0
    while i < len(index):
        # i + 1 < len(index) makes sure a following token exists;
        # the other two comparisons test for the chosen pair at position i.
        if i + 1 < len(index) and index[i] == pair[0] and index[i + 1] == pair[1]:
            new_index.append(next_token_id)
            i += 2
        else:
            # Not the start of the pair: carry the token over unchanged.
            new_index.append(index[i])  
            i += 1
    new_indices[tuple(new_index)] = index_value

# Only now advance next_token_id: the rewrite above still needed the current value.
next_token_id += 1
indices = new_indices
vocab

counts: {(108, 111): 70, (111, 119): 70, (32, 108): 50, (119, 101): 80, (101, 114): 20, (32, 119): 30, (119, 105): 30, (105, 100): 30, (100, 101): 30, (101, 257): 90, (110, 101): 60, (101, 119): 60, (32, 110): 50, (60, 124): 10, (124, 101): 10, (101, 110): 10, (110, 100): 10, (100, 111): 10, (111, 102): 10, (102, 116): 10, (116, 101): 10, (101, 120): 10, (120, 116): 10, (116, 124): 10, (124, 62): 10}
本次需要合并的 token： (101, 257)
更新前next_token_id: 258
更新后next_token_id: 258
词表的大小： 259


{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [55]:
counts

defaultdict(int,
            {(108, 111): 70,
             (111, 119): 70,
             (32, 108): 50,
             (119, 101): 80,
             (101, 114): 20,
             (32, 119): 30,
             (119, 105): 30,
             (105, 100): 30,
             (100, 101): 30,
             (101, 115): 90,
             (115, 116): 90,
             (110, 101): 60,
             (101, 119): 60,
             (32, 110): 50,
             (60, 124): 10,
             (124, 101): 10,
             (101, 110): 10,
             (110, 100): 10,
             (100, 111): 10,
             (111, 102): 10,
             (102, 116): 10,
             (116, 101): 10,
             (101, 120): 10,
             (120, 116): 10,
             (116, 124): 10,
             (124, 62): 10})

In [None]:
pre_indices[(105, 110)]

3

In [56]:
# Toy pair -> count table with a three-way tie at the highest count.
data = {
    (108, 111): 70,
    (109, 110): 70,
    (100, 200): 60,
    (110, 105): 70,
}

# 1. Highest count present in the table.
max_val = max(data.values())

# 2. Every key whose count equals that maximum, in insertion order.
candidates = [key for key in data if data[key] == max_val]

# 3. Tuples compare element by element, so max() returns the largest key.
max_key = max(candidates)

# 4. Final (key, count) answer.
result = max_key, max_val
print(result)


((110, 105), 70)


In [58]:
max([k for k, v in data.items() if v == max_val])

(110, 105)

In [57]:
candidates

[(108, 111), (109, 110), (110, 105)]

In [176]:
memory()

可用内存: 3132.28 MB
内存使用率: 77.9%


In [213]:
counts

defaultdict(int,
            {(108, 111): 70,
             (111, 119): 70,
             (32, 108): 50,
             (119, 101): 80,
             (101, 114): 20,
             (32, 119): 30,
             (119, 105): 30,
             (105, 100): 30,
             (100, 101): 30,
             (101, 115): 90,
             (115, 116): 90,
             (110, 101): 60,
             (101, 119): 60,
             (32, 110): 50,
             (60, 124): 10,
             (124, 101): 10,
             (101, 110): 10,
             (110, 100): 10,
             (100, 111): 10,
             (111, 102): 10,
             (102, 116): 10,
             (116, 101): 10,
             (101, 120): 10,
             (120, 116): 10,
             (116, 124): 10,
             (124, 62): 10})

In [212]:
max(counts, key=counts.get)

(101, 115)