In [1]:
import os
import sys

# Put the current working directory on sys.path so that modules stored there
# can be imported. A notebook kernel does not define `__file__`, so the working
# directory is used as the base location.
current_dir = os.getcwd()
if current_dir not in sys.path:
    # Append only once so re-running this cell does not add duplicate entries.
    sys.path.append(current_dir)

In [2]:
import re

def tokenize(text):
    """
    Break a string into word and punctuation tokens.

    The text is split on any of the characters , . : ; ? _ ! " ( ) '
    on a double dash (--), or on a single whitespace character. The
    separators are captured, so punctuation survives as its own token.
    Pieces that are empty or consist only of whitespace are discarded.

    Args:
        text: the string to tokenize.

    Returns:
        A list of token strings in their original order.
    """
    # The capturing group makes re.split return the separators as well.
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

    kept = []
    for piece in pieces:
        # strip() is only used as the emptiness check; the piece is kept as-is.
        if piece.strip():
            kept.append(piece)
    return kept

In [3]:
def token_to_id(tokens):
    """
    Build a vocabulary that maps every unique token to an integer id.

    Regular tokens are de-duplicated, sorted, and numbered starting at 0.
    The special tokens "<|endoftext|>" and "<|unk|>" always come last, in
    that order, so the ids are contiguous from 0 to len(vocab) - 1.

    Args:
        tokens: iterable of hashable token strings (duplicates allowed).

    Returns:
        dict mapping token -> id.
    """
    special_tokens = ["<|endoftext|>", "<|unk|>"]

    # Remove specials that already occur in the input before appending them;
    # otherwise they would be listed twice and the id range would have holes.
    regular_tokens = sorted(set(tokens) - set(special_tokens))
    all_tokens = regular_tokens + special_tokens

    return {token: idx for idx, token in enumerate(all_tokens)}

In [4]:
# Smoke test: run a short sentence through tokenize() and token_to_id().
text = "Hello World. This is a test."

testToken = tokenize(text)
testVocab = token_to_id(testToken)

# NOTE: the label reads "character count", but the value printed is the token list itself.
print(f"字元數:{testToken}")

# Each item is a (token, id) pair: idx[0] is the token and idx[1] is its id.
# `i` is the enumeration position of the pair.
for i, idx in enumerate(testVocab.items()):
    print(f"詞彙表索引: {i}, 詞彙: {idx[0]}, ID: {idx[1]}")

字元數:['Hello', 'World', '.', 'This', 'is', 'a', 'test', '.']
詞彙表索引: 0, 詞彙: ., ID: 0
詞彙表索引: 1, 詞彙: Hello, ID: 1
詞彙表索引: 2, 詞彙: This, ID: 2
詞彙表索引: 3, 詞彙: World, ID: 3
詞彙表索引: 4, 詞彙: a, ID: 4
詞彙表索引: 5, 詞彙: is, ID: 5
詞彙表索引: 6, 詞彙: test, ID: 6
詞彙表索引: 7, 詞彙: <|endoftext|>, ID: 7
詞彙表索引: 8, 詞彙: <|unk|>, ID: 8


In [5]:
# Relative path: resolved against the kernel's current working directory.
file_path = 'the-verdict.txt'

# Read the whole file into memory as one UTF-8 string.
with open(file_path, "r", encoding= "utf-8") as f:
    rawText = f.read()
    
# Tokenize the full text, then build the token -> id vocabulary from it.
preprocessedText = tokenize(rawText)

vocab = token_to_id(preprocessedText)

# First line: number of characters in the raw text.
# Second line: number of tokens produced by tokenize() (the label says "characters").
# Third line: number of vocabulary entries, including the two special tokens.
print(f"字元數: {len(rawText)}")
print(f"預處理後的字元數: {len(preprocessedText)}")
print(f"詞彙表大小: {len(vocab)}")

字元數: 20479
預處理後的字元數: 4690
詞彙表大小: 1132


In [None]:
from tokenizer import SimpleTokenizer
import tiktoken

# Compare the project's SimpleTokenizer (built from `vocab`) with tiktoken's
# "gpt2" encoding on the same input string.
tokenizer = SimpleTokenizer(vocab)
tiktokenizer = tiktoken.get_encoding("gpt2")

# NOTE(review): `text` is assigned here but not used in the rest of this cell.
text = """
        "It's the last he painted, you know,"
        Mrs. Gisburn said with pardonable pride.
        """
text1 = "Hello, do you like tea?"   
# NOTE(review): "snlit" may be a typo for "sunlit", or a deliberate unknown word -- confirm.
text2 = "In the snlit terraces of the palace."
# Join the two samples with the end-of-text marker between them.
text_merged = " <|endoftext|> ".join((text1, text2))
print(text_merged)

# Round trip through SimpleTokenizer: encode to ids, then decode back to text.
# SimpleTokenizer is defined in the separate `tokenizer` module, not shown here.
ids = tokenizer.encode(text_merged)

print(ids)
print(tokenizer.decode(ids))
print("=================================")

# Same round trip with tiktoken; `ids` is reused for the new result.
# allowed_special permits "<|endoftext|>" to appear in the input text.
ids = tiktokenizer.encode(text_merged, allowed_special= {"<|endoftext|>"})

print(ids)
print(tiktokenizer.decode(ids))

Hello, do you like tea? <|endoftext|> In the snlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 1131, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the <|unk|> terraces of the <|unk|>.
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 3013, 18250, 8812, 2114, 286, 262, 20562, 13]
Hello, do you like tea? <|endoftext|> In the snlit terraces of the palace.
