In [24]:
from tokenizer import Tokenizer

def main():
    """Train the tokenizer on ``manual.txt`` and check the encode/decode round trip.

    The corpus is read as UTF-8 and its first 100 characters are printed.
    A ``Tokenizer`` is then trained on the full text with ``vocab_size=1024``,
    the same text is encoded and decoded again, and the first 100 decoded
    characters are printed. Finally a verdict line is printed; when the decoded
    text differs from the input, a unified diff of the two texts follows.
    """
    separator = '-----------------'

    def show_preview(heading, text):
        # Heading, then the first 100 characters framed by separator lines.
        print(heading)
        print(separator)
        print(text[:100])
        print(separator)

    # Load the training corpus.
    input_path = "manual.txt"
    with open(input_path, encoding="utf-8") as corpus_file:
        original = corpus_file.read()
    show_preview("The original text is:", original)

    # Fit the tokenizer on the whole corpus.
    tok = Tokenizer()
    tok.train(original, vocab_size=1024)

    # Round trip: text -> token ids -> text.
    token_ids = tok.encode(original)
    decoded = tok.decode(token_ids)
    show_preview("The decoded text is:", decoded)

    # Compare the round-tripped text with the input.
    if decoded == original:
        print("The text decoded is the same as the origin.")
        return

    print("The text decoded is different from the origin.")
    # Show exactly where the two texts diverge, line by line.
    import difflib
    delta = difflib.unified_diff(
        original.splitlines(keepends=True),
        decoded.splitlines(keepends=True),
        fromfile="original",
        tofile="decoded",
    )
    print("".join(delta))


if __name__ == "__main__":
    main()


The original text is:
-----------------
北 京 大 学
PEKING UNIVERSITY
北 京 ⼤ 学 研 究 ⽣ ⼿ 册
（ 2 0 2 3 版 ）
北 京 ⼤ 学 研 究 ⽣ 院
2 0 2 3 年 8 ⽉
习近平对研究生教育工作作
-----------------
The decoded text is:
-----------------
北 京 大 学
PEKING UNIVERSITY
北 京 ⼤ 学 研 究 ⽣ ⼿ 册
（ 2 0 2 3 版 ）
北 京 ⼤ 学 研 究 ⽣ 院
2 0 2 3 年 8 ⽉
习近平对研究生教育工作作
-----------------
The text decoded is the same as the origin.


In [25]:
from transformers import GPT2Tokenizer
from tokenizer import Tokenizer

# English sample text that both tokenizers will encode.
sentence = "Originated as the Imperial University of Peking in 1898, Peking University was China’s first national comprehensive university and the supreme education authority at the time. Since the founding of the People’s Republic of China in 1949, it has developed into a comprehensive university with fundamental education and research in both humanities and science. The reform and opening-up of China in 1978 has ushered in a new era for the University unseen in history. And its merger with Beijing Medical University in 2000 has geared itself up for all-round and vibrant growth in such fields as science, engineering, medicine, agriculture, humanities and social sciences. Supported by the “211 Project” and the “985 Project”, the University has made remarkable achievements, such as optimizing disciplines, cultivating talents, recruiting high-caliber teachers, as well as teaching and scientific research, which paves the way for a world-class university."
print(f"Original sentence: {sentence}", "--------------------", sep="\n")

# Reference encoding: GPT-2 tokenizer loaded from the local "../gpt2" directory.
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("../gpt2")
GPT2_encoded = GPT2_tokenizer.encode(sentence)
print(f"GPT-2 Token IDs: {GPT2_encoded}", "--------------------", sep="\n")

# Read the corpus that the hand-written BPE tokenizer is trained on.
input_path = "manual.txt"
with open(input_path, encoding="utf-8") as f:
    original = f.read()

# Train the hand-written BPE tokenizer on that corpus, then encode the same sentence.
BPE_tokenizer = Tokenizer()
BPE_tokenizer.train(original, vocab_size=1024)
BPE_encoded = BPE_tokenizer.encode(sentence)
print(f"BPE Manual Token IDs: {BPE_encoded}")


Original sentence: Originated as the Imperial University of Peking in 1898, Peking University was China’s first national comprehensive university and the supreme education authority at the time. Since the founding of the People’s Republic of China in 1949, it has developed into a comprehensive university with fundamental education and research in both humanities and science. The reform and opening-up of China in 1978 has ushered in a new era for the University unseen in history. And its merger with Beijing Medical University in 2000 has geared itself up for all-round and vibrant growth in such fields as science, engineering, medicine, agriculture, humanities and social sciences. Supported by the “211 Project” and the “985 Project”, the University has made remarkable achievements, such as optimizing disciplines, cultivating talents, recruiting high-caliber teachers, as well as teaching and scientific research, which paves the way for a world-class university.
--------------------
GPT-2 

In [26]:
from transformers import GPT2Tokenizer
from tokenizer import Tokenizer

# Chinese sample text that both tokenizers will encode.
sentence = "博士学位论文应当表明作者具有独立从事科学研究工作的能力，并在科学或专门技术上做出创造性的成果。博士学位论文或摘要，应当在答辩前三个月印送有关单位，并经同行评议。学位授予单位应当聘请两位与论文有关学科的专家评阅论文，其中一位应当是外单位的专家。评阅人应当对论文写详细的学术评语，供论文答辩委员会参考。"
print(f"Original sentence: {sentence}", "--------------------", sep="\n")

# Reference encoding: GPT-2 tokenizer loaded from the local "../gpt2" directory.
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("../gpt2")
GPT2_encoded = GPT2_tokenizer.encode(sentence)
print(f"GPT-2 Token IDs: {GPT2_encoded}", "--------------------", sep="\n")

# Read the corpus that the hand-written BPE tokenizer is trained on.
input_path = "manual.txt"
with open(input_path, encoding="utf-8") as f:
    original = f.read()

# Train the hand-written BPE tokenizer on that corpus, then encode the same sentence.
BPE_tokenizer = Tokenizer()
BPE_tokenizer.train(original, vocab_size=1024)
BPE_encoded = BPE_tokenizer.encode(sentence)
print(f"BPE Manual Token IDs: {BPE_encoded}")


Original sentence: 博士学位论文应当表明作者具有独立从事科学研究工作的能力，并在科学或专门技术上做出创造性的成果。博士学位论文或摘要，应当在答辩前三个月印送有关单位，并经同行评议。学位授予单位应当聘请两位与论文有关学科的专家评阅论文，其中一位应当是外单位的专家。评阅人应当对论文写详细的学术评语，供论文答辩委员会参考。
--------------------
GPT-2 Token IDs: [39355, 248, 18803, 27764, 99, 19526, 235, 164, 106, 118, 23877, 229, 41753, 242, 37605, 241, 26193, 101, 23626, 236, 43291, 38519, 17739, 115, 17312, 231, 45379, 105, 44165, 233, 20015, 236, 12859, 233, 163, 100, 239, 27764, 99, 163, 254, 242, 163, 102, 114, 32432, 98, 43291, 21410, 47797, 121, 27950, 249, 171, 120, 234, 33176, 114, 28839, 101, 163, 100, 239, 27764, 99, 22755, 244, 10310, 241, 29785, 101, 162, 232, 222, 17312, 107, 41468, 161, 223, 248, 49035, 118, 26344, 249, 34460, 254, 45250, 100, 21410, 22755, 238, 162, 252, 250, 16764, 39355, 248, 18803, 27764, 99, 19526, 235, 164, 106, 118, 23877, 229, 22755, 244, 162, 239, 246, 17358, 223, 171, 120, 234, 41753, 242, 37605, 241, 28839, 101, 163, 18433, 164, 122, 102, 30298, 235, 49011, 10310, 103, 17312, 230, 39355, 108, 3446