In [None]:
from transformers import AutoTokenizer

# Hugging Face에서 사전학습된 모델에 맞는 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

# 입력 텍스트 토크나이즈
text = "This is a test sentence."
tokenized = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
print(tokenized)

In [1]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)
Installing collected packages: huggingface-hub, tokenizers
Successfully installed huggingface-hub-0.27.1 tokenizers-0.21.0
[0m

In [4]:
import sys, os
#sys.path.append("../src")
#import info

from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.decoders import BPEDecoder
from tokenizers.processors import TemplateProcessing

files_path = corpus_files = ["/workspace/Transformer/dataset9/train.en", "/workspace/Transformer/dataset9/train.de"]

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(vocab_size=37000, special_tokens=["<s>", "</s>", "<pad>", "<unk>","##AT##-##AT##"], end_of_word_suffix="</w>", show_progress=True)
tokenizer.train(files=files_path, trainer=trainer)

# tokenizer.post_processor = TemplateProcessing(
#     single="<s> $A </s>",
#     pair="<s> $A </s> $B:1 </s>:1",
#     special_tokens=[
#         ("<s>", tokenizer.token_to_id("<s>")),
#         ("</s>", tokenizer.token_to_id("</s>")),
#     ],
# )
tokenizer.decoder = BPEDecoder()
# tokenizer.enable_padding(pad_token="<pad>", length=128)

tokenizer.save("/workspace/Transformer/NEW_Tokenizer/ende_WMT14_Tokenizer.json")

output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
print(tokenizer.decode(output.ids))




['H', 'ello</w>', ',</w>', 'y</w>', "'</w>", 'all</w>', '!</w>', 'How</w>', 'are</w>', 'you</w>', '<unk>', '?</w>']
Hello , y ' all ! How are you ?


In [6]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# 1. BPE 모델 생성
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# 2. PreTokenizer 설정
# 각 단어를 공백 기준으로 나눈 뒤 문자 단위로 분리
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# 3. 학습 설정
trainer = trainers.BpeTrainer(
    vocab_size=37000,         # 어휘 사전 크기
    min_frequency= 2,          # 최소 병합 빈도
    special_tokens=["<s>", "</s>", "<pad>", "<unk>"]  # 특수 토큰
)

# 4. 학습용 말뭉치 파일 준비
corpus_files = ["/home/user15/RNN/dataset5/newstest2014.en", "/home/user15/RNN/dataset5/newstest2014.de"]

corpus_files = ["/workspace/Transformer/dataset6/train.en"]



# 5. 토크나이저 학습
print("Training BPE Tokenizer...")
tokenizer.train(corpus_files, trainer)


# 6. 학습된 토크나이저 저장 (JSON 형식)
tokenizer.save("./bpe_tokenizer2/tokenizer.en.json")
print("Tokenizer saved to ./bpe_tokenizer2/tokenizer.en.json")




Training BPE Tokenizer...



Tokenizer saved to ./bpe_tokenizer2/tokenizer.en.json


In [1]:
import json

# 저장된 JSON 파일 로드
with open("./bpe_tokenizer/tokenizer.json", "r") as f:
    tokenizer_data = json.load(f)

# Vocab에서 <pad> 토큰 확인
pad_index = tokenizer_data["model"]["vocab"].get("<pad>")
print(f"Pad token index: {pad_index}")

Pad token index: 2


In [7]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# 9. 저장된 토크나이저 불러오기
print("\nLoading saved tokenizer...")
loaded_tokenizer = Tokenizer.from_file("./bpe_tokenizer/tokenizer.json")

# 10. 불러온 토크나이저로 테스트

encoded_loaded = loaded_tokenizer.encode(text)

# 11. 디코딩 테스트
decoded = loaded_tokenizer.decode(encoded_loaded.ids)
decoded = decoded # .replace("Ġ", " ") # .strip()
print("Decoded text:", decoded)


Loading saved tokenizer...

Testing loaded tokenizer:
Input text: hello world
Token IDs: [577, 13137, 1263]
Tokens: ['Ġhe', 'llo', 'Ġworld']
Decoded text: Ġhe llo Ġworld


In [None]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# 1. 학습용 말뭉치 파일 준비 (예: corpus.txt)
# corpus.txt 파일은 한 줄에 하나의 문장 등으로 구성된 텍스트 데이터를 담고 있어야 합니다.
# 아래는 예시 경로이며, 실제 파일 경로로 변경하세요.
corpus_files = ["corpus.txt"]

# 2. ByteLevel BPE 토크나이저 객체 생성
tokenizer = ByteLevelBPETokenizer()

# 3. 토크나이저 학습 수행
# vocab_size: 어휘 사전 크기 지정
# min_frequency: 단어 병합 최소 빈도
# special_tokens: 모델에 필요한 특수 토큰 지정
tokenizer.train(files=corpus_files, vocab_size=32000, min_frequency=2, special_tokens=[
    "<s>",
    "</s>",
    "<pad>",
    "<unk>"
])

# 4. 학습된 토크나이저를 저장
tokenizer.save_model("./bpe_tokenizer")

# 5. 저장된 토크나이저 로드 (추후 재사용을 위해)
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(
    "./bpe_tokenizer/vocab.json",
    "./bpe_tokenizer/merges.txt",
)

# 6. 특수 토큰 처리 (옵션)
#   BERT와 호환되는 형식으로 인풋을 만들기 위해 Processors 설정 (여기서는 <s>, </s> 사용)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>"))
)
tokenizer.enable_truncation(max_length=128)

# 7. 토큰화 예제
text = "이 문장은 번역 모델 테스트를 위한 예제 문장입니다."

# 토큰화
encoded = tokenizer.encode(text)
print("토큰 ID:", encoded.ids)
print("토큰 리스트:", encoded.tokens)

# 8. 디코딩 (복원)
decoded = tokenizer.decode(encoded.ids)
print("복원된 문장:", decoded)