In [1]:
from datasets import load_dataset, load_metric
import os
import sentencepiece as spm

In [2]:
# Load only the first 1% of each dataset's training split.
translation_dataset = load_dataset(
    "wmt14",
    "de-en",
    split="train[:1%]",
)
# English-only example corpus; replace with a multilingual dataset.
sentiment_dataset = load_dataset(
    "amazon_polarity",
    split="train[:1%]",
)

In [3]:

def extract_text_for_tokenizer(dataset, text_columns, sample_size=10000, translation_key=None):
    """Collect one space-joined training string per example.

    Args:
        dataset: Iterable of mapping-like examples.
        text_columns: Column names whose values are joined, in the given
            order, with a single space.
        sample_size: Maximum number of examples read from the start of
            ``dataset``. ``None`` reads every example.
        translation_key: Optional name of a nested mapping inside each
            example. When it is set and present in the example, the columns
            are looked up inside that nested mapping; otherwise they are
            looked up on the example itself.

    Returns:
        list[str]: One string per example read. Columns that are missing or
        hold ``None`` are skipped, so an example with no usable column
        contributes an empty string.
    """
    texts = []
    for i, example in enumerate(dataset):
        if sample_size is not None and i >= sample_size:
            break
        # Pick the mapping that actually holds the text columns.
        if translation_key and translation_key in example:
            record = example[translation_key]
        else:
            record = example
        # Same tolerant lookup for nested and flat examples: a missing column
        # (or a None value, which str.join would reject) is skipped.
        parts = [
            record[col]
            for col in text_columns
            if col in record and record[col] is not None
        ]
        texts.append(" ".join(parts))
    return texts

# Translation examples: the sentences sit inside the nested "translation"
# mapping, and only the English ("en") side is sampled.
translation_texts = extract_text_for_tokenizer(
    translation_dataset,
    ["en"],
    sample_size=10000,
    translation_key="translation",
)

# Sentiment examples: the text is read from the top-level "content" column,
# so no translation_key is passed.
sentiment_texts = extract_text_for_tokenizer(
    sentiment_dataset,
    ["content"],
    sample_size=10000,
)

# Combined corpus: translation texts first, then sentiment texts.
all_texts = [*translation_texts, *sentiment_texts]

In [4]:
# Baseline tokenizer: byte-level BPE trained on the combined corpus.
from tokenizers import ByteLevelBPETokenizer

bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train_from_iterator(
    all_texts,
    vocab_size=32000,
    min_frequency=2,
    special_tokens=["<s>", "</s>", "<pad>", "<unk>"],
)

# Persist the trained model; the list of written files is the cell's output.
os.makedirs("tokenizers/bpe", exist_ok=True)
bpe_tokenizer.save_model("tokenizers/bpe")







['tokenizers/bpe/vocab.json', 'tokenizers/bpe/merges.txt']

In [5]:
# Train SentencePiece (Unigram) Tokenizer

# Write the corpus to a plain-text file, one entry of all_texts per write,
# which is the file passed to the trainer via --input below.
os.makedirs("sp_data", exist_ok=True)
with open("sp_data/corpus.txt", "w", encoding="utf-8") as f:
    for line in all_texts:
        f.write(line + "\n")

sp_model_prefix = "tokenizers/sp_unigram"
# The trainer output is written under this prefix. Create its parent
# directory here so the cell does not depend on another cell having
# already created "tokenizers/".
os.makedirs(os.path.dirname(sp_model_prefix), exist_ok=True)

# Special-token ids are pinned explicitly: unk=0, pad=1, bos=2, eos=3.
spm.SentencePieceTrainer.Train(
    f"--input=sp_data/corpus.txt --model_prefix={sp_model_prefix} "
    "--vocab_size=16000 --model_type=unigram --character_coverage=1.0 "
    "--unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3"
)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=sp_data/corpus.txt --model_prefix=tokenizers/sp_unigram --vocab_size=16000 --model_type=unigram --character_coverage=1.0 --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: sp_data/corpus.txt
  input_format: 
  model_prefix: tokenizers/sp_unigram
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed

In [7]:
# WordPiece training

from tokenizers import BertWordPieceTokenizer

# Special tokens reserved in the WordPiece vocabulary.
wp_training_special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

wp_tokenizer = BertWordPieceTokenizer(lowercase=True)
wp_tokenizer.train_from_iterator(
    all_texts,
    vocab_size=32000,
    limit_alphabet=1000,
    special_tokens=wp_training_special_tokens,
)

# Persist the vocabulary; the list of written files is the cell's output.
os.makedirs("tokenizers/wp", exist_ok=True)
wp_tokenizer.save_model("tokenizers/wp")






['tokenizers/wp/vocab.txt']

In [8]:
# Convert all to Hugging Face format

from transformers import PreTrainedTokenizerFast
from transformers import BertTokenizerFast
from transformers import T5Tokenizer, T5TokenizerFast


# Reload the trained byte-level BPE model from its saved vocab/merges files.
bpe_tokenizer = ByteLevelBPETokenizer("tokenizers/bpe/vocab.json", "tokenizers/bpe/merges.txt")

# PreTrainedTokenizerFast documents `tokenizer_object` as a plain
# `tokenizers.Tokenizer`. ByteLevelBPETokenizer is a convenience wrapper, so
# rebuild a plain Tokenizer from its serialized form before wrapping it.
from tokenizers import Tokenizer

bpe_backend = Tokenizer.from_str(bpe_tokenizer.to_str())

# Wrap it with PreTrainedTokenizerFast.
# "<s>" and "</s>" are in the special tokens the BPE model was trained with,
# so declare them as bos/eos; otherwise the wrapper does not know about them.
# NOTE(review): "<cls>", "<sep>" and "<mask>" are NOT among the trained special
# tokens ("<s>", "</s>", "<pad>", "<unk>"). They are kept for compatibility;
# confirm they are needed, or add them to the BPE training special_tokens.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe_backend,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>"
)
wrapped_tokenizer.save_pretrained("tokenizers/bpe")



# Step 1: open the trained SentencePiece model with the slow T5Tokenizer
# and save it in Hugging Face format.
slow_tokenizer = T5Tokenizer(
    vocab_file="tokenizers/sp_unigram.model",
    extra_ids=0,
)
slow_tokenizer.save_pretrained("tokenizers/sp_unigram_slow")

# Step 2: reload that saved directory as a T5TokenizerFast (slow -> fast)
# and save the fast tokenizer to its own directory.
fast_tokenizer = T5TokenizerFast.from_pretrained("tokenizers/sp_unigram_slow")
fast_tokenizer.save_pretrained("tokenizers/sp_unigram_hf")



# Special-token roles for the WordPiece vocabulary, passed as keyword arguments.
bert_special_token_kwargs = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}

# Build the fast BERT tokenizer from the saved vocab file and store it in
# Hugging Face format; the tuple of written files is the cell's output.
wordpiece_tokenizer = BertTokenizerFast(
    vocab_file="tokenizers/wp/vocab.txt",
    **bert_special_token_kwargs,
)
wordpiece_tokenizer.save_pretrained("tokenizers/wp")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


('tokenizers/wp/tokenizer_config.json',
 'tokenizers/wp/special_tokens_map.json',
 'tokenizers/wp/vocab.txt',
 'tokenizers/wp/added_tokens.json',
 'tokenizers/wp/tokenizer.json')