In [None]:
from tokenizers import Tokenizer
from tokenizers.decoders import WordPiece as WordPieceDecoder
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from transformers import PreTrainedTokenizerFast

In [ ]:
# Train a small WordPiece tokenizer on an in-memory corpus, wrap it for use
# with Hugging Face transformers, and save it to the 'custom_tokenizer' folder.

# Special tokens are declared once so the trainer and the Hugging Face wrapper
# cannot drift apart. Insertion order is preserved, so the trainer receives
# them in the order [UNK], [CLS], [SEP], [PAD], [MASK].
special_tokens = {
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "mask_token": "[MASK]",
}

# Initialize a WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))

# Set a pre-tokenizer (splits on whitespace and separates punctuation from words)
tokenizer.pre_tokenizer = Whitespace()

# Set the matching decoder so decode() merges "##" continuation pieces back
# into whole words instead of leaving the markers in the output
tokenizer.decoder = WordPieceDecoder(prefix="##")

# Define a trainer for building the vocabulary
trainer = WordPieceTrainer(
    vocab_size=5000,  # Adjust size to your needs
    special_tokens=list(special_tokens.values()),
)

# Train the tokenizer on your text data
texts = [
    "This is a sample text.",
    "Creating a custom tokenizer can be useful.",
    "We are defining our own vocabulary!",
]

# Tokenizer.train expects text files, so write the texts to a temporary file.
# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's default encoding.
with open("data.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts))

tokenizer.train(files=["data.txt"], trainer=trainer)

# Wrap the tokenizer for use in Hugging Face. The special tokens must be
# declared here as well: the wrapper does not pick them up from the trained
# tokenizer, and without a pad token any call with padding enabled raises
# a ValueError.
hf_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_tokens)

# Save as Hugging Face tokenizer
hf_tokenizer.save_pretrained('custom_tokenizer')

In [ ]:
# Round-trip a sample sentence: encode it to token IDs, then decode the IDs
# back into text.
example_text = "I am learning how to tokenize text!"

# Tokenize the text. A single sentence with no length limit needs neither
# padding nor truncation, so both options are omitted: padding=True raises a
# ValueError when the tokenizer has no pad token configured, and
# truncation=True without a max_length only triggers a warning.
# return_tensors="pt" returns PyTorch tensors, so torch must be installed.
encoded = hf_tokenizer(example_text, return_tensors="pt")

# Print the tokenized output
print(encoded)

# The returned tensors carry a leading batch dimension; index 0 selects the
# IDs of the only sentence that was encoded.
decoded_text = hf_tokenizer.decode(encoded['input_ids'][0])
print(decoded_text)