In [None]:
import sentencepiece as spm

In [None]:
import sentencepiece as spm
import os

# Configuration for training the SentencePiece model
# SentencePiece allows for subword tokenization, which helps handle out-of-vocabulary words.
options = {
    # The source text file used for learning the vocabulary
    'input': 'train_data.txt',
    # The base name for the generated model (.model) and vocabulary (.vocab) files
    'model_prefix': 'bpe_model',
    # Number of unique tokens in the final vocabulary
    'vocab_size': 4000,
    # 'bpe' (Byte Pair Encoding) merges frequent pairs of characters/sequences
    'model_type': 'bpe',
    # Percentage of characters covered by the model; 0.9995 is standard for languages with large character sets
    'character_coverage': 0.9995,
    # When enabled, unknown characters are decomposed into UTF-8 bytes to avoid 'unk' tokens
    'byte_fallback': True,
    # Treats digits individually (0-9), preventing large numbers from being treated as single tokens
    'split_digits': True,
    # Prevents adding a whitespace prefix to the first token; useful for fine-grained control
    'add_dummy_prefix': False
}

try:
    print("Starting the training process...")
    # SentencePieceTrainer.train takes the dictionary of options to build the BPE model
    spm.SentencePieceTrainer.train(**options)
    print("Training complete. 'bpe_model.model' and 'bpe_model.vocab' have been created.")

    # Initialize the processor and load the newly trained model
    sp = spm.SentencePieceProcessor()
    sp.load('bpe_model.model')
    
    print("-" * 30)
    print("Model Metadata:")
    # Retrieve the total number of tokens in the vocabulary
    print(f"Total Vocab Size: {sp.get_piece_size()}")
    # Special tokens are used for sequence boundaries and handling unknown characters
    print(f"BOS (Beginning of Sentence) ID: {sp.bos_id()}")
    print(f"EOS (End of Sentence) ID:       {sp.eos_id()}")
    print(f"UNK (Unknown) ID:              {sp.unk_id()}")
    print(f"PAD (Padding) ID:              {sp.pad_id()}")
    
    # Test the tokenizer on sample strings to see how it breaks down text
    test_sentences = [
        'Hello World! 1234567890', 
        'This blog is the most uninstagrammable blog ever'
    ]

    for text in test_sentences:
        print("\n--- Tokenization Test ---")
        print(f"Original Text: {text}")
        # encode_as_pieces: shows the actual subword units (tokens)
        print(f"Subword Tokens: {sp.encode_as_pieces(text)}")
        # encode_as_ids: shows the numerical mapping for each token
        print(f"Numerical IDs:  {sp.encode_as_ids(text)}")

except Exception as e:
    print(f"An error occurred during training or processing: {e}")

In [9]:
print("\n" + "="*40 + "\n")
print("Training Unigram Model (SentencePiece Default)")
# Note: SentencePiece doesn't strictly have a 'wordpiece' model_type. 
# It supports 'unigram', 'bpe', 'char', and 'word'.
# 'Unigram' is the default and usually recommended over BPE in SentencePiece.
# Configuration for the Unigram model training.
# Unigram is a probabilistic subword tokenization method that starts with a large vocabulary 
# and iteratively removes tokens that minimize the loss of the likelihood of the training data.
options_unigram = {
    'input': 'train_data.txt',        # Path to the raw text file for training
    'model_prefix': 'unigram_model',  # Prefix for the output .model and .vocab files
    'vocab_size': 1200,               # Desired size of the final vocabulary
    'model_type': 'unigram',          # Specifies the Unigram language model algorithm
    'character_coverage': 0.9995,     # Percentage of characters covered by the model (0.9995 is standard for Latin scripts)
    'byte_fallback': True,            # Enables mapping unknown characters to UTF-8 bytes to avoid <unk> tokens
    'split_digits': True,             # Treats each digit as an individual token (useful for numerical data)
    'add_dummy_prefix': False         # Prevents adding a leading space (SentencePiece default is True)
}

try:
    # 1. Train the SentencePiece model using the defined options
    print("Starting Unigram training...")
    spm.SentencePieceTrainer.train(**options_unigram)
    print("Training complete. 'unigram_model.model' created.")

    # 2. Load the trained model into a processor instance for inference
    sp_unigram = spm.SentencePieceProcessor()
    sp_unigram.load('unigram_model.model')
    
    print("-" * 30)
    print("Unigram Model Metadata:")
    print(f"Total Vocab Size: {sp_unigram.get_piece_size()}")
    
    # 3. Define test cases to evaluate how the model handles common and rare words
    test_sentences = [
        'Hello World! 1234567890', 
        'This blog is the most uninstagrammable blog ever'
    ]

    # 4. Iterate through test sentences to visualize subword segmentation
    for text in test_sentences:
        print("\n--- Tokenization Test (Unigram) ---")
        print(f"Original Text: {text}")
        # encode_as_pieces: Converts text into subword strings (visual representation)
        print(f"Subword Tokens: {sp_unigram.encode_as_pieces(text)}")
        # encode_as_ids: Converts text into numerical indices for model input
        print(f"Numerical IDs:  {sp_unigram.encode_as_ids(text)}")

except Exception as e:
    # Handle potential errors during training or loading (e.g., missing input file)
    print(f"An error occurred with Unigram model: {e}")





Training Unigram Model (SentencePiece Default)
Starting Unigram training...
Training complete. 'unigram_model.model' created.
------------------------------
Unigram Model Metadata:
Total Vocab Size: 1200

--- Tokenization Test (Unigram) ---
Original Text: Hello World! 1234567890
Subword Tokens: ['<0x48>', 'e', 'll', 'o', '▁', 'W', 'or', 'ld', '<0x21>', '▁', '1', '2', '3', '4', '5', '<0x36>', '7', '8', '<0x39>', '0']
Numerical IDs:  [75, 268, 363, 340, 259, 473, 380, 1020, 36, 259, 283, 277, 536, 323, 348, 57, 316, 319, 60, 311]

--- Tokenization Test (Unigram) ---
Original Text: This blog is the most uninstagrammable blog ever
Subword Tokens: ['T', 'his', '▁b', 'l', 'o', 'g', '▁is', '▁the', '▁m', 'o', 'st', '▁un', 'in', 'sta', 'g', 'ra', 'm', 'm', 'able', '▁b', 'l', 'o', 'g', '▁', 'e', 'ver']
Numerical IDs:  [346, 1121, 344, 381, 340, 365, 293, 269, 811, 340, 562, 754, 815, 828, 365, 678, 388, 388, 412, 344, 381, 340, 365, 259, 268, 1128]


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: train_data.txt
  input_format: 
  model_prefix: unigram_model
  model_type: UNIGRAM
  vocab_size: 1200
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

try:
    # 1. Initialize the WordPiece Tokenizer
    # We specify the [UNK] token for handling words not found in the vocabulary.
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    
    # 2. Configure Pre-tokenization
    # Before the subword algorithm runs, we need to split the raw text into words.
    # Whitespace splitting is the standard first step for most English NLP tasks.
    tokenizer.pre_tokenizer = Whitespace()
    
    # 3. Initialize the Trainer
    # We define our target vocabulary size and the special tokens required for 
    # downstream tasks (like BERT's [CLS] for classification or [SEP] for separators).
    trainer = WordPieceTrainer(
        vocab_size=4000, 
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    
    # 4. Train the Model
    # The tokenizer scans the training file to build a vocabulary of the most 
    # frequent subword units.
    tokenizer.train(files=["train_data.txt"], trainer=trainer)
    
    # 5. Persist the Model
    # Save the configuration and vocabulary to a JSON file for future inference.
    tokenizer.save("wordpiece.json")
    print("Training complete. 'wordpiece.json' created.")
    
    # 6. Metadata Inspection
    print("-" * 30)
    print("WordPiece Model Metadata:")
    print(f"Total Vocab Size: {tokenizer.get_vocab_size()}")
    
    # 7. Testing Subword Tokenization
    # WordPiece shines at handling rare words by breaking them into meaningful chunks.
    test_sentences = [
        'Hello World! 1234567890', 
        'This blog is the most uninstagrammable blog ever'
    ]

    for text in test_sentences:
        print("\n--- Tokenization Test (WordPiece) ---")
        print(f"Original Text: {text}")
        
        # Encode converts raw text into a Tokenizer object containing tokens and IDs
        output = tokenizer.encode(text)
        
        # 'tokens' shows the subword breakdown (e.g., 'un', '##insta', etc.)
        print(f"Subword Tokens: {output.tokens}")
        # 'ids' are the numerical indices mapped to the vocabulary
        print(f"Numerical IDs:  {output.ids}")

except Exception as e:
    print(f"An error occurred with WordPiece model: {e}")




Training complete. 'wordpiece.json' created.
------------------------------
WordPiece Model Metadata:
Total Vocab Size: 2609

--- Tokenization Test (WordPiece) ---
Original Text: Hello World! 1234567890
Subword Tokens: ['H', '##el', '##lo', 'W', '##or', '##ld', '[UNK]', '[UNK]']
Numerical IDs:  [37, 180, 214, 52, 162, 418, 0, 0]

--- Tokenization Test (WordPiece) ---
Original Text: This blog is the most uninstagrammable blog ever
Subword Tokens: ['This', 'b', '##lo', '##g', 'is', 'the', 'm', '##os', '##t', 'un', '##ins', '##ta', '##g', '##ra', '##m', '##ma', '##ble', 'b', '##lo', '##g', 'ever']
Numerical IDs:  [691, 58, 214, 102, 248, 194, 69, 660, 96, 875, 350, 209, 102, 155, 108, 173, 510, 58, 214, 102, 1240]
