In [1]:
import os

# -----------------------
# Configuration
# -----------------------
corpus_file = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/corpus/mergedcorpus.txt"   # your input text file (one transcript per line)
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe"          # where to save tokenizer files
vocab_size = 10000                  # adjust (8k–15k typical for Tibetan ASR)

os.makedirs(output_dir, exist_ok=True)



In [2]:
#
# train_bpe_tokenizer.py
#

import os
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Sequence, Whitespace, CharDelimiterSplit, Split

print(f"✅ Created a sample corpus file at: {corpus_file}")

files = [corpus_file]

# --- 2. Initialize the Tokenizer ---
# We start with an empty BPE model.
# The <unk> token is used for out-of-vocabulary words.
tokenizer = Tokenizer(BPE(unk_token="<|endoftext|>"))

# --- 3. Configure the Trainer ---
# The BpeTrainer will learn the merge rules from our data.
# vocab_size: The total number of tokens we want in our vocabulary.
# special_tokens: A list of tokens that have special meaning for the model.
trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=1000)

# --- 4. Set a Pre-tokenizer ---
# The pre-tokenizer splits the input text into initial words.
# Splitting by whitespace is a common first step.


tokenizer.pre_tokenizer = Whitespace()  # keep syllable+tsheg intact
#tokenizer.pre_tokenizer = Sequence([Whitespace(), CharDelimiterSplit("་")])
# Pre-tokenizer: whitespace + tsheg + shad
#tokenizer.pre_tokenizer = Sequence([
#    Whitespace(),
#    Split("་", behavior="isolated"),   # keep tsheg as its own token
#    Split("།", behavior="isolated")    # keep shad as its own token
#])
# This is the corrected pre-tokenizer from our previous discussion to merge "བྱང་ཆུབ་"
#tokenizer.pre_tokenizer = Sequence([
#    Whitespace(), 
#    Split('་', behavior='merged_with_previous')
#])
# --- 5. Train the Tokenizer ---
print("🚀 Starting tokenizer training...")
tokenizer.train(files, trainer)
print("✅ Training complete!")

# --- 6. Save the Tokenizer ---
# The trained tokenizer is saved to a single JSON file.
output_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe/bpe_tokenizer.json"
tokenizer.save(output_path)
print(f"💾 Tokenizer saved to: {output_path}")



✅ Created a sample corpus file at: /home/gangagyatso/Desktop/stt-bpe-trainer/data/corpus/mergedcorpus.txt
🚀 Starting tokenizer training...



✅ Training complete!
💾 Tokenizer saved to: /home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe/bpe_tokenizer.json


In [3]:
from tokenizers import Tokenizer

# Load full tokenizer config
tokenizer = Tokenizer.from_file("/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe/bpe_tokenizer.json")

# Test it
output = tokenizer.encode("བདེ་ལེགས་")
print("Tokens:", output.tokens)
print("IDs:", output.ids)


Tokens: ['བདེ', '་', 'ལེགས', '་']
IDs: [234, 4, 746, 4]


In [4]:

# --- 7. Load and Test the Tokenizer ---
print("\n--- Testing the new tokenizer ---")
# Load the tokenizer from the saved file
loaded_tokenizer = Tokenizer.from_file("data/tokenizer_bpe/bpe_tokenizer.json")

# Test encoding a sentence
sentence = "བྱང་ཆུབ་ཀྱི་སེམས་རྣམ་པ་གཉིས་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་ཡོད་ན་ཡེ་ཤེས་ལྷ་ལ་འགྱུར་འགྲོ་གི་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་མེད་ན། དེ་ནས་འདི་"
output = loaded_tokenizer.encode(sentence)

print(f"Original sentence: {sentence}")
print(f"Encoded tokens (IDs): {output.ids}")
print(f"Encoded tokens (strings): {output.tokens}")
print(f"Decoded tokens: {loaded_tokenizer.decode(output.ids)}")
print(f"token len: {len(output.tokens)}")
# The BPE algorithm learned to merge "low" + "er" -> "lower" and "new" + "er" -> "newer"
sentence_2 = "བླ་མ་དགེ་བའི་བཤེས་གཉེན་སྐུ་དགེ་འདུན་གཅིག་ཡིན་ན།"
output_2 = loaded_tokenizer.encode(sentence_2)
print(f"\nOriginal sentence: {sentence_2}")
print(f"Encoded tokens (IDs): {output_2.ids}")
print(f"Encoded tokens (strings): {output_2.tokens}")
print(f"Decoded tokens: {loaded_tokenizer.decode(output_2.ids)}")
print(f"token len: {len(output_2.tokens)}")



--- Testing the new tokenizer ---
Original sentence: བྱང་ཆུབ་ཀྱི་སེམས་རྣམ་པ་གཉིས་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་ཡོད་ན་ཡེ་ཤེས་ལྷ་ལ་འགྱུར་འགྲོ་གི་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་མེད་ན། དེ་ནས་འདི་
Encoded tokens (IDs): [272, 4, 299, 4, 137, 4, 148, 4, 248, 4, 31, 4, 227, 4, 117, 4, 109, 6, 272, 4, 299, 4, 137, 4, 148, 4, 117, 4, 30, 4, 315, 4, 168, 4, 256, 4, 45, 4, 384, 4, 191, 4, 116, 4, 117, 4, 109, 6, 272, 4, 299, 4, 137, 4, 148, 4, 156, 4, 30, 6, 105, 4, 107, 4, 150, 4]
Encoded tokens (strings): ['བྱང', '་', 'ཆུབ', '་', 'ཀྱི', '་', 'སེམས', '་', 'རྣམ', '་', 'པ', '་', 'གཉིས', '་', 'ཡོད', '་', 'རེད', '།', 'བྱང', '་', 'ཆུབ', '་', 'ཀྱི', '་', 'སེམས', '་', 'ཡོད', '་', 'ན', '་', 'ཡེ', '་', 'ཤེས', '་', 'ལྷ', '་', 'ལ', '་', 'འགྱུར', '་', 'འགྲོ', '་', 'གི', '་', 'ཡོད', '་', 'རེད', '།', 'བྱང', '་', 'ཆུབ', '་', 'ཀྱི', '་', 'སེམས', '་', 'མེད', '་', 'ན', '།', 'དེ', '་', 'ནས', '་', 'འདི', '་']
Decoded tokens: བྱང ་ ཆུབ ་ ཀྱི ་ སེམས ་ རྣམ ་ པ ་ གཉིས ་ ཡོད ་ རེད ། བྱང ་ ཆུབ ་ ཀྱི ་ སེམས ་ ཡོད ་ ན ་ ཡེ ་ ཤེས ་ 

In [5]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe/bpe_tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token → id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"✅ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("⚠️ No merges found in tokenizer.json (check if it's really BPE)")


✅ Saved vocab.json to /home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe//vocab.json
✅ Saved merges.txt to /home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe//merges.txt


In [23]:
from tokenizers import CharBPETokenizer

# Initialize tokenizer
tokenizer = CharBPETokenizer(lowercase=False)

# Train tokenizer
tokenizer.train(
    files=[corpus_file],
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>", "</s>"]
)

# Save vocab + merges
tokenizer.save_model("/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe_char")

print("💾 vocab.json and merges.txt written to ./tokenizer_bpe/")





💾 vocab.json and merges.txt written to ./tokenizer_bpe/


In [None]:
merges_txt = "data/tokenizer_bpe/merges.txt"


with open(merges_txt, "r", encoding="utf-8") as f:
    merges = [line.strip() for line in f if line and not line.startswith("#")]

print("🔎 Total merge rules learned:", len(merges))

# Show first 50 merges (the most frequent pairs in your corpus)
print("\nTop 50 merge rules:\n")
for i, merge in enumerate(merges[:50], 1):
    print(f"{i:02d}. {merge}")


In [None]:
from tokenizers.implementations import CharBPETokenizer

# Load from vocab + merges
tokenizer = CharBPETokenizer(
    "data/tokenizer_bpe_char/vocab.json",
    "data/tokenizer_bpe_char/merges.txt"
)

# Test it
output = tokenizer.encode("བདེ་ལེགས་")
print("Tokens:", output.tokens)
print("IDs:", output.ids)


In [None]:
from tokenizers.implementations import CharBPETokenizer

# Load from vocab + merges
loaded_tokenizer = CharBPETokenizer(
    "data/tokenizer_bpe_char/vocab.json",
    "data/tokenizer_bpe_char/merges.txt"
)
# Test encoding a sentence
sentence = "བྱང་ཆུབ་ཀྱི་སེམས་རྣམ་པ་གཉིས་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་ཡོད་ན་ཡེ་ཤེས་ལྷ་ལ་འགྱུར་འགྲོ་གི་ཡོད་རེད། བྱང་ཆུབ་ཀྱི་སེམས་མེད་ན། དེ་ནས་འདི་"
output = loaded_tokenizer.encode(sentence)

print(f"Original sentence: {sentence}")
print(f"Encoded tokens (IDs): {output.ids}")
print(f"Encoded tokens (strings): {output.tokens}")

# The BPE algorithm learned to merge "low" + "er" -> "lower" and "new" + "er" -> "newer"
sentence_2 = "བླ་མ་དགེ་བའི་བཤེས་གཉེན་སྐུ་དགེ་འདུན་གཅིག་ཡིན་ན།"
output_2 = loaded_tokenizer.encode(sentence_2)
print(f"\nOriginal sentence: {sentence_2}")
print(f"Encoded tokens (IDs): {output_2.ids}")
print(f"Encoded tokens (strings): {output_2.tokens}")

In [53]:
# training tokenizer
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['/home/gangagyatso/Desktop/stt-bpe-trainer/data/corpus/mergedcorpus.txt'],
                    min_frequency=2,
                   )






In [54]:
tokenizer.save("./data/tokenizer.json")

In [50]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token → id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"✅ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("⚠️ No merges found in tokenizer.json (check if it's really BPE)")


✅ Saved vocab.json to /home/gangagyatso/Desktop/stt-bpe-trainer/data//vocab.json
✅ Saved merges.txt to /home/gangagyatso/Desktop/stt-bpe-trainer/data//merges.txt


In [None]:
# loading tokenizer
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
old_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Tibetan", task="transcribe")
tokenizer = WhisperTokenizer(vocab_file='/home/gangagyatso/Desktop/stt-bpe-trainer/data/vocab.json',
                            merges_file='/home/gangagyatso/Desktop/stt-bpe-trainer/data/merges.txt',
                             unk_token='',
                             bos_token= '<|endoftext|>',
                             pad_token= '<|endoftext|>',
                             model_max_length = 1024,
                            language='Tibetan', task='transcribe')
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="bo", task="transcribe")

tokenizer.add_special_tokens({
    'additional_special_tokens': old_tokenizer.special_tokens_map['additional_special_tokens']
})



  


106

In [52]:
tokenizer.save_pretrained("/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer")

('/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/tokenizer_config.json',
 '/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/special_tokens_map.json',
 '/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/vocab.json',
 '/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/merges.txt',
 '/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/normalizer.json',
 '/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer/added_tokens.json')