In [None]:
import json
import os
from pathlib import Path
from transformers import PreTrainedTokenizerFast

# -------------------------------
# 1. Paths
# -------------------------------
WHISPER_DIR = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer"              # path to original whisper tokenizer files
TIB_BPE_DIR = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe"                       # path where your trained tibetan vocab.json + merges.txt live
OUTPUT_DIR = Path("whisper_tibetan_extended")     # save here
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer/tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token → id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"✅ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("⚠️ No merges found in tokenizer.json (check if it's really BPE)")


In [None]:
# -------------------------------
# 2. Load Whisper vocab + merges
# -------------------------------
with open(Path(WHISPER_DIR) / "vocab.json", "r", encoding="utf-8") as f:
    whisper_vocab = json.load(f)

with open(Path(WHISPER_DIR) / "merges.txt", "r", encoding="utf-8") as f:
    whisper_merges = f.read().splitlines()[1:]  # skip "#version"

print(f"📘 Whisper vocab size: {len(whisper_vocab)}")
print(f"📘 Whisper merges: {len(whisper_merges)}")

# -------------------------------
# 3. Load Tibetan vocab + merges
# -------------------------------
with open(Path(TIB_BPE_DIR) / "vocab.json", "r", encoding="utf-8") as f:
    tib_vocab = json.load(f)

with open(Path(TIB_BPE_DIR) / "merges.txt", "r", encoding="utf-8") as f:
    tib_merges = f.read().splitlines()[1:]  # skip "#version"

print(f"📗 Tibetan vocab size: {len(tib_vocab)}")
print(f"📗 Tibetan merges: {len(tib_merges)}")

In [None]:
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("openai/whisper-small")
print("Original size:", len(tok))


In [None]:
tok.save_pretrained("data/whisper_tokenizer")

In [None]:
print(tib_vocab.keys())
new_tokens = list(tib_vocab.keys())

In [None]:
tok.add_tokens([t for t in new_tokens if t not in tok.get_vocab()])
print(f"✅ Added {len(new_tokens)} tokens to tokenizer")


In [None]:

# Add returns how many were actually new
tok.add_tokens(new_tokens)

print("Extended size:", len(tok))


In [None]:
tok.save_pretrained("data/whisper_tokenizer_added_tibetan")

In [None]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token → id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"✅ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("⚠️ No merges found in tokenizer.json (check if it's really BPE)")


In [None]:
tib_merges

In [1]:
tib_merge = [
      "ད ེ",
      "ེ ད",
      "ན ས",
      "ག ས",
      "ར ེད",
      "ོ ད",
      "འ ི",
      "འ ད",
      "བ ས",
      "མ ས",
      "ྱ ི",
      "ག ི",
      "ཡ ོད",
      "ི ག",
      "བ ྱ",
      "ོ ང",
      "ི ན",
      "ོ ས",
      "ྒ ྱ",
      "ར ྒྱ",
      "ད ག",
      "ེ ར",
      "ུ ང",
      "ཅ ིག",
      "ས ྐ",
      "པ ོ",
      "ེ ས",
      "ད ང",
      "ོ ན",
      "ེ ན",
      "ུ ག",
      "ད ུ",
      "ཀ ྱི",
      "ཟ ེར",
      "ལ ྟ",
      "ས ེ",
      "ག ྲ",
      "པ འི",
      "ར ང",
      "ཡ ིན",
      "ཨ ེ",
      "བ ར",
      "འད ྲ",
      "སེ མས",
      "ོ ག",
      "འད ི",
      "མ ི",
      "ོ གས",
      "ག ཅིག",
      "་ །",
      "ུ བ",
      "མ ེད",
      "ང ས",
      "བྱ ས",
      "ཡ ང",
      "སྐ ྱ",
      "བ ཞ",
      "ཉ ི",
      "ོ ར",
      "ཅ ད",
      "བ ུ",
      "ག ྱི",
      "རྒྱ ུ",
      "ཤ ེས",
      "འ གྲ",
      "བ ླ",
      "ཐ མས",
      "ལ ས",
      "ན ང",
      "ི ང",
      "ས ྡ",
      "དག ོས",
      "གས ུང",
      "ད པ",
      "ས ྒ",
      "ར ྟ",
      "ཀ ི",
      "དུ ས",
      "ར ེ",
      "ཚ ོ",
      "བ ོ",
      "ས ྟ",
      "ུ གས",
      "ཡ ི",
      "བྱ ེད",
      "འད ུག",
      "འགྲ ོ",
      "ཡ ོང",
      "བ འི",
      "བས ྡ",
      "ས ྤ",
      "ཨ ོ",
      "ུ ས",
      "ག ཞ",
      "ག ང",
      "པ ར",
      "གི ས",
      "ད ྲ",
      "ཁ ྱ",
      "ཆ ོས",
      "ར ྣ",
      "བསྡ ད",
      "ུ མ",
      "བས མ",
      "མ ཐ",
      "ུ ར",
      "ུ ལ",
      "རྒྱ ས",
      "ག ནས",
      "ཆ གས",
      "ཆ ེན",
      "ག ཉི",
      "ས ངས",
      "འ ཛ",
      "གསུང ས",
      "ར ྗ",
      "ག ཏ",
      "བ ྲ",
      "ག ོ",
      "བཞ ིན",
      "ས ོ",
      "ུ ན",
      "གཉི ས",
      "ས ོང",
      "ར ྩ",
      "ཁ ོ",
      "ོ མ",
      "བ དག",
      "ོང ས",
      "བ དེ",
      "ད ོན",
      "སྐྱ ེ",
      "ཅ ན",
      "ས ྙ",
      "འཛ ིན",
      "ས ུ",
      "ཟ ེ",
      "ོ བ",
      "གས ུམ",
      "བ ཟ",
      "འ ཁ",
      "བ ཏ",
      "ོ ལ",
      "རྣ མ",
      "བླ ོ",
      "འ ཇ",
      "ར ི",
      "ལ མ",
      "སྡ ུག",
      "ཐ ུབ",
      "བར ྟ",
      "ལ ྷ",
      "ཕ ྱི",
      "དེ འི",
      "དང ོས",
      "རྗ ེ",
      "འདྲ འི",
      "མ ང",
      "སྐ བས",
      "བས ྔ",
      "གཞ ན",
      "འ བྲ",
      "ཞ ེ",
      "ྐ ྱ",
      "དཔ ེ",
      "ལ ེན",
      "ད མ",
      "བྱ ང",
      "དག ེ",
      "བྱ ུང",
      "སྙ ིང",
      "བ ཤ",
      "ས ྔ",
      "ྫ ས",
      "སྒ ྲ",
      "ཉི ད",
      "མ ཚ",
      "སྟ ོང",
      "ས ླ",
      "པ ས",
      "བ ཅ",
      "ད བ",
      "ས ྣ",
      "བསྔ ལ",
      "འཁ ོར",
      "ག ྱ",
      "བ རྒྱ",
      "ཀྱི ས",
      "ཕ ན",
      "ས ྲ",
      "ས ྦ",
      "མ ཁ",
      "ཕ ྱ",
      "བཞ ག",
      "ཆ ུབ",
      "ཕ ྲ",
      "སྐ ུ",
      "བརྟ ེན",
      "བཤ ད",
      "ན ི",
      "དཔ ེར",
      "ཉ མས",
      "ཞ ིག",
      "མ ཆ",
      "བར ྫས",
      "ཡ ོན",
      "ཉ ོན",
      "གྱ ུར",
      "བཏ ང",
      "ལ ུས",
      "ཡ ེ",
      "དབ ང",
      "ཆ ེ",
      "བར ྩ",
      "ག ཟ",
      "འདྲ ས",
      "མ ོངས",
      "འ ོ",
      "མཐ ོང",
      "ྱ ང",
      "སྐྱ ེས",
      "གྱི ས",
      "ཐ ོབ",
      "ད ཀ",
      "ཏ ུ",
      "ེ ལ",
      "དྲ ན",
      "སྒ ོམ",
      "རྒྱུ ད",
      "ས ོགས",
      "ལ ུགས",
      "ཕ ར",
      "རྟ ོག",
      "ར ྫ",
      "རྒྱ ལ",
      "བྱ མས",
      "སྤ ྱ",
      "ས ྨ",
      "ཙ མ",
      "ཚ ང",
      "ཁ ྲ",
      "ར བ",
      "ལ ོ",
      "ཀ ྱང",
      "གས ལ",
      "ཏ ན",
      "ང ོ",
      "སྤ ྲ",
      "ཐ ོག",
      "ལ བ",
      "ལ ེ",
      "ཚ ེ",
      "སྣ ང",
      "མ ར",
      "སྡ ང",
      "རྟ ེན",
      "ར ེས",
      "ཙ ང",
      "ལ ྡ",
      "ཚ ོས",
      "ཡ ག",
      "ལྟ ར",
      "ན མ",
      "བ ཀ",
      "འད ོད",
      "གྲ ུབ",
      "བས ྐྱ",
      "བརྩ ེ",
      "ཐ ུག",
      "བས ྟ",
      "བས ྒ",
      "འ ཆ",
      "ཚ ོགས",
      "ར ྡ",
      "ཀ ུན",
      "འབྲ ས",
      "ཁ ོང",
      "ར ླ",
      "ཆ ུང",
      "འ གྱུར",
      "ཡི ད",
      "གཞ ི",
      "ལ ྔ",
      "སྤྱ ོད",
      "ར ྐྱ",
      "ར ིག",
      "རི མ",
      "ཚ ུལ",
      "རྟ ོགས",
      "ཚ ད",
      "ཤ ི",
      "རི གས",
      "མ ོ",
      "ཚ ར",
      "ག ཉ",
      "ག ན",
      "ྱི ན",
      "ལ ག",
      "གཏ ོགས",
      "ག ལ",
      "ྱ ོང",
      "བཞ ི",
      "ལྡ ན",
      "ཆ ུ",
      "སྟ ེ",
      "མཚ ན",
      "ཚ ུར",
      "ག ླ",
      "ི གས",
      "ཞ ེས",
      "མ འི",
      "ད ོ",
      "གཟ ུགས",
      "དེ ས",
      "བསྐྱ ེད",
      "ར ིང",
      "ེ བས",
      "ི ད",
      "རྣ མས",
      "ཡ ུལ",
      "འ ཕ",
      "སྐ ད",
      "ཡི ས",
      "འདི འི",
      "འ བྱུང",
      "སྒྲ ུབ",
      "པོ འི",
      "མཆ ོག",
      "ཚ ིག",
      "དྲ ུག",
      "རྫ ོགས",
      "ས ང",
      "འབྲ ེལ",
      "ཕྱ ག",
      "རྗ ེས",
      "འཇ ིག",
      "ཚོ འི",
      "ཁྱ ོད",
      "ཤ ས",
      "སྔ གས",
      "བརྒྱ བ",
      "ག ཙ",
      "ད ད",
      "གཉ ེན",
      "མ ིན",
      "ཁ མས",
      "གྲ ོགས",
      "ཤ ར",
      "གཏ ོང",
      "ཞ ིང",
      "ང ན",
      "ཐ ུགས",
      "ཐ བས",
      "འཇ ུག",
      "འད ས",
      "སླ ོབ",
      "མཁ ན",
      "སྤ ྱི",
      "མ ེ",
      "གཙ ོ",
      "བ ལྟ",
      "ཚ ོར",
      "སྐྱ ོན",
      "ཐ ག",
      "སྒ ོ",
      "རྒྱུ ན",
      "ཁྱ བ",
      "ཐ ེ",
      "ཁྱ ད",
      "གན ོད",
      "མ ཉ",
      "ཡ ར",
      "བསྟ ན",
      "བ ཙ",
      "ཕྱ ོགས",
      "ཏ ོག",
      "ར ས",
      "ག ད",
      "མཁ འ",
      "དཀ འ",
      "མཐ འ",
      "རྟ ག",
      "ག ེ",
      "རྒྱ ག",
      "ག ཤ",
      "ཕྱི ན",
      "བདེ ན",
      "འ ཕྲ",
      "མཉ མ",
      "སྐྱ བས",
      "ཞ ི",
      "འཁ ྲ",
      "བཟ ུང",
      "ཆ ད",
      "ག ུ",
      "རླ ུང",
      "སྔ ོན",
      "བཟ ང",
      "འ ང",
      "རྡ ོ",
      "མི གས",
      "ད མིགས",
      "བས མས",
      "ང ེས",
      "བྲ ལ",
      "མཆ ོད",
      "མ ད",
      "སྲ ིད",
      "གྲ ོལ",
      "བཅ ུ",
      "མཐ ར",
      "ང ག",
      "འཆ ི",
      "ཕྱི ར",
      "ི ས",
      "ར ིན",
      "འ ོད",
      "ཁྱ ེད",
      "བཀ འ",
      "དག འ",
      "འ ག",
      "སྒ ང",
      "འ ོང",
      "མ ྱོང",
      "སྦ ྱ",
      "ཐ ར",
      "ེ གས",
      "ོ བས",
      "ཏ ེ",
      "ཚ ོད",
      "མཚ མས",
      "སྦ ྱིན",
      "ང ེ",
      "ལ ུང",
      "རྐྱ ེན",
      "སྟ ངས",
      "བསྒ ྲ",
      "འ བྱ",
      "བ ག",
      "ར ུ",
      "སླ ེབས",
      "ང ོས",
      "ལྷ ག",
      "སྨ ོན",
      "ཇ ི",
      "བ ཤེས",
      "ཟ ག",
      "མ ཛ",
      "ལ ོངས",
      "ཤ ོག",
      "ད བུ",
      "གླ ིང",
      "ཅ ི",
      "དཔ ྱ",
      "བས གས",
      "འབྱ ོར",
      "སྟེ ང",
      "ག ཅ",
      "ཆ ོག",
      "ལ ོག",
      "གས ང",
      "ཁྲ ི",
      "དེ ར",
      "པ ུ",
      "ཁ ང",
      "ཙ ི",
      "འགྲ ིག",
      "སྡ ོམ",
      "ཉ ིན",
      "བལྟ ས",
      "ཟ ོལ",
      "མ ྱི",
      "གྲ ངས",
      "བར ྗ",
      "སྟ ོན",
      "ར ོལ",
      "ར ྨ",
      "དུ ན",
      "བཟ ོ",
      "གཤ ེགས",
      "སྤྲ ུལ",
      "ཁ ག",
      "ཕ ུང",
      "རྐྱ ང",
      "སྲ ོག",
      "བས ླ",
      "གཏ ང",
      "ར ྒ",
      "བུ འི",
      "ག ོང",
      "འ ཁྱ",
      "ཅ ང",
      "ཀི ས",
      "དྲ ག",
      "ཧ ཱ",
      "སྡ ིག",
      "བཅ ས",
      "ཟ ད",
      "སྟ ོབས",
      "ཀ ླ",
      "ཆ ེས",
      "ད བྱ",
      "ར ྙ",
      "གས ོལ",
      "མད ོར",
      "རྟ གས",
      "ཤ ག",
      "རྒྱ ོ",
      "དཀ ོན",
      "འཛ མ",
      "མང ོན",
      "སྐ ོར",
      "ཞ ུ",
      "ག ནང",
      "མཐ ུན",
      "སོ འི",
      "ཁྲ ོ",
      "ྱི ད",
      "ཧཱ ུ",
      "བས ོད",
      "བཅ ོམ",
      "དུ ད",
      "ཨ ུམ",
      "བ ྱིན",
      "ོ མས",
      "བཏ བ",
      "བརྟ ག",
      "ཞ ུས",
      "བཟ ོད",
      "མ ག",
      "ར ྫས",
      "ཤ ིང",
      "ན ུས",
      "ཕ ལ",
      "སྐ ྱིད",
      "སྦ ྱོང",
      "འད ུས",
      "ན འང",
      "དག ྲ",
      "ར ོ",
      "ན ོར",
      "ུང ས",
      "མཚ ོན",
      "བརྒྱ ད",
      "ན མས",
      "སྡ ོད",
      "མ གོ",
      "བསྒ ོམ",
      "ཕྲ ག",
      "མ ིག",
      "བཏ གས",
      "སྲ ས",
      "དཔྱ ད",
      "ག ཡ",
      "ཆ ེར",
      "སྤྱི ར",
      "འ བ",
      "དྲ ི",
      "ལ ོགས",
      "འ ཚ",
      "མ ཁྱ",
      "ཤ ུགས",
      "འ བུ",
      "ཐེ ག",
      "མི ང",
      "མཁྱ ེན",
      "ཞ ེན",
      "ག ུས",
      "གཉི ད",
      "སྨ ིན",
      "རླ བས",
      "བརྗ ོད",
      "མག ོན",
      "ཕ ུལ",
      "བ དུན",
      "ས ྒྱ",
      "ར ུང",
      "ས ུམ",
      "ད ཱ",
      "བསྡ ུས",
      "། །",
      "འ ཐ",
      "གཟ ིགས",
      "ཆ ེད",
      "བླ ངས",
      "ྱ ལ",
      "ལ གས",
      "ཟ ིན",
      "འཇ ིགས",
      "དམ ྱལ",
      "ཟ ླ",
      "ི བ",
      "བ ོད",
      "ཁྲི མས",
      "འད ུན",
      "ག ར",
      "ཉ ེས",
      "གཏ ན",
      "མ དོ",
      "མཁ ས",
      "ར ོགས",
      "བ རྒྱུད",
      "བས ྐ",
      "བསྒྲ ུབ",
      "དཀ ར",
      "འད ུ",
      "གཞ ུང",
      "ཟ ུང",
      "ག ཙང",
      "རྣ ལ",
      "སྤྲ ད",
      "སྤ ང",
      "བཞ ུགས",
      "བ ཅད",
      "ཆ ོད",
      "དག ོངས",
      "ི མ",
      "ད ོག",
      "མཛ ད",
      "ཉ ན",
      "ཐ ོས",
      "ད ུག",
      "ད ྭ",
      "ཚོ མ",
      "དཔ ག",
      "འ དེ",
      "སྒྲ ིབ",
      "ང ལ",
      "འ ོག",
      "འཁྲ ུལ",
      "ས ོས",
      "རྙ ེད",
      "བསྟ ེན",
      "རྒ ན",
      "ང ང",
      "སྦྱ ོར",
      "འགྲ ུབ",
      "སྒྱ ུ",
      "ལ ན",
      "བསླ བ",
      "འ ོངས",
      "ལེ གས",
      "ང འི",
      "འགྲ ེལ",
      "ར ག",
      "ཤ ོར",
      "བས ྒྱ",
      "མཐ ོ",
      "ན ག",
      "བུ ར",
      "སྤྱ ན",
      "བསྒྱ ུར",
      "ཐ ོགས",
      "ཨོ ཾ",
      "བསྔ ོ",
      "མ ོས",
      "ུ ད",
      "འཚ ལ",
      "ཡོང ས",
      "འདེ བས",
      "རྨ ི",
      "ཡ ན",
      "ར ྦ",
      "བརྒྱ ག",
      "ཁ ྱི",
      "བཟ ོས",
      "གཅ ེས",
      "རྫ ོབ",
      "དྲ ིན",
      "ཎ ི",
      "ར བས",
      "རྩ ེ",
      "འཕ གས",
      "ཕ ེབས",
      "ཚ ན",
      "འདི ར",
      "ཨ ུ",
      "དཔ འ",
      "སྡ ེ",
      "མ ྱ",
      "སླ ོང",
      "ཤ ོས",
      "པ ད",
      "ཞ ུགས",
      "ར ྐ",
      "འཇ འ",
      "མ ན",
      "ཁ ས",
      "སྲ ུང",
      "ན ད",
      "ཉ ེ",
      "དཔ ལ",
      "ག ཱ",
      "མ ུག",
      "མ ཚོ",
      "གྲ གས",
      "འ ུ",
      "བར ླ",
      "ཏ ག",
      "རྡ ོག",
      "ཐ ལ",
      "གཅ ོད",
      "གཏ ི",
      "བཙ ུན",
      "སྨ ན",
      "འཕ ང",
      "ཐ ང",
      "རྒྱ བ",
      "ད ཀྱི",
      "དཀྱི ལ",
      "བརྩ ོན",
      "ད ུང",
      "བ ོན",
      "ཨ ང",
      "གཞ ུག",
      "བྱ ེ",
      "ཧཱུ ྃ",
      "ཧཱུ ཾ",
      "བརྟ ན",
      "རྦ ད",
      "པོ ར",
      "ལྟ ོས",
      "མ ཇ",
      "ཁ ོས",
      "སྦྱ ར",
      "འདི ས",
      "བས ད",
      "ལྡ ོག",
      "ཟ བ",
      "འཁྱ ག",
      "ེ བ",
      "བཀ ྲ",
      "ེ ང",
      "རི ས",
      "ཐ ུན",
      "པ ོས",
      "བས ྲ",
      "འཆ ར",
      "བཅ ོས",
      "གོ མས",
      "ཤི ས",
      "ཁ བ",
      "ག ཡེ",
      "བསྒ ོམས",
      "བོ འི",
      "སྤྲ ོས",
      "འཇ ོག",
      "ཧ ེ",
      "འད ུལ",
      "ཆ ོ",
      "ཅ ེས",
      "ས ོར",
      "འབ ད",
      "ཁྱ ོན",
      "ཞ བས",
      "སྦ ྱང",
      "ས འི",
      "གྲ ྭ",
      "མ ལ",
      "འཕ ོ",
      "འག ག",
      "ཚ ུན",
      "ང ེད",
      "རྩ ོལ",
      "ལ ངས",
      "སྤྲ ོད",
      "གྲ ས",
      "རྒྱུ ས",
      "ཡ ུན",
      "ས ི",
      "འ ཚོ",
      "ཉ ུང",
      "ཀླ ོག",
      "སྒྲ ོལ",
      "ཏ ིང",
      "སྙ མ",
      "ཅ ིང",
      "འད ོན",
      "བརླ ག",
      "སྐྱ ོབ",
      "ཅ ུ",
      "སྙ ན",
      "ཚ ུགས",
      "ད པོ",
      "ཏ ིག",
      "བ ེ",
      "དབ ྱི",
      "འ གལ",
      "བཟ ླ",
      "སྣ ོད",
      "ས ད",
      "ཁྱི མ",
      "བཤ གས",
      "དཔོ ན",
      "འ མ",
      "སེ ལ",
      "སྨ ྲ",
      "ཞ ལ",
      "སླ ེབ",
      "སྐྱེ ལ",
      "རྡ ུལ",
      "དབྱ ེ",
      "ཤ ུ",
      "རྗེ འི",
      "དྭ གས",
      "ཡ ིག",
      "དཔྱ ོད",
      "ཁ ོག",
      "བ ོར",
      "ཁྲ ིད",
      "འག ོག",
      "སྔ ར",
      "གད མས",
      "སྤ ངས",
      "པ ེ",
      "མ ོང",
      "ར གས",
      "དག ུ",
      "དབྱ ེར",
      "འགྲོ འི",
      "ཤ ིན",
      "དག ག",
      "གཏ ོར",
      "སྐ ུར",
      "ད ལ",
      "དབ ེན",
      "པད ྨ",
      "གན ད",
      "དྲ ང",
      "ལ ྕ",
      "བུ མ",
      "འཕྲ ོ",
      "འཕ ེལ",
      "འབུ ལ",
      "གས ར",
      "རྒྱ ན",
      "བས ྙ",
      "་ ་",
      "གི ན",
      "ཕ ོ",
      "ཡ བ",
      "ི ལ",
      "ཅ ག",
      "ལྟ འི",
      "ས ེར",
      "འཕྲ ད",
      "བཀ ོད",
      "བཙ ུགས",
      "མ དུན",
      "གཟ ུང",
      "བླ ང",
      "ཐ ིམ",
      "ཕ ུན",
      "གི ར",
      "སྒྲ ིག",
      "བར ྡ",
      "སྐ ྲ",
      "ཡ ས",
      "སྤ ོང",
      "མ ུ",
      "གྲ ུ",
      "འཁྱ ེར",
      "ཐ ིག",
      "ར ོམ",
      "གས ོ",
      "མད ོག",
      "ཐ ུང",
      "ས ུས",
      "བ ཱ",
      "རྫ ོ",
      "དབྱི ངས",
      "འཁྲ ུག",
      "རྩ ི",
      "འགྲ ུས",
      "དམ ར",
      "ཀ ྱ",
      "བློ འི",
      "ན ོ",
      "ཁ ུག",
      "ཅ ུང",
      "གད འ",
      "མཁ འི",
      "གཞ ག",
      "ཁ ེ",
      "སྐྱ ེད",
      "ཡ ུམ",
      "ཁྱ ེར",
      "བས ག",
      "གླ ོ",
      "ལ ང",
      "དྲ གས",
      "ལྷ ོད",
      "ལ ོང",
      "བཀ ག",
      "ཆ ག",
      "རྐ ང",
      "བ དུད",
      "གཡེ ང",
      "བསྙ ེན",
      "ུ བས",
      "མཇ ལ",
      "ན ེ",
      "ལྟ ེ",
      "ད ོགས",
      "འཆ ང",
      "ཚ བ",
      "བསྐྱ ར",
      "ཕྱ ེ",
      "ཤ ིག",
      "མི འི",
      "ཉ ེན",
      "དུ ལ",
      "གས ོད",
      "རྙ ིང",
      "ད པའི",
      "གས ེར",
      "ད ྷ",
      "སྐྱ ོང",
      "སྦྱ ངས",
      "ད ོར",
      "ཀླ ུ",
      "བླ ོས",
      "ཐ ོན",
      "ཞི བ",
      "ག ོས",
      "ཕྲ ན",
      "འདྲ ེས",
      "ཐ མ",
      "ད ར",
      "བས ལ",
      "གཉ ེར",
      "སྐྱ ོ",
      "སྙ ད",
      "སྒྲ ོན",
      "སྤྱ ད",
      "ཉ ེར",
      "བསྟ ོད",
      "བཅུ ད",
      "དམ ན",
      "གཟ བ",
      "བསྒྲ ུབས",
      "བྲ ག",
      "བཙ ལ",
      "ག ནམ",
      "འ གྱ",
      "བར ྙ",
      "བ ྱི",
      "ཉ ལ",
      "བཅ ུག",
      "མཆ ེད",
      "ག ཤིས",
      "ལེ འུ",
      "བཟ འ",
      "བཞ ེས",
      "འག འ",
      "སྔ ན",
      "ཕྲ ེང",
      "ཟ ས",
      "འཐ ུང",
      "དེ ང",
      "ལྷ ན",
      "ར ུས",
      "བསླ ུ",
      "གྲ ོང",
      "དབུ གས",
      "དག ོན",
      "བརྩ ི",
      "འདྲ ེན",
      "ཁོ འི",
      "གཡ ོ",
      "བ བས",
      "རྩ ོད",
      "ལྟ ུང",
      "དབྱ ངས",
      "སྦ ྲ",
      "འཇ མ",
      "མོ འི",
      "བུ ད",
      "ཁ ུངས",
      "ཀླ ོང",
      "རེ འི",
      "རྩ ལ",
      "ཇ ོ",
      "ཆ ར",
      "སྐྲ ག",
      "ཎ ྜ",
      "རྨ ོངས",
      "གད ན",
      "བྱ འི",
      "སྐུ འི",
      "རྒྱུ འི",
      "བ གྲ",
      "ཚ ིགས",
      "བཟླ ས",
      "རྫ ུན",
      "ཁྲ ག",
      "སླ ོག",
      "གས ོག",
      "ལ ེར",
      "སྤྲ ིན",
      "བསྐ ལ",
      "ང ར",
      "ང ུ",
      "ལ འང",
      "ཚ ོན",
      "པ འོ",
      "ཕ བ",
      "སྒྲ ོ",
      "ཕྱ ུག",
      "ཁ ུར",
      "དྭ ངས",
      "འགྱ ོད",
      "གད ོང",
      "ཏ ི",
      "འ བྱེད",
      "གས ོན",
      "དག ོང",
      "སེ ང",
      "འད ོགས",
      "ཤ ཱ",
      "སྡ ོང",
      "བརྩ ིས",
      "བཞི འི",
      "རྩ ིས",
      "གླ ོག",
      "ེ ག",
      "ཨ མ",
      "འ གོ",
      "ག ུང",
      "ག དུལ",
      "ཏ ིས",
      "དྲ ོད",
      "སྐ ར",
      "བྲ ིས",
      "ཀ འི",
      "ཨ ཱ",
      "དྲ ངས",
      "འབུ མ",
      "ལྟ ོ",
      "གཏ ིང",
      "ལྕ གས",
      "བརྗ ེད",
      "ལ ྗ",
      "གཏ ད",
      "བྱ འོ",
      "འ ཐོབ",
      "བསྡ ུ"
    ]
  

In [2]:
len(tib_merge)

1047

In [5]:
import json
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
tokenizer_json_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json")
output_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json")

# Example Tibetan merges (your raw format with spaces)
tib_merges_raw = tib_merge
# -----------------------------
# Convert "ད   ེ" → ["ད", "ེ"]
# -----------------------------
tib_merges = []
for m in tib_merges_raw:
    parts = str(m).split(" ")
    if len(parts) == 2:
        tib_merges.append(parts)
    else:
        print(f"⚠️ Skipping invalid merge: {m}")

# -----------------------------
# Load tokenizer.json
# -----------------------------
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
    tok_data = json.load(f)

# -----------------------------
# Extend merges
# -----------------------------
tok_data["model"]["merges"] = tib_merges


print(f"✅ Added  Tibetan merges. New merge count: ")

# -----------------------------
# Save new tokenizer.json
# -----------------------------
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(tok_data, f, ensure_ascii=False, indent=2)

print(f"💾 Extended tokenizer.json written to {output_path}")


✅ Added  Tibetan merges. New merge count: 
💾 Extended tokenizer.json written to /home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json


In [None]:
import json
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
tokenizer_json_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json")
output_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json")

# Example Tibetan merges (your raw format with spaces)
tib_merges_raw = tib_merge
# -----------------------------
# Convert "ད   ེ" → ["ད", "ེ"]
# -----------------------------
tib_merges = []
for m in tib_merges_raw:
    parts = str(m).split(" ")
    if len(parts) == 2:
        tib_merges.append(parts)
    else:
        print(f"⚠️ Skipping invalid merge: {m}")

# -----------------------------
# Load tokenizer.json
# -----------------------------
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
    tok_data = json.load(f)

# -----------------------------
# Extend merges
# -----------------------------
merges = tok_data["model"]["merges"]

added = 0
for m in tib_merges:
    if m not in merges:
        merges.append(m)
        added += 1

print(f"✅ Added {added} Tibetan merges. New merge count: {len(merges)}")

# -----------------------------
# Save new tokenizer.json
# -----------------------------
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(tok_data, f, ensure_ascii=False, indent=2)

print(f"💾 Extended tokenizer.json written to {output_path}")


In [8]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token → id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"✅ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"✅ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("⚠️ No merges found in tokenizer.json (check if it's really BPE)")


✅ Saved vocab.json to /home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/vocab.json
The OrderedVocab you are attempting to save contains holes for indices [50258, 50259, 50260, 50261, 50262, 50263, 50264, 50265, 50266, 50267, 50268, 50269, 50270, 50271, 50272, 50273, 50274, 50275, 50276, 50277, 50278, 50279, 50280, 50281, 50282, 50283, 50284, 50285, 50286, 50287, 50288, 50289, 50290, 50291, 50292, 50293, 50294, 50295, 50296, 50297, 50298, 50299, 50300, 50301, 50302, 50303, 50304, 50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321, 50322, 50323, 50324, 50325, 50326, 50327, 50328, 50329, 50330, 50331, 50332, 50333, 50334, 50335, 50336, 50337, 50338, 50339, 50340, 50341, 50342, 50343, 50344, 50345, 50346, 50347, 50348, 50349, 50350, 50351, 50352, 50353, 50354, 50355, 50356, 50357, 50358, 50359, 50360, 50361, 50362, 50363, 50364, 50365, 50366, 50367, 50368, 50369, 50370, 50371, 50372, 50373, 5037

In [9]:
from transformers import PreTrainedTokenizerFast

OUTPUT_DIR = "data/whisper_tokenizer_added_tibetan"
tok = PreTrainedTokenizerFast.from_pretrained(OUTPUT_DIR)
print(f"✅ Successfully loaded tokenizer with {len(tok)} tokens")

✅ Successfully loaded tokenizer with 53014 tokens


In [10]:
from transformers import WhisperTokenizer

# Load the base tokenizer (keeps all Whisper special tokens)
old_tokenizer = WhisperTokenizer.from_pretrained("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'WhisperTokenizer'.


In [11]:
old_tokenizer.save_pretrained("data/whisper_tokenizer_added_tibetan_class")

('data/whisper_tokenizer_added_tibetan_class/tokenizer_config.json',
 'data/whisper_tokenizer_added_tibetan_class/special_tokens_map.json',
 'data/whisper_tokenizer_added_tibetan_class/vocab.json',
 'data/whisper_tokenizer_added_tibetan_class/merges.txt',
 'data/whisper_tokenizer_added_tibetan_class/normalizer.json',
 'data/whisper_tokenizer_added_tibetan_class/added_tokens.json')

In [12]:
test = "བྱང་ཆུབ་ སེམས་ ཡོད།"
ids = tok.encode(test)
print("IDs:", ids)
print("Tokens:", tok.convert_ids_to_tokens(ids))
print("token len:", len(ids))


IDs: [50258, 50363, 52134, 51866, 52161, 51866, 220, 52010, 51866, 220, 51979, 51868, 50257]
Tokens: [None, None, 'བྱང', '་', 'ཆུབ', '་', 'Ġ', 'སེམས', '་', 'Ġ', 'ཡོད', '།', '<|endoftext|>']
token len: 13
