In [None]:
import json
import os
from pathlib import Path
from transformers import PreTrainedTokenizerFast

# -------------------------------
# 1. Paths
# -------------------------------
WHISPER_DIR = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer"              # path to original whisper tokenizer files
TIB_BPE_DIR = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/tokenizer_bpe"                       # path where your trained tibetan vocab.json + merges.txt live
OUTPUT_DIR = Path("whisper_tibetan_extended")     # save here
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer/tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token ‚Üí id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"‚úÖ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("‚ö†Ô∏è No merges found in tokenizer.json (check if it's really BPE)")


In [None]:
# -------------------------------
# 2. Load Whisper vocab + merges
# -------------------------------
with open(Path(WHISPER_DIR) / "vocab.json", "r", encoding="utf-8") as f:
    whisper_vocab = json.load(f)

with open(Path(WHISPER_DIR) / "merges.txt", "r", encoding="utf-8") as f:
    whisper_merges = f.read().splitlines()[1:]  # skip "#version"

print(f"üìò Whisper vocab size: {len(whisper_vocab)}")
print(f"üìò Whisper merges: {len(whisper_merges)}")

# -------------------------------
# 3. Load Tibetan vocab + merges
# -------------------------------
with open(Path(TIB_BPE_DIR) / "vocab.json", "r", encoding="utf-8") as f:
    tib_vocab = json.load(f)

with open(Path(TIB_BPE_DIR) / "merges.txt", "r", encoding="utf-8") as f:
    tib_merges = f.read().splitlines()[1:]  # skip "#version"

print(f"üìó Tibetan vocab size: {len(tib_vocab)}")
print(f"üìó Tibetan merges: {len(tib_merges)}")

In [None]:
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("openai/whisper-small")
print("Original size:", len(tok))


In [None]:
tok.save_pretrained("data/whisper_tokenizer")

In [None]:
print(tib_vocab.keys())
new_tokens = list(tib_vocab.keys())

In [None]:
tok.add_tokens([t for t in new_tokens if t not in tok.get_vocab()])
print(f"‚úÖ Added {len(new_tokens)} tokens to tokenizer")


In [None]:

# Add returns how many were actually new
tok.add_tokens(new_tokens)

print("Extended size:", len(tok))


In [None]:
tok.save_pretrained("data/whisper_tokenizer_added_tibetan")

In [None]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token ‚Üí id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"‚úÖ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("‚ö†Ô∏è No merges found in tokenizer.json (check if it's really BPE)")


In [None]:
tib_merges

In [1]:
tib_merge = [
      "‡Ωë ‡Ω∫",
      "‡Ω∫ ‡Ωë",
      "‡Ωì ‡Ω¶",
      "‡ΩÇ ‡Ω¶",
      "‡Ω¢ ‡Ω∫‡Ωë",
      "‡Ωº ‡Ωë",
      "‡Ω† ‡Ω≤",
      "‡Ω† ‡Ωë",
      "‡Ωñ ‡Ω¶",
      "‡Ωò ‡Ω¶",
      "‡æ± ‡Ω≤",
      "‡ΩÇ ‡Ω≤",
      "‡Ω° ‡Ωº‡Ωë",
      "‡Ω≤ ‡ΩÇ",
      "‡Ωñ ‡æ±",
      "‡Ωº ‡ΩÑ",
      "‡Ω≤ ‡Ωì",
      "‡Ωº ‡Ω¶",
      "‡æí ‡æ±",
      "‡Ω¢ ‡æí‡æ±",
      "‡Ωë ‡ΩÇ",
      "‡Ω∫ ‡Ω¢",
      "‡Ω¥ ‡ΩÑ",
      "‡ΩÖ ‡Ω≤‡ΩÇ",
      "‡Ω¶ ‡æê",
      "‡Ωî ‡Ωº",
      "‡Ω∫ ‡Ω¶",
      "‡Ωë ‡ΩÑ",
      "‡Ωº ‡Ωì",
      "‡Ω∫ ‡Ωì",
      "‡Ω¥ ‡ΩÇ",
      "‡Ωë ‡Ω¥",
      "‡ΩÄ ‡æ±‡Ω≤",
      "‡Ωü ‡Ω∫‡Ω¢",
      "‡Ω£ ‡æü",
      "‡Ω¶ ‡Ω∫",
      "‡ΩÇ ‡æ≤",
      "‡Ωî ‡Ω†‡Ω≤",
      "‡Ω¢ ‡ΩÑ",
      "‡Ω° ‡Ω≤‡Ωì",
      "‡Ω® ‡Ω∫",
      "‡Ωñ ‡Ω¢",
      "‡Ω†‡Ωë ‡æ≤",
      "‡Ω¶‡Ω∫ ‡Ωò‡Ω¶",
      "‡Ωº ‡ΩÇ",
      "‡Ω†‡Ωë ‡Ω≤",
      "‡Ωò ‡Ω≤",
      "‡Ωº ‡ΩÇ‡Ω¶",
      "‡ΩÇ ‡ΩÖ‡Ω≤‡ΩÇ",
      "‡ºã ‡ºç",
      "‡Ω¥ ‡Ωñ",
      "‡Ωò ‡Ω∫‡Ωë",
      "‡ΩÑ ‡Ω¶",
      "‡Ωñ‡æ± ‡Ω¶",
      "‡Ω° ‡ΩÑ",
      "‡Ω¶‡æê ‡æ±",
      "‡Ωñ ‡Ωû",
      "‡Ωâ ‡Ω≤",
      "‡Ωº ‡Ω¢",
      "‡ΩÖ ‡Ωë",
      "‡Ωñ ‡Ω¥",
      "‡ΩÇ ‡æ±‡Ω≤",
      "‡Ω¢‡æí‡æ± ‡Ω¥",
      "‡Ω§ ‡Ω∫‡Ω¶",
      "‡Ω† ‡ΩÇ‡æ≤",
      "‡Ωñ ‡æ≥",
      "‡Ωê ‡Ωò‡Ω¶",
      "‡Ω£ ‡Ω¶",
      "‡Ωì ‡ΩÑ",
      "‡Ω≤ ‡ΩÑ",
      "‡Ω¶ ‡æ°",
      "‡Ωë‡ΩÇ ‡Ωº‡Ω¶",
      "‡ΩÇ‡Ω¶ ‡Ω¥‡ΩÑ",
      "‡Ωë ‡Ωî",
      "‡Ω¶ ‡æí",
      "‡Ω¢ ‡æü",
      "‡ΩÄ ‡Ω≤",
      "‡Ωë‡Ω¥ ‡Ω¶",
      "‡Ω¢ ‡Ω∫",
      "‡Ωö ‡Ωº",
      "‡Ωñ ‡Ωº",
      "‡Ω¶ ‡æü",
      "‡Ω¥ ‡ΩÇ‡Ω¶",
      "‡Ω° ‡Ω≤",
      "‡Ωñ‡æ± ‡Ω∫‡Ωë",
      "‡Ω†‡Ωë ‡Ω¥‡ΩÇ",
      "‡Ω†‡ΩÇ‡æ≤ ‡Ωº",
      "‡Ω° ‡Ωº‡ΩÑ",
      "‡Ωñ ‡Ω†‡Ω≤",
      "‡Ωñ‡Ω¶ ‡æ°",
      "‡Ω¶ ‡æ§",
      "‡Ω® ‡Ωº",
      "‡Ω¥ ‡Ω¶",
      "‡ΩÇ ‡Ωû",
      "‡ΩÇ ‡ΩÑ",
      "‡Ωî ‡Ω¢",
      "‡ΩÇ‡Ω≤ ‡Ω¶",
      "‡Ωë ‡æ≤",
      "‡ΩÅ ‡æ±",
      "‡ΩÜ ‡Ωº‡Ω¶",
      "‡Ω¢ ‡æ£",
      "‡Ωñ‡Ω¶‡æ° ‡Ωë",
      "‡Ω¥ ‡Ωò",
      "‡Ωñ‡Ω¶ ‡Ωò",
      "‡Ωò ‡Ωê",
      "‡Ω¥ ‡Ω¢",
      "‡Ω¥ ‡Ω£",
      "‡Ω¢‡æí‡æ± ‡Ω¶",
      "‡ΩÇ ‡Ωì‡Ω¶",
      "‡ΩÜ ‡ΩÇ‡Ω¶",
      "‡ΩÜ ‡Ω∫‡Ωì",
      "‡ΩÇ ‡Ωâ‡Ω≤",
      "‡Ω¶ ‡ΩÑ‡Ω¶",
      "‡Ω† ‡Ωõ",
      "‡ΩÇ‡Ω¶‡Ω¥‡ΩÑ ‡Ω¶",
      "‡Ω¢ ‡æó",
      "‡ΩÇ ‡Ωè",
      "‡Ωñ ‡æ≤",
      "‡ΩÇ ‡Ωº",
      "‡Ωñ‡Ωû ‡Ω≤‡Ωì",
      "‡Ω¶ ‡Ωº",
      "‡Ω¥ ‡Ωì",
      "‡ΩÇ‡Ωâ‡Ω≤ ‡Ω¶",
      "‡Ω¶ ‡Ωº‡ΩÑ",
      "‡Ω¢ ‡æ©",
      "‡ΩÅ ‡Ωº",
      "‡Ωº ‡Ωò",
      "‡Ωñ ‡Ωë‡ΩÇ",
      "‡Ωº‡ΩÑ ‡Ω¶",
      "‡Ωñ ‡Ωë‡Ω∫",
      "‡Ωë ‡Ωº‡Ωì",
      "‡Ω¶‡æê‡æ± ‡Ω∫",
      "‡ΩÖ ‡Ωì",
      "‡Ω¶ ‡æô",
      "‡Ω†‡Ωõ ‡Ω≤‡Ωì",
      "‡Ω¶ ‡Ω¥",
      "‡Ωü ‡Ω∫",
      "‡Ωº ‡Ωñ",
      "‡ΩÇ‡Ω¶ ‡Ω¥‡Ωò",
      "‡Ωñ ‡Ωü",
      "‡Ω† ‡ΩÅ",
      "‡Ωñ ‡Ωè",
      "‡Ωº ‡Ω£",
      "‡Ω¢‡æ£ ‡Ωò",
      "‡Ωñ‡æ≥ ‡Ωº",
      "‡Ω† ‡Ωá",
      "‡Ω¢ ‡Ω≤",
      "‡Ω£ ‡Ωò",
      "‡Ω¶‡æ° ‡Ω¥‡ΩÇ",
      "‡Ωê ‡Ω¥‡Ωñ",
      "‡Ωñ‡Ω¢ ‡æü",
      "‡Ω£ ‡æ∑",
      "‡Ωï ‡æ±‡Ω≤",
      "‡Ωë‡Ω∫ ‡Ω†‡Ω≤",
      "‡Ωë‡ΩÑ ‡Ωº‡Ω¶",
      "‡Ω¢‡æó ‡Ω∫",
      "‡Ω†‡Ωë‡æ≤ ‡Ω†‡Ω≤",
      "‡Ωò ‡ΩÑ",
      "‡Ω¶‡æê ‡Ωñ‡Ω¶",
      "‡Ωñ‡Ω¶ ‡æî",
      "‡ΩÇ‡Ωû ‡Ωì",
      "‡Ω† ‡Ωñ‡æ≤",
      "‡Ωû ‡Ω∫",
      "‡æê ‡æ±",
      "‡Ωë‡Ωî ‡Ω∫",
      "‡Ω£ ‡Ω∫‡Ωì",
      "‡Ωë ‡Ωò",
      "‡Ωñ‡æ± ‡ΩÑ",
      "‡Ωë‡ΩÇ ‡Ω∫",
      "‡Ωñ‡æ± ‡Ω¥‡ΩÑ",
      "‡Ω¶‡æô ‡Ω≤‡ΩÑ",
      "‡Ωñ ‡Ω§",
      "‡Ω¶ ‡æî",
      "‡æ´ ‡Ω¶",
      "‡Ω¶‡æí ‡æ≤",
      "‡Ωâ‡Ω≤ ‡Ωë",
      "‡Ωò ‡Ωö",
      "‡Ω¶‡æü ‡Ωº‡ΩÑ",
      "‡Ω¶ ‡æ≥",
      "‡Ωî ‡Ω¶",
      "‡Ωñ ‡ΩÖ",
      "‡Ωë ‡Ωñ",
      "‡Ω¶ ‡æ£",
      "‡Ωñ‡Ω¶‡æî ‡Ω£",
      "‡Ω†‡ΩÅ ‡Ωº‡Ω¢",
      "‡ΩÇ ‡æ±",
      "‡Ωñ ‡Ω¢‡æí‡æ±",
      "‡ΩÄ‡æ±‡Ω≤ ‡Ω¶",
      "‡Ωï ‡Ωì",
      "‡Ω¶ ‡æ≤",
      "‡Ω¶ ‡æ¶",
      "‡Ωò ‡ΩÅ",
      "‡Ωï ‡æ±",
      "‡Ωñ‡Ωû ‡ΩÇ",
      "‡ΩÜ ‡Ω¥‡Ωñ",
      "‡Ωï ‡æ≤",
      "‡Ω¶‡æê ‡Ω¥",
      "‡Ωñ‡Ω¢‡æü ‡Ω∫‡Ωì",
      "‡Ωñ‡Ω§ ‡Ωë",
      "‡Ωì ‡Ω≤",
      "‡Ωë‡Ωî ‡Ω∫‡Ω¢",
      "‡Ωâ ‡Ωò‡Ω¶",
      "‡Ωû ‡Ω≤‡ΩÇ",
      "‡Ωò ‡ΩÜ",
      "‡Ωñ‡Ω¢ ‡æ´‡Ω¶",
      "‡Ω° ‡Ωº‡Ωì",
      "‡Ωâ ‡Ωº‡Ωì",
      "‡ΩÇ‡æ± ‡Ω¥‡Ω¢",
      "‡Ωñ‡Ωè ‡ΩÑ",
      "‡Ω£ ‡Ω¥‡Ω¶",
      "‡Ω° ‡Ω∫",
      "‡Ωë‡Ωñ ‡ΩÑ",
      "‡ΩÜ ‡Ω∫",
      "‡Ωñ‡Ω¢ ‡æ©",
      "‡ΩÇ ‡Ωü",
      "‡Ω†‡Ωë‡æ≤ ‡Ω¶",
      "‡Ωò ‡Ωº‡ΩÑ‡Ω¶",
      "‡Ω† ‡Ωº",
      "‡Ωò‡Ωê ‡Ωº‡ΩÑ",
      "‡æ± ‡ΩÑ",
      "‡Ω¶‡æê‡æ± ‡Ω∫‡Ω¶",
      "‡ΩÇ‡æ±‡Ω≤ ‡Ω¶",
      "‡Ωê ‡Ωº‡Ωñ",
      "‡Ωë ‡ΩÄ",
      "‡Ωè ‡Ω¥",
      "‡Ω∫ ‡Ω£",
      "‡Ωë‡æ≤ ‡Ωì",
      "‡Ω¶‡æí ‡Ωº‡Ωò",
      "‡Ω¢‡æí‡æ±‡Ω¥ ‡Ωë",
      "‡Ω¶ ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω£ ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωï ‡Ω¢",
      "‡Ω¢‡æü ‡Ωº‡ΩÇ",
      "‡Ω¢ ‡æ´",
      "‡Ω¢‡æí‡æ± ‡Ω£",
      "‡Ωñ‡æ± ‡Ωò‡Ω¶",
      "‡Ω¶‡æ§ ‡æ±",
      "‡Ω¶ ‡æ®",
      "‡Ωô ‡Ωò",
      "‡Ωö ‡ΩÑ",
      "‡ΩÅ ‡æ≤",
      "‡Ω¢ ‡Ωñ",
      "‡Ω£ ‡Ωº",
      "‡ΩÄ ‡æ±‡ΩÑ",
      "‡ΩÇ‡Ω¶ ‡Ω£",
      "‡Ωè ‡Ωì",
      "‡ΩÑ ‡Ωº",
      "‡Ω¶‡æ§ ‡æ≤",
      "‡Ωê ‡Ωº‡ΩÇ",
      "‡Ω£ ‡Ωñ",
      "‡Ω£ ‡Ω∫",
      "‡Ωö ‡Ω∫",
      "‡Ω¶‡æ£ ‡ΩÑ",
      "‡Ωò ‡Ω¢",
      "‡Ω¶‡æ° ‡ΩÑ",
      "‡Ω¢‡æü ‡Ω∫‡Ωì",
      "‡Ω¢ ‡Ω∫‡Ω¶",
      "‡Ωô ‡ΩÑ",
      "‡Ω£ ‡æ°",
      "‡Ωö ‡Ωº‡Ω¶",
      "‡Ω° ‡ΩÇ",
      "‡Ω£‡æü ‡Ω¢",
      "‡Ωì ‡Ωò",
      "‡Ωñ ‡ΩÄ",
      "‡Ω†‡Ωë ‡Ωº‡Ωë",
      "‡ΩÇ‡æ≤ ‡Ω¥‡Ωñ",
      "‡Ωñ‡Ω¶ ‡æê‡æ±",
      "‡Ωñ‡Ω¢‡æ© ‡Ω∫",
      "‡Ωê ‡Ω¥‡ΩÇ",
      "‡Ωñ‡Ω¶ ‡æü",
      "‡Ωñ‡Ω¶ ‡æí",
      "‡Ω† ‡ΩÜ",
      "‡Ωö ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω¢ ‡æ°",
      "‡ΩÄ ‡Ω¥‡Ωì",
      "‡Ω†‡Ωñ‡æ≤ ‡Ω¶",
      "‡ΩÅ ‡Ωº‡ΩÑ",
      "‡Ω¢ ‡æ≥",
      "‡ΩÜ ‡Ω¥‡ΩÑ",
      "‡Ω† ‡ΩÇ‡æ±‡Ω¥‡Ω¢",
      "‡Ω°‡Ω≤ ‡Ωë",
      "‡ΩÇ‡Ωû ‡Ω≤",
      "‡Ω£ ‡æî",
      "‡Ω¶‡æ§‡æ± ‡Ωº‡Ωë",
      "‡Ω¢ ‡æê‡æ±",
      "‡Ω¢ ‡Ω≤‡ΩÇ",
      "‡Ω¢‡Ω≤ ‡Ωò",
      "‡Ωö ‡Ω¥‡Ω£",
      "‡Ω¢‡æü ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ωö ‡Ωë",
      "‡Ω§ ‡Ω≤",
      "‡Ω¢‡Ω≤ ‡ΩÇ‡Ω¶",
      "‡Ωò ‡Ωº",
      "‡Ωö ‡Ω¢",
      "‡ΩÇ ‡Ωâ",
      "‡ΩÇ ‡Ωì",
      "‡æ±‡Ω≤ ‡Ωì",
      "‡Ω£ ‡ΩÇ",
      "‡ΩÇ‡Ωè ‡Ωº‡ΩÇ‡Ω¶",
      "‡ΩÇ ‡Ω£",
      "‡æ± ‡Ωº‡ΩÑ",
      "‡Ωñ‡Ωû ‡Ω≤",
      "‡Ω£‡æ° ‡Ωì",
      "‡ΩÜ ‡Ω¥",
      "‡Ω¶‡æü ‡Ω∫",
      "‡Ωò‡Ωö ‡Ωì",
      "‡Ωö ‡Ω¥‡Ω¢",
      "‡ΩÇ ‡æ≥",
      "‡Ω≤ ‡ΩÇ‡Ω¶",
      "‡Ωû ‡Ω∫‡Ω¶",
      "‡Ωò ‡Ω†‡Ω≤",
      "‡Ωë ‡Ωº",
      "‡ΩÇ‡Ωü ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωë‡Ω∫ ‡Ω¶",
      "‡Ωñ‡Ω¶‡æê‡æ± ‡Ω∫‡Ωë",
      "‡Ω¢ ‡Ω≤‡ΩÑ",
      "‡Ω∫ ‡Ωñ‡Ω¶",
      "‡Ω≤ ‡Ωë",
      "‡Ω¢‡æ£ ‡Ωò‡Ω¶",
      "‡Ω° ‡Ω¥‡Ω£",
      "‡Ω† ‡Ωï",
      "‡Ω¶‡æê ‡Ωë",
      "‡Ω°‡Ω≤ ‡Ω¶",
      "‡Ω†‡Ωë‡Ω≤ ‡Ω†‡Ω≤",
      "‡Ω† ‡Ωñ‡æ±‡Ω¥‡ΩÑ",
      "‡Ω¶‡æí‡æ≤ ‡Ω¥‡Ωñ",
      "‡Ωî‡Ωº ‡Ω†‡Ω≤",
      "‡Ωò‡ΩÜ ‡Ωº‡ΩÇ",
      "‡Ωö ‡Ω≤‡ΩÇ",
      "‡Ωë‡æ≤ ‡Ω¥‡ΩÇ",
      "‡Ω¢‡æ´ ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω¶ ‡ΩÑ",
      "‡Ω†‡Ωñ‡æ≤ ‡Ω∫‡Ω£",
      "‡Ωï‡æ± ‡ΩÇ",
      "‡Ω¢‡æó ‡Ω∫‡Ω¶",
      "‡Ω†‡Ωá ‡Ω≤‡ΩÇ",
      "‡Ωö‡Ωº ‡Ω†‡Ω≤",
      "‡ΩÅ‡æ± ‡Ωº‡Ωë",
      "‡Ω§ ‡Ω¶",
      "‡Ω¶‡æî ‡ΩÇ‡Ω¶",
      "‡Ωñ‡Ω¢‡æí‡æ± ‡Ωñ",
      "‡ΩÇ ‡Ωô",
      "‡Ωë ‡Ωë",
      "‡ΩÇ‡Ωâ ‡Ω∫‡Ωì",
      "‡Ωò ‡Ω≤‡Ωì",
      "‡ΩÅ ‡Ωò‡Ω¶",
      "‡ΩÇ‡æ≤ ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω§ ‡Ω¢",
      "‡ΩÇ‡Ωè ‡Ωº‡ΩÑ",
      "‡Ωû ‡Ω≤‡ΩÑ",
      "‡ΩÑ ‡Ωì",
      "‡Ωê ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωê ‡Ωñ‡Ω¶",
      "‡Ω†‡Ωá ‡Ω¥‡ΩÇ",
      "‡Ω†‡Ωë ‡Ω¶",
      "‡Ω¶‡æ≥ ‡Ωº‡Ωñ",
      "‡Ωò‡ΩÅ ‡Ωì",
      "‡Ω¶‡æ§ ‡æ±‡Ω≤",
      "‡Ωò ‡Ω∫",
      "‡ΩÇ‡Ωô ‡Ωº",
      "‡Ωñ ‡Ω£‡æü",
      "‡Ωö ‡Ωº‡Ω¢",
      "‡Ω¶‡æê‡æ± ‡Ωº‡Ωì",
      "‡Ωê ‡ΩÇ",
      "‡Ω¶‡æí ‡Ωº",
      "‡Ω¢‡æí‡æ±‡Ω¥ ‡Ωì",
      "‡ΩÅ‡æ± ‡Ωñ",
      "‡Ωê ‡Ω∫",
      "‡ΩÅ‡æ± ‡Ωë",
      "‡ΩÇ‡Ωì ‡Ωº‡Ωë",
      "‡Ωò ‡Ωâ",
      "‡Ω° ‡Ω¢",
      "‡Ωñ‡Ω¶‡æü ‡Ωì",
      "‡Ωñ ‡Ωô",
      "‡Ωï‡æ± ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ωè ‡Ωº‡ΩÇ",
      "‡Ω¢ ‡Ω¶",
      "‡ΩÇ ‡Ωë",
      "‡Ωò‡ΩÅ ‡Ω†",
      "‡Ωë‡ΩÄ ‡Ω†",
      "‡Ωò‡Ωê ‡Ω†",
      "‡Ω¢‡æü ‡ΩÇ",
      "‡ΩÇ ‡Ω∫",
      "‡Ω¢‡æí‡æ± ‡ΩÇ",
      "‡ΩÇ ‡Ω§",
      "‡Ωï‡æ±‡Ω≤ ‡Ωì",
      "‡Ωñ‡Ωë‡Ω∫ ‡Ωì",
      "‡Ω† ‡Ωï‡æ≤",
      "‡Ωò‡Ωâ ‡Ωò",
      "‡Ω¶‡æê‡æ± ‡Ωñ‡Ω¶",
      "‡Ωû ‡Ω≤",
      "‡Ω†‡ΩÅ ‡æ≤",
      "‡Ωñ‡Ωü ‡Ω¥‡ΩÑ",
      "‡ΩÜ ‡Ωë",
      "‡ΩÇ ‡Ω¥",
      "‡Ω¢‡æ≥ ‡Ω¥‡ΩÑ",
      "‡Ω¶‡æî ‡Ωº‡Ωì",
      "‡Ωñ‡Ωü ‡ΩÑ",
      "‡Ω† ‡ΩÑ",
      "‡Ω¢‡æ° ‡Ωº",
      "‡Ωò‡Ω≤ ‡ΩÇ‡Ω¶",
      "‡Ωë ‡Ωò‡Ω≤‡ΩÇ‡Ω¶",
      "‡Ωñ‡Ω¶ ‡Ωò‡Ω¶",
      "‡ΩÑ ‡Ω∫‡Ω¶",
      "‡Ωñ‡æ≤ ‡Ω£",
      "‡Ωò‡ΩÜ ‡Ωº‡Ωë",
      "‡Ωò ‡Ωë",
      "‡Ω¶‡æ≤ ‡Ω≤‡Ωë",
      "‡ΩÇ‡æ≤ ‡Ωº‡Ω£",
      "‡Ωñ‡ΩÖ ‡Ω¥",
      "‡Ωò‡Ωê ‡Ω¢",
      "‡ΩÑ ‡ΩÇ",
      "‡Ω†‡ΩÜ ‡Ω≤",
      "‡Ωï‡æ±‡Ω≤ ‡Ω¢",
      "‡Ω≤ ‡Ω¶",
      "‡Ω¢ ‡Ω≤‡Ωì",
      "‡Ω† ‡Ωº‡Ωë",
      "‡ΩÅ‡æ± ‡Ω∫‡Ωë",
      "‡Ωñ‡ΩÄ ‡Ω†",
      "‡Ωë‡ΩÇ ‡Ω†",
      "‡Ω† ‡ΩÇ",
      "‡Ω¶‡æí ‡ΩÑ",
      "‡Ω† ‡Ωº‡ΩÑ",
      "‡Ωò ‡æ±‡Ωº‡ΩÑ",
      "‡Ω¶‡æ¶ ‡æ±",
      "‡Ωê ‡Ω¢",
      "‡Ω∫ ‡ΩÇ‡Ω¶",
      "‡Ωº ‡Ωñ‡Ω¶",
      "‡Ωè ‡Ω∫",
      "‡Ωö ‡Ωº‡Ωë",
      "‡Ωò‡Ωö ‡Ωò‡Ω¶",
      "‡Ω¶‡æ¶ ‡æ±‡Ω≤‡Ωì",
      "‡ΩÑ ‡Ω∫",
      "‡Ω£ ‡Ω¥‡ΩÑ",
      "‡Ω¢‡æê‡æ± ‡Ω∫‡Ωì",
      "‡Ω¶‡æü ‡ΩÑ‡Ω¶",
      "‡Ωñ‡Ω¶‡æí ‡æ≤",
      "‡Ω† ‡Ωñ‡æ±",
      "‡Ωñ ‡ΩÇ",
      "‡Ω¢ ‡Ω¥",
      "‡Ω¶‡æ≥ ‡Ω∫‡Ωñ‡Ω¶",
      "‡ΩÑ ‡Ωº‡Ω¶",
      "‡Ω£‡æ∑ ‡ΩÇ",
      "‡Ω¶‡æ® ‡Ωº‡Ωì",
      "‡Ωá ‡Ω≤",
      "‡Ωñ ‡Ω§‡Ω∫‡Ω¶",
      "‡Ωü ‡ΩÇ",
      "‡Ωò ‡Ωõ",
      "‡Ω£ ‡Ωº‡ΩÑ‡Ω¶",
      "‡Ω§ ‡Ωº‡ΩÇ",
      "‡Ωë ‡Ωñ‡Ω¥",
      "‡ΩÇ‡æ≥ ‡Ω≤‡ΩÑ",
      "‡ΩÖ ‡Ω≤",
      "‡Ωë‡Ωî ‡æ±",
      "‡Ωñ‡Ω¶ ‡ΩÇ‡Ω¶",
      "‡Ω†‡Ωñ‡æ± ‡Ωº‡Ω¢",
      "‡Ω¶‡æü‡Ω∫ ‡ΩÑ",
      "‡ΩÇ ‡ΩÖ",
      "‡ΩÜ ‡Ωº‡ΩÇ",
      "‡Ω£ ‡Ωº‡ΩÇ",
      "‡ΩÇ‡Ω¶ ‡ΩÑ",
      "‡ΩÅ‡æ≤ ‡Ω≤",
      "‡Ωë‡Ω∫ ‡Ω¢",
      "‡Ωî ‡Ω¥",
      "‡ΩÅ ‡ΩÑ",
      "‡Ωô ‡Ω≤",
      "‡Ω†‡ΩÇ‡æ≤ ‡Ω≤‡ΩÇ",
      "‡Ω¶‡æ° ‡Ωº‡Ωò",
      "‡Ωâ ‡Ω≤‡Ωì",
      "‡Ωñ‡Ω£‡æü ‡Ω¶",
      "‡Ωü ‡Ωº‡Ω£",
      "‡Ωò ‡æ±‡Ω≤",
      "‡ΩÇ‡æ≤ ‡ΩÑ‡Ω¶",
      "‡Ωñ‡Ω¢ ‡æó",
      "‡Ω¶‡æü ‡Ωº‡Ωì",
      "‡Ω¢ ‡Ωº‡Ω£",
      "‡Ω¢ ‡æ®",
      "‡Ωë‡Ω¥ ‡Ωì",
      "‡Ωñ‡Ωü ‡Ωº",
      "‡ΩÇ‡Ω§ ‡Ω∫‡ΩÇ‡Ω¶",
      "‡Ω¶‡æ§‡æ≤ ‡Ω¥‡Ω£",
      "‡ΩÅ ‡ΩÇ",
      "‡Ωï ‡Ω¥‡ΩÑ",
      "‡Ω¢‡æê‡æ± ‡ΩÑ",
      "‡Ω¶‡æ≤ ‡Ωº‡ΩÇ",
      "‡Ωñ‡Ω¶ ‡æ≥",
      "‡ΩÇ‡Ωè ‡ΩÑ",
      "‡Ω¢ ‡æí",
      "‡Ωñ‡Ω¥ ‡Ω†‡Ω≤",
      "‡ΩÇ ‡Ωº‡ΩÑ",
      "‡Ω† ‡ΩÅ‡æ±",
      "‡ΩÖ ‡ΩÑ",
      "‡ΩÄ‡Ω≤ ‡Ω¶",
      "‡Ωë‡æ≤ ‡ΩÇ",
      "‡Ωß ‡Ω±",
      "‡Ω¶‡æ° ‡Ω≤‡ΩÇ",
      "‡Ωñ‡ΩÖ ‡Ω¶",
      "‡Ωü ‡Ωë",
      "‡Ω¶‡æü ‡Ωº‡Ωñ‡Ω¶",
      "‡ΩÄ ‡æ≥",
      "‡ΩÜ ‡Ω∫‡Ω¶",
      "‡Ωë ‡Ωñ‡æ±",
      "‡Ω¢ ‡æô",
      "‡ΩÇ‡Ω¶ ‡Ωº‡Ω£",
      "‡Ωò‡Ωë ‡Ωº‡Ω¢",
      "‡Ω¢‡æü ‡ΩÇ‡Ω¶",
      "‡Ω§ ‡ΩÇ",
      "‡Ω¢‡æí‡æ± ‡Ωº",
      "‡Ωë‡ΩÄ ‡Ωº‡Ωì",
      "‡Ω†‡Ωõ ‡Ωò",
      "‡Ωò‡ΩÑ ‡Ωº‡Ωì",
      "‡Ω¶‡æê ‡Ωº‡Ω¢",
      "‡Ωû ‡Ω¥",
      "‡ΩÇ ‡Ωì‡ΩÑ",
      "‡Ωò‡Ωê ‡Ω¥‡Ωì",
      "‡Ω¶‡Ωº ‡Ω†‡Ω≤",
      "‡ΩÅ‡æ≤ ‡Ωº",
      "‡æ±‡Ω≤ ‡Ωë",
      "‡Ωß‡Ω± ‡Ω¥",
      "‡Ωñ‡Ω¶ ‡Ωº‡Ωë",
      "‡Ωñ‡ΩÖ ‡Ωº‡Ωò",
      "‡Ωë‡Ω¥ ‡Ωë",
      "‡Ω® ‡Ω¥‡Ωò",
      "‡Ωñ ‡æ±‡Ω≤‡Ωì",
      "‡Ωº ‡Ωò‡Ω¶",
      "‡Ωñ‡Ωè ‡Ωñ",
      "‡Ωñ‡Ω¢‡æü ‡ΩÇ",
      "‡Ωû ‡Ω¥‡Ω¶",
      "‡Ωñ‡Ωü ‡Ωº‡Ωë",
      "‡Ωò ‡ΩÇ",
      "‡Ω¢ ‡æ´‡Ω¶",
      "‡Ω§ ‡Ω≤‡ΩÑ",
      "‡Ωì ‡Ω¥‡Ω¶",
      "‡Ωï ‡Ω£",
      "‡Ω¶‡æê ‡æ±‡Ω≤‡Ωë",
      "‡Ω¶‡æ¶ ‡æ±‡Ωº‡ΩÑ",
      "‡Ω†‡Ωë ‡Ω¥‡Ω¶",
      "‡Ωì ‡Ω†‡ΩÑ",
      "‡Ωë‡ΩÇ ‡æ≤",
      "‡Ω¢ ‡Ωº",
      "‡Ωì ‡Ωº‡Ω¢",
      "‡Ω¥‡ΩÑ ‡Ω¶",
      "‡Ωò‡Ωö ‡Ωº‡Ωì",
      "‡Ωñ‡Ω¢‡æí‡æ± ‡Ωë",
      "‡Ωì ‡Ωò‡Ω¶",
      "‡Ω¶‡æ° ‡Ωº‡Ωë",
      "‡Ωò ‡ΩÇ‡Ωº",
      "‡Ωñ‡Ω¶‡æí ‡Ωº‡Ωò",
      "‡Ωï‡æ≤ ‡ΩÇ",
      "‡Ωò ‡Ω≤‡ΩÇ",
      "‡Ωñ‡Ωè ‡ΩÇ‡Ω¶",
      "‡Ω¶‡æ≤ ‡Ω¶",
      "‡Ωë‡Ωî‡æ± ‡Ωë",
      "‡ΩÇ ‡Ω°",
      "‡ΩÜ ‡Ω∫‡Ω¢",
      "‡Ω¶‡æ§‡æ±‡Ω≤ ‡Ω¢",
      "‡Ω† ‡Ωñ",
      "‡Ωë‡æ≤ ‡Ω≤",
      "‡Ω£ ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω† ‡Ωö",
      "‡Ωò ‡ΩÅ‡æ±",
      "‡Ω§ ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ω† ‡Ωñ‡Ω¥",
      "‡Ωê‡Ω∫ ‡ΩÇ",
      "‡Ωò‡Ω≤ ‡ΩÑ",
      "‡Ωò‡ΩÅ‡æ± ‡Ω∫‡Ωì",
      "‡Ωû ‡Ω∫‡Ωì",
      "‡ΩÇ ‡Ω¥‡Ω¶",
      "‡ΩÇ‡Ωâ‡Ω≤ ‡Ωë",
      "‡Ω¶‡æ® ‡Ω≤‡Ωì",
      "‡Ω¢‡æ≥ ‡Ωñ‡Ω¶",
      "‡Ωñ‡Ω¢‡æó ‡Ωº‡Ωë",
      "‡Ωò‡ΩÇ ‡Ωº‡Ωì",
      "‡Ωï ‡Ω¥‡Ω£",
      "‡Ωñ ‡Ωë‡Ω¥‡Ωì",
      "‡Ω¶ ‡æí‡æ±",
      "‡Ω¢ ‡Ω¥‡ΩÑ",
      "‡Ω¶ ‡Ω¥‡Ωò",
      "‡Ωë ‡Ω±",
      "‡Ωñ‡Ω¶‡æ° ‡Ω¥‡Ω¶",
      "‡ºç ‡ºç",
      "‡Ω† ‡Ωê",
      "‡ΩÇ‡Ωü ‡Ω≤‡ΩÇ‡Ω¶",
      "‡ΩÜ ‡Ω∫‡Ωë",
      "‡Ωñ‡æ≥ ‡ΩÑ‡Ω¶",
      "‡æ± ‡Ω£",
      "‡Ω£ ‡ΩÇ‡Ω¶",
      "‡Ωü ‡Ω≤‡Ωì",
      "‡Ω†‡Ωá ‡Ω≤‡ΩÇ‡Ω¶",
      "‡Ωë‡Ωò ‡æ±‡Ω£",
      "‡Ωü ‡æ≥",
      "‡Ω≤ ‡Ωñ",
      "‡Ωñ ‡Ωº‡Ωë",
      "‡ΩÅ‡æ≤‡Ω≤ ‡Ωò‡Ω¶",
      "‡Ω†‡Ωë ‡Ω¥‡Ωì",
      "‡ΩÇ ‡Ω¢",
      "‡Ωâ ‡Ω∫‡Ω¶",
      "‡ΩÇ‡Ωè ‡Ωì",
      "‡Ωò ‡Ωë‡Ωº",
      "‡Ωò‡ΩÅ ‡Ω¶",
      "‡Ω¢ ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ωñ ‡Ω¢‡æí‡æ±‡Ω¥‡Ωë",
      "‡Ωñ‡Ω¶ ‡æê",
      "‡Ωñ‡Ω¶‡æí‡æ≤ ‡Ω¥‡Ωñ",
      "‡Ωë‡ΩÄ ‡Ω¢",
      "‡Ω†‡Ωë ‡Ω¥",
      "‡ΩÇ‡Ωû ‡Ω¥‡ΩÑ",
      "‡Ωü ‡Ω¥‡ΩÑ",
      "‡ΩÇ ‡Ωô‡ΩÑ",
      "‡Ω¢‡æ£ ‡Ω£",
      "‡Ω¶‡æ§‡æ≤ ‡Ωë",
      "‡Ω¶‡æ§ ‡ΩÑ",
      "‡Ωñ‡Ωû ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωñ ‡ΩÖ‡Ωë",
      "‡ΩÜ ‡Ωº‡Ωë",
      "‡Ωë‡ΩÇ ‡Ωº‡ΩÑ‡Ω¶",
      "‡Ω≤ ‡Ωò",
      "‡Ωë ‡Ωº‡ΩÇ",
      "‡Ωò‡Ωõ ‡Ωë",
      "‡Ωâ ‡Ωì",
      "‡Ωê ‡Ωº‡Ω¶",
      "‡Ωë ‡Ω¥‡ΩÇ",
      "‡Ωë ‡æ≠",
      "‡Ωö‡Ωº ‡Ωò",
      "‡Ωë‡Ωî ‡ΩÇ",
      "‡Ω† ‡Ωë‡Ω∫",
      "‡Ω¶‡æí‡æ≤ ‡Ω≤‡Ωñ",
      "‡ΩÑ ‡Ω£",
      "‡Ω† ‡Ωº‡ΩÇ",
      "‡Ω†‡ΩÅ‡æ≤ ‡Ω¥‡Ω£",
      "‡Ω¶ ‡Ωº‡Ω¶",
      "‡Ω¢‡æô ‡Ω∫‡Ωë",
      "‡Ωñ‡Ω¶‡æü ‡Ω∫‡Ωì",
      "‡Ω¢‡æí ‡Ωì",
      "‡ΩÑ ‡ΩÑ",
      "‡Ω¶‡æ¶‡æ± ‡Ωº‡Ω¢",
      "‡Ω†‡ΩÇ‡æ≤ ‡Ω¥‡Ωñ",
      "‡Ω¶‡æí‡æ± ‡Ω¥",
      "‡Ω£ ‡Ωì",
      "‡Ωñ‡Ω¶‡æ≥ ‡Ωñ",
      "‡Ω† ‡Ωº‡ΩÑ‡Ω¶",
      "‡Ω£‡Ω∫ ‡ΩÇ‡Ω¶",
      "‡ΩÑ ‡Ω†‡Ω≤",
      "‡Ω†‡ΩÇ‡æ≤ ‡Ω∫‡Ω£",
      "‡Ω¢ ‡ΩÇ",
      "‡Ω§ ‡Ωº‡Ω¢",
      "‡Ωñ‡Ω¶ ‡æí‡æ±",
      "‡Ωò‡Ωê ‡Ωº",
      "‡Ωì ‡ΩÇ",
      "‡Ωñ‡Ω¥ ‡Ω¢",
      "‡Ω¶‡æ§‡æ± ‡Ωì",
      "‡Ωñ‡Ω¶‡æí‡æ± ‡Ω¥‡Ω¢",
      "‡Ωê ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω®‡Ωº ‡Ωæ",
      "‡Ωñ‡Ω¶‡æî ‡Ωº",
      "‡Ωò ‡Ωº‡Ω¶",
      "‡Ω¥ ‡Ωë",
      "‡Ω†‡Ωö ‡Ω£",
      "‡Ω°‡Ωº‡ΩÑ ‡Ω¶",
      "‡Ω†‡Ωë‡Ω∫ ‡Ωñ‡Ω¶",
      "‡Ω¢‡æ® ‡Ω≤",
      "‡Ω° ‡Ωì",
      "‡Ω¢ ‡æ¶",
      "‡Ωñ‡Ω¢‡æí‡æ± ‡ΩÇ",
      "‡ΩÅ ‡æ±‡Ω≤",
      "‡Ωñ‡Ωü ‡Ωº‡Ω¶",
      "‡ΩÇ‡ΩÖ ‡Ω∫‡Ω¶",
      "‡Ω¢‡æ´ ‡Ωº‡Ωñ",
      "‡Ωë‡æ≤ ‡Ω≤‡Ωì",
      "‡Ωé ‡Ω≤",
      "‡Ω¢ ‡Ωñ‡Ω¶",
      "‡Ω¢‡æ© ‡Ω∫",
      "‡Ω†‡Ωï ‡ΩÇ‡Ω¶",
      "‡Ωï ‡Ω∫‡Ωñ‡Ω¶",
      "‡Ωö ‡Ωì",
      "‡Ω†‡Ωë‡Ω≤ ‡Ω¢",
      "‡Ω® ‡Ω¥",
      "‡Ωë‡Ωî ‡Ω†",
      "‡Ω¶‡æ° ‡Ω∫",
      "‡Ωò ‡æ±",
      "‡Ω¶‡æ≥ ‡Ωº‡ΩÑ",
      "‡Ω§ ‡Ωº‡Ω¶",
      "‡Ωî ‡Ωë",
      "‡Ωû ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ω¢ ‡æê",
      "‡Ω†‡Ωá ‡Ω†",
      "‡Ωò ‡Ωì",
      "‡ΩÅ ‡Ω¶",
      "‡Ω¶‡æ≤ ‡Ω¥‡ΩÑ",
      "‡Ωì ‡Ωë",
      "‡Ωâ ‡Ω∫",
      "‡Ωë‡Ωî ‡Ω£",
      "‡ΩÇ ‡Ω±",
      "‡Ωò ‡Ω¥‡ΩÇ",
      "‡Ωò ‡Ωö‡Ωº",
      "‡ΩÇ‡æ≤ ‡ΩÇ‡Ω¶",
      "‡Ω† ‡Ω¥",
      "‡Ωñ‡Ω¢ ‡æ≥",
      "‡Ωè ‡ΩÇ",
      "‡Ω¢‡æ° ‡Ωº‡ΩÇ",
      "‡Ωê ‡Ω£",
      "‡ΩÇ‡ΩÖ ‡Ωº‡Ωë",
      "‡ΩÇ‡Ωè ‡Ω≤",
      "‡Ωñ‡Ωô ‡Ω¥‡Ωì",
      "‡Ω¶‡æ® ‡Ωì",
      "‡Ω†‡Ωï ‡ΩÑ",
      "‡Ωê ‡ΩÑ",
      "‡Ω¢‡æí‡æ± ‡Ωñ",
      "‡Ωë ‡ΩÄ‡æ±‡Ω≤",
      "‡Ωë‡ΩÄ‡æ±‡Ω≤ ‡Ω£",
      "‡Ωñ‡Ω¢‡æ© ‡Ωº‡Ωì",
      "‡Ωë ‡Ω¥‡ΩÑ",
      "‡Ωñ ‡Ωº‡Ωì",
      "‡Ω® ‡ΩÑ",
      "‡ΩÇ‡Ωû ‡Ω¥‡ΩÇ",
      "‡Ωñ‡æ± ‡Ω∫",
      "‡Ωß‡Ω±‡Ω¥ ‡æÉ",
      "‡Ωß‡Ω±‡Ω¥ ‡Ωæ",
      "‡Ωñ‡Ω¢‡æü ‡Ωì",
      "‡Ω¢‡æ¶ ‡Ωë",
      "‡Ωî‡Ωº ‡Ω¢",
      "‡Ω£‡æü ‡Ωº‡Ω¶",
      "‡Ωò ‡Ωá",
      "‡ΩÅ ‡Ωº‡Ω¶",
      "‡Ω¶‡æ¶‡æ± ‡Ω¢",
      "‡Ω†‡Ωë‡Ω≤ ‡Ω¶",
      "‡Ωñ‡Ω¶ ‡Ωë",
      "‡Ω£‡æ° ‡Ωº‡ΩÇ",
      "‡Ωü ‡Ωñ",
      "‡Ω†‡ΩÅ‡æ± ‡ΩÇ",
      "‡Ω∫ ‡Ωñ",
      "‡Ωñ‡ΩÄ ‡æ≤",
      "‡Ω∫ ‡ΩÑ",
      "‡Ω¢‡Ω≤ ‡Ω¶",
      "‡Ωê ‡Ω¥‡Ωì",
      "‡Ωî ‡Ωº‡Ω¶",
      "‡Ωñ‡Ω¶ ‡æ≤",
      "‡Ω†‡ΩÜ ‡Ω¢",
      "‡Ωñ‡ΩÖ ‡Ωº‡Ω¶",
      "‡ΩÇ‡Ωº ‡Ωò‡Ω¶",
      "‡Ω§‡Ω≤ ‡Ω¶",
      "‡ΩÅ ‡Ωñ",
      "‡ΩÇ ‡Ω°‡Ω∫",
      "‡Ωñ‡Ω¶‡æí ‡Ωº‡Ωò‡Ω¶",
      "‡Ωñ‡Ωº ‡Ω†‡Ω≤",
      "‡Ω¶‡æ§‡æ≤ ‡Ωº‡Ω¶",
      "‡Ω†‡Ωá ‡Ωº‡ΩÇ",
      "‡Ωß ‡Ω∫",
      "‡Ω†‡Ωë ‡Ω¥‡Ω£",
      "‡ΩÜ ‡Ωº",
      "‡ΩÖ ‡Ω∫‡Ω¶",
      "‡Ω¶ ‡Ωº‡Ω¢",
      "‡Ω†‡Ωñ ‡Ωë",
      "‡ΩÅ‡æ± ‡Ωº‡Ωì",
      "‡Ωû ‡Ωñ‡Ω¶",
      "‡Ω¶‡æ¶ ‡æ±‡ΩÑ",
      "‡Ω¶ ‡Ω†‡Ω≤",
      "‡ΩÇ‡æ≤ ‡æ≠",
      "‡Ωò ‡Ω£",
      "‡Ω†‡Ωï ‡Ωº",
      "‡Ω†‡ΩÇ ‡ΩÇ",
      "‡Ωö ‡Ω¥‡Ωì",
      "‡ΩÑ ‡Ω∫‡Ωë",
      "‡Ω¢‡æ© ‡Ωº‡Ω£",
      "‡Ω£ ‡ΩÑ‡Ω¶",
      "‡Ω¶‡æ§‡æ≤ ‡Ωº‡Ωë",
      "‡ΩÇ‡æ≤ ‡Ω¶",
      "‡Ω¢‡æí‡æ±‡Ω¥ ‡Ω¶",
      "‡Ω° ‡Ω¥‡Ωì",
      "‡Ω¶ ‡Ω≤",
      "‡Ω† ‡Ωö‡Ωº",
      "‡Ωâ ‡Ω¥‡ΩÑ",
      "‡ΩÄ‡æ≥ ‡Ωº‡ΩÇ",
      "‡Ω¶‡æí‡æ≤ ‡Ωº‡Ω£",
      "‡Ωè ‡Ω≤‡ΩÑ",
      "‡Ω¶‡æô ‡Ωò",
      "‡ΩÖ ‡Ω≤‡ΩÑ",
      "‡Ω†‡Ωë ‡Ωº‡Ωì",
      "‡Ωñ‡Ω¢‡æ≥ ‡ΩÇ",
      "‡Ω¶‡æê‡æ± ‡Ωº‡Ωñ",
      "‡ΩÖ ‡Ω¥",
      "‡Ω¶‡æô ‡Ωì",
      "‡Ωö ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωë ‡Ωî‡Ωº",
      "‡Ωè ‡Ω≤‡ΩÇ",
      "‡Ωñ ‡Ω∫",
      "‡Ωë‡Ωñ ‡æ±‡Ω≤",
      "‡Ω† ‡ΩÇ‡Ω£",
      "‡Ωñ‡Ωü ‡æ≥",
      "‡Ω¶‡æ£ ‡Ωº‡Ωë",
      "‡Ω¶ ‡Ωë",
      "‡ΩÅ‡æ±‡Ω≤ ‡Ωò",
      "‡Ωñ‡Ω§ ‡ΩÇ‡Ω¶",
      "‡Ωë‡Ωî‡Ωº ‡Ωì",
      "‡Ω† ‡Ωò",
      "‡Ω¶‡Ω∫ ‡Ω£",
      "‡Ω¶‡æ® ‡æ≤",
      "‡Ωû ‡Ω£",
      "‡Ω¶‡æ≥ ‡Ω∫‡Ωñ",
      "‡Ω¶‡æê‡æ±‡Ω∫ ‡Ω£",
      "‡Ω¢‡æ° ‡Ω¥‡Ω£",
      "‡Ωë‡Ωñ‡æ± ‡Ω∫",
      "‡Ω§ ‡Ω¥",
      "‡Ω¢‡æó‡Ω∫ ‡Ω†‡Ω≤",
      "‡Ωë‡æ≠ ‡ΩÇ‡Ω¶",
      "‡Ω° ‡Ω≤‡ΩÇ",
      "‡Ωë‡Ωî‡æ± ‡Ωº‡Ωë",
      "‡ΩÅ ‡Ωº‡ΩÇ",
      "‡Ωñ ‡Ωº‡Ω¢",
      "‡ΩÅ‡æ≤ ‡Ω≤‡Ωë",
      "‡Ω†‡ΩÇ ‡Ωº‡ΩÇ",
      "‡Ω¶‡æî ‡Ω¢",
      "‡ΩÇ‡Ωë ‡Ωò‡Ω¶",
      "‡Ω¶‡æ§ ‡ΩÑ‡Ω¶",
      "‡Ωî ‡Ω∫",
      "‡Ωò ‡Ωº‡ΩÑ",
      "‡Ω¢ ‡ΩÇ‡Ω¶",
      "‡Ωë‡ΩÇ ‡Ω¥",
      "‡Ωë‡Ωñ‡æ± ‡Ω∫‡Ω¢",
      "‡Ω†‡ΩÇ‡æ≤‡Ωº ‡Ω†‡Ω≤",
      "‡Ω§ ‡Ω≤‡Ωì",
      "‡Ωë‡ΩÇ ‡ΩÇ",
      "‡ΩÇ‡Ωè ‡Ωº‡Ω¢",
      "‡Ω¶‡æê ‡Ω¥‡Ω¢",
      "‡Ωë ‡Ω£",
      "‡Ωë‡Ωñ ‡Ω∫‡Ωì",
      "‡Ωî‡Ωë ‡æ®",
      "‡ΩÇ‡Ωì ‡Ωë",
      "‡Ωë‡æ≤ ‡ΩÑ",
      "‡Ω£ ‡æï",
      "‡Ωñ‡Ω¥ ‡Ωò",
      "‡Ω†‡Ωï‡æ≤ ‡Ωº",
      "‡Ω†‡Ωï ‡Ω∫‡Ω£",
      "‡Ω†‡Ωñ‡Ω¥ ‡Ω£",
      "‡ΩÇ‡Ω¶ ‡Ω¢",
      "‡Ω¢‡æí‡æ± ‡Ωì",
      "‡Ωñ‡Ω¶ ‡æô",
      "‡ºã ‡ºã",
      "‡ΩÇ‡Ω≤ ‡Ωì",
      "‡Ωï ‡Ωº",
      "‡Ω° ‡Ωñ",
      "‡Ω≤ ‡Ω£",
      "‡ΩÖ ‡ΩÇ",
      "‡Ω£‡æü ‡Ω†‡Ω≤",
      "‡Ω¶ ‡Ω∫‡Ω¢",
      "‡Ω†‡Ωï‡æ≤ ‡Ωë",
      "‡Ωñ‡ΩÄ ‡Ωº‡Ωë",
      "‡Ωñ‡Ωô ‡Ω¥‡ΩÇ‡Ω¶",
      "‡Ωò ‡Ωë‡Ω¥‡Ωì",
      "‡ΩÇ‡Ωü ‡Ω¥‡ΩÑ",
      "‡Ωñ‡æ≥ ‡ΩÑ",
      "‡Ωê ‡Ω≤‡Ωò",
      "‡Ωï ‡Ω¥‡Ωì",
      "‡ΩÇ‡Ω≤ ‡Ω¢",
      "‡Ω¶‡æí‡æ≤ ‡Ω≤‡ΩÇ",
      "‡Ωñ‡Ω¢ ‡æ°",
      "‡Ω¶‡æê ‡æ≤",
      "‡Ω° ‡Ω¶",
      "‡Ω¶‡æ§ ‡Ωº‡ΩÑ",
      "‡Ωò ‡Ω¥",
      "‡ΩÇ‡æ≤ ‡Ω¥",
      "‡Ω†‡ΩÅ‡æ± ‡Ω∫‡Ω¢",
      "‡Ωê ‡Ω≤‡ΩÇ",
      "‡Ω¢ ‡Ωº‡Ωò",
      "‡ΩÇ‡Ω¶ ‡Ωº",
      "‡Ωò‡Ωë ‡Ωº‡ΩÇ",
      "‡Ωê ‡Ω¥‡ΩÑ",
      "‡Ω¶ ‡Ω¥‡Ω¶",
      "‡Ωñ ‡Ω±",
      "‡Ω¢‡æ´ ‡Ωº",
      "‡Ωë‡Ωñ‡æ±‡Ω≤ ‡ΩÑ‡Ω¶",
      "‡Ω†‡ΩÅ‡æ≤ ‡Ω¥‡ΩÇ",
      "‡Ω¢‡æ© ‡Ω≤",
      "‡Ω†‡ΩÇ‡æ≤ ‡Ω¥‡Ω¶",
      "‡Ωë‡Ωò ‡Ω¢",
      "‡ΩÄ ‡æ±",
      "‡Ωñ‡æ≥‡Ωº ‡Ω†‡Ω≤",
      "‡Ωì ‡Ωº",
      "‡ΩÅ ‡Ω¥‡ΩÇ",
      "‡ΩÖ ‡Ω¥‡ΩÑ",
      "‡ΩÇ‡Ωë ‡Ω†",
      "‡Ωò‡ΩÅ ‡Ω†‡Ω≤",
      "‡ΩÇ‡Ωû ‡ΩÇ",
      "‡ΩÅ ‡Ω∫",
      "‡Ω¶‡æê‡æ± ‡Ω∫‡Ωë",
      "‡Ω° ‡Ω¥‡Ωò",
      "‡ΩÅ‡æ± ‡Ω∫‡Ω¢",
      "‡Ωñ‡Ω¶ ‡ΩÇ",
      "‡ΩÇ‡æ≥ ‡Ωº",
      "‡Ω£ ‡ΩÑ",
      "‡Ωë‡æ≤ ‡ΩÇ‡Ω¶",
      "‡Ω£‡æ∑ ‡Ωº‡Ωë",
      "‡Ω£ ‡Ωº‡ΩÑ",
      "‡Ωñ‡ΩÄ ‡ΩÇ",
      "‡ΩÜ ‡ΩÇ",
      "‡Ω¢‡æê ‡ΩÑ",
      "‡Ωñ ‡Ωë‡Ω¥‡Ωë",
      "‡ΩÇ‡Ω°‡Ω∫ ‡ΩÑ",
      "‡Ωñ‡Ω¶‡æô ‡Ω∫‡Ωì",
      "‡Ω¥ ‡Ωñ‡Ω¶",
      "‡Ωò‡Ωá ‡Ω£",
      "‡Ωì ‡Ω∫",
      "‡Ω£‡æü ‡Ω∫",
      "‡Ωë ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω†‡ΩÜ ‡ΩÑ",
      "‡Ωö ‡Ωñ",
      "‡Ωñ‡Ω¶‡æê‡æ± ‡Ω¢",
      "‡Ωï‡æ± ‡Ω∫",
      "‡Ω§ ‡Ω≤‡ΩÇ",
      "‡Ωò‡Ω≤ ‡Ω†‡Ω≤",
      "‡Ωâ ‡Ω∫‡Ωì",
      "‡Ωë‡Ω¥ ‡Ω£",
      "‡ΩÇ‡Ω¶ ‡Ωº‡Ωë",
      "‡Ω¢‡æô ‡Ω≤‡ΩÑ",
      "‡Ωë ‡Ωî‡Ω†‡Ω≤",
      "‡ΩÇ‡Ω¶ ‡Ω∫‡Ω¢",
      "‡Ωë ‡æ∑",
      "‡Ω¶‡æê‡æ± ‡Ωº‡ΩÑ",
      "‡Ω¶‡æ¶‡æ± ‡ΩÑ‡Ω¶",
      "‡Ωë ‡Ωº‡Ω¢",
      "‡ΩÄ‡æ≥ ‡Ω¥",
      "‡Ωñ‡æ≥ ‡Ωº‡Ω¶",
      "‡Ωê ‡Ωº‡Ωì",
      "‡Ωû‡Ω≤ ‡Ωñ",
      "‡ΩÇ ‡Ωº‡Ω¶",
      "‡Ωï‡æ≤ ‡Ωì",
      "‡Ω†‡Ωë‡æ≤ ‡Ω∫‡Ω¶",
      "‡Ωê ‡Ωò",
      "‡Ωë ‡Ω¢",
      "‡Ωñ‡Ω¶ ‡Ω£",
      "‡ΩÇ‡Ωâ ‡Ω∫‡Ω¢",
      "‡Ω¶‡æê‡æ± ‡Ωº",
      "‡Ω¶‡æô ‡Ωë",
      "‡Ω¶‡æí‡æ≤ ‡Ωº‡Ωì",
      "‡Ω¶‡æ§‡æ± ‡Ωë",
      "‡Ωâ ‡Ω∫‡Ω¢",
      "‡Ωñ‡Ω¶‡æü ‡Ωº‡Ωë",
      "‡Ωñ‡ΩÖ‡Ω¥ ‡Ωë",
      "‡Ωë‡Ωò ‡Ωì",
      "‡ΩÇ‡Ωü ‡Ωñ",
      "‡Ωñ‡Ω¶‡æí‡æ≤ ‡Ω¥‡Ωñ‡Ω¶",
      "‡Ωñ‡æ≤ ‡ΩÇ",
      "‡Ωñ‡Ωô ‡Ω£",
      "‡ΩÇ ‡Ωì‡Ωò",
      "‡Ω† ‡ΩÇ‡æ±",
      "‡Ωñ‡Ω¢ ‡æô",
      "‡Ωñ ‡æ±‡Ω≤",
      "‡Ωâ ‡Ω£",
      "‡Ωñ‡ΩÖ ‡Ω¥‡ΩÇ",
      "‡Ωò‡ΩÜ ‡Ω∫‡Ωë",
      "‡ΩÇ ‡Ω§‡Ω≤‡Ω¶",
      "‡Ω£‡Ω∫ ‡Ω†‡Ω¥",
      "‡Ωñ‡Ωü ‡Ω†",
      "‡Ωñ‡Ωû ‡Ω∫‡Ω¶",
      "‡Ω†‡ΩÇ ‡Ω†",
      "‡Ω¶‡æî ‡Ωì",
      "‡Ωï‡æ≤ ‡Ω∫‡ΩÑ",
      "‡Ωü ‡Ω¶",
      "‡Ω†‡Ωê ‡Ω¥‡ΩÑ",
      "‡Ωë‡Ω∫ ‡ΩÑ",
      "‡Ω£‡æ∑ ‡Ωì",
      "‡Ω¢ ‡Ω¥‡Ω¶",
      "‡Ωñ‡Ω¶‡æ≥ ‡Ω¥",
      "‡ΩÇ‡æ≤ ‡Ωº‡ΩÑ",
      "‡Ωë‡Ωñ‡Ω¥ ‡ΩÇ‡Ω¶",
      "‡Ωë‡ΩÇ ‡Ωº‡Ωì",
      "‡Ωñ‡Ω¢‡æ© ‡Ω≤",
      "‡Ω†‡Ωë‡æ≤ ‡Ω∫‡Ωì",
      "‡ΩÅ‡Ωº ‡Ω†‡Ω≤",
      "‡ΩÇ‡Ω° ‡Ωº",
      "‡Ωñ ‡Ωñ‡Ω¶",
      "‡Ω¢‡æ© ‡Ωº‡Ωë",
      "‡Ω£‡æü ‡Ω¥‡ΩÑ",
      "‡Ωë‡Ωñ‡æ± ‡ΩÑ‡Ω¶",
      "‡Ω¶‡æ¶ ‡æ≤",
      "‡Ω†‡Ωá ‡Ωò",
      "‡Ωò‡Ωº ‡Ω†‡Ω≤",
      "‡Ωñ‡Ω¥ ‡Ωë",
      "‡ΩÅ ‡Ω¥‡ΩÑ‡Ω¶",
      "‡ΩÄ‡æ≥ ‡Ωº‡ΩÑ",
      "‡Ω¢‡Ω∫ ‡Ω†‡Ω≤",
      "‡Ω¢‡æ© ‡Ω£",
      "‡Ωá ‡Ωº",
      "‡ΩÜ ‡Ω¢",
      "‡Ω¶‡æê‡æ≤ ‡ΩÇ",
      "‡Ωé ‡æú",
      "‡Ω¢‡æ® ‡Ωº‡ΩÑ‡Ω¶",
      "‡ΩÇ‡Ωë ‡Ωì",
      "‡Ωñ‡æ± ‡Ω†‡Ω≤",
      "‡Ω¶‡æê‡Ω¥ ‡Ω†‡Ω≤",
      "‡Ω¢‡æí‡æ±‡Ω¥ ‡Ω†‡Ω≤",
      "‡Ωñ ‡ΩÇ‡æ≤",
      "‡Ωö ‡Ω≤‡ΩÇ‡Ω¶",
      "‡Ωñ‡Ωü‡æ≥ ‡Ω¶",
      "‡Ω¢‡æ´ ‡Ω¥‡Ωì",
      "‡ΩÅ‡æ≤ ‡ΩÇ",
      "‡Ω¶‡æ≥ ‡Ωº‡ΩÇ",
      "‡ΩÇ‡Ω¶ ‡Ωº‡ΩÇ",
      "‡Ω£ ‡Ω∫‡Ω¢",
      "‡Ω¶‡æ§‡æ≤ ‡Ω≤‡Ωì",
      "‡Ωñ‡Ω¶‡æê ‡Ω£",
      "‡ΩÑ ‡Ω¢",
      "‡ΩÑ ‡Ω¥",
      "‡Ω£ ‡Ω†‡ΩÑ",
      "‡Ωö ‡Ωº‡Ωì",
      "‡Ωî ‡Ω†‡Ωº",
      "‡Ωï ‡Ωñ",
      "‡Ω¶‡æí‡æ≤ ‡Ωº",
      "‡Ωï‡æ± ‡Ω¥‡ΩÇ",
      "‡ΩÅ ‡Ω¥‡Ω¢",
      "‡Ωë‡æ≠ ‡ΩÑ‡Ω¶",
      "‡Ω†‡ΩÇ‡æ± ‡Ωº‡Ωë",
      "‡ΩÇ‡Ωë ‡Ωº‡ΩÑ",
      "‡Ωè ‡Ω≤",
      "‡Ω† ‡Ωñ‡æ±‡Ω∫‡Ωë",
      "‡ΩÇ‡Ω¶ ‡Ωº‡Ωì",
      "‡Ωë‡ΩÇ ‡Ωº‡ΩÑ",
      "‡Ω¶‡Ω∫ ‡ΩÑ",
      "‡Ω†‡Ωë ‡Ωº‡ΩÇ‡Ω¶",
      "‡Ω§ ‡Ω±",
      "‡Ω¶‡æ° ‡Ωº‡ΩÑ",
      "‡Ωñ‡Ω¢‡æ© ‡Ω≤‡Ω¶",
      "‡Ωñ‡Ωû‡Ω≤ ‡Ω†‡Ω≤",
      "‡Ω¢‡æ© ‡Ω≤‡Ω¶",
      "‡ΩÇ‡æ≥ ‡Ωº‡ΩÇ",
      "‡Ω∫ ‡ΩÇ",
      "‡Ω® ‡Ωò",
      "‡Ω† ‡ΩÇ‡Ωº",
      "‡ΩÇ ‡Ω¥‡ΩÑ",
      "‡ΩÇ ‡Ωë‡Ω¥‡Ω£",
      "‡Ωè ‡Ω≤‡Ω¶",
      "‡Ωë‡æ≤ ‡Ωº‡Ωë",
      "‡Ω¶‡æê ‡Ω¢",
      "‡Ωñ‡æ≤ ‡Ω≤‡Ω¶",
      "‡ΩÄ ‡Ω†‡Ω≤",
      "‡Ω® ‡Ω±",
      "‡Ωë‡æ≤ ‡ΩÑ‡Ω¶",
      "‡Ω†‡Ωñ‡Ω¥ ‡Ωò",
      "‡Ω£‡æü ‡Ωº",
      "‡ΩÇ‡Ωè ‡Ω≤‡ΩÑ",
      "‡Ω£‡æï ‡ΩÇ‡Ω¶",
      "‡Ωñ‡Ω¢‡æó ‡Ω∫‡Ωë",
      "‡Ω£ ‡æó",
      "‡ΩÇ‡Ωè ‡Ωë",
      "‡Ωñ‡æ± ‡Ω†‡Ωº",
      "‡Ω† ‡Ωê‡Ωº‡Ωñ",
      "‡Ωñ‡Ω¶‡æ° ‡Ω¥"
    ]
  

In [2]:
len(tib_merge)

1047

In [5]:
import json
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
tokenizer_json_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json")
output_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json")

# Example Tibetan merges (your raw format with spaces)
tib_merges_raw = tib_merge
# -----------------------------
# Convert "‡Ωë   ‡Ω∫" ‚Üí ["‡Ωë", "‡Ω∫"]
# -----------------------------
tib_merges = []
for m in tib_merges_raw:
    parts = str(m).split(" ")
    if len(parts) == 2:
        tib_merges.append(parts)
    else:
        print(f"‚ö†Ô∏è Skipping invalid merge: {m}")

# -----------------------------
# Load tokenizer.json
# -----------------------------
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
    tok_data = json.load(f)

# -----------------------------
# Extend merges
# -----------------------------
tok_data["model"]["merges"] = tib_merges


print(f"‚úÖ Added  Tibetan merges. New merge count: ")

# -----------------------------
# Save new tokenizer.json
# -----------------------------
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(tok_data, f, ensure_ascii=False, indent=2)

print(f"üíæ Extended tokenizer.json written to {output_path}")


‚úÖ Added  Tibetan merges. New merge count: 
üíæ Extended tokenizer.json written to /home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json


In [None]:
import json
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
tokenizer_json_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer.json")
output_path = Path("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json")

# Example Tibetan merges (your raw format with spaces)
tib_merges_raw = tib_merge
# -----------------------------
# Convert "‡Ωë   ‡Ω∫" ‚Üí ["‡Ωë", "‡Ω∫"]
# -----------------------------
tib_merges = []
for m in tib_merges_raw:
    parts = str(m).split(" ")
    if len(parts) == 2:
        tib_merges.append(parts)
    else:
        print(f"‚ö†Ô∏è Skipping invalid merge: {m}")

# -----------------------------
# Load tokenizer.json
# -----------------------------
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
    tok_data = json.load(f)

# -----------------------------
# Extend merges
# -----------------------------
merges = tok_data["model"]["merges"]

added = 0
for m in tib_merges:
    if m not in merges:
        merges.append(m)
        added += 1

print(f"‚úÖ Added {added} Tibetan merges. New merge count: {len(merges)}")

# -----------------------------
# Save new tokenizer.json
# -----------------------------
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(tok_data, f, ensure_ascii=False, indent=2)

print(f"üíæ Extended tokenizer.json written to {output_path}")


In [8]:
import os
import json
from tokenizers import Tokenizer

# Path to the saved tokenizer.json (from Script 1)
tokenizer_path = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/tokenizer_added.json"
output_dir = "/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan"
os.makedirs(output_dir, exist_ok=True)

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Extract vocab (token ‚Üí id mapping)
vocab = tokenizer.get_vocab()
# sort by id (important!)
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({token: idx for token, idx in sorted_vocab}, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved vocab.json to {output_dir}/vocab.json")

# Extract merges
model = tokenizer.to_str()  # stringified JSON of the whole tokenizer
model_json = json.loads(model)

if "model" in model_json and "merges" in model_json["model"]:
    merges = model_json["model"]["merges"]
    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as f:
        f.write("#version: 0.2\n")
        for merge in merges:
            f.write(" ".join(merge) + "\n")
    print(f"‚úÖ Saved merges.txt to {output_dir}/merges.txt")
else:
    print("‚ö†Ô∏è No merges found in tokenizer.json (check if it's really BPE)")


‚úÖ Saved vocab.json to /home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan/vocab.json
The OrderedVocab you are attempting to save contains holes for indices [50258, 50259, 50260, 50261, 50262, 50263, 50264, 50265, 50266, 50267, 50268, 50269, 50270, 50271, 50272, 50273, 50274, 50275, 50276, 50277, 50278, 50279, 50280, 50281, 50282, 50283, 50284, 50285, 50286, 50287, 50288, 50289, 50290, 50291, 50292, 50293, 50294, 50295, 50296, 50297, 50298, 50299, 50300, 50301, 50302, 50303, 50304, 50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321, 50322, 50323, 50324, 50325, 50326, 50327, 50328, 50329, 50330, 50331, 50332, 50333, 50334, 50335, 50336, 50337, 50338, 50339, 50340, 50341, 50342, 50343, 50344, 50345, 50346, 50347, 50348, 50349, 50350, 50351, 50352, 50353, 50354, 50355, 50356, 50357, 50358, 50359, 50360, 50361, 50362, 50363, 50364, 50365, 50366, 50367, 50368, 50369, 50370, 50371, 50372, 50373, 50

In [9]:
from transformers import PreTrainedTokenizerFast

OUTPUT_DIR = "data/whisper_tokenizer_added_tibetan"
tok = PreTrainedTokenizerFast.from_pretrained(OUTPUT_DIR)
print(f"‚úÖ Successfully loaded tokenizer with {len(tok)} tokens")

‚úÖ Successfully loaded tokenizer with 53014 tokens


In [10]:
from transformers import WhisperTokenizer

# Load the base tokenizer (keeps all Whisper special tokens)
old_tokenizer = WhisperTokenizer.from_pretrained("/home/gangagyatso/Desktop/stt-bpe-trainer/data/whisper_tokenizer_added_tibetan")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'WhisperTokenizer'.


In [11]:
old_tokenizer.save_pretrained("data/whisper_tokenizer_added_tibetan_class")

('data/whisper_tokenizer_added_tibetan_class/tokenizer_config.json',
 'data/whisper_tokenizer_added_tibetan_class/special_tokens_map.json',
 'data/whisper_tokenizer_added_tibetan_class/vocab.json',
 'data/whisper_tokenizer_added_tibetan_class/merges.txt',
 'data/whisper_tokenizer_added_tibetan_class/normalizer.json',
 'data/whisper_tokenizer_added_tibetan_class/added_tokens.json')

In [12]:
test = "‡Ωñ‡æ±‡ΩÑ‡ºã‡ΩÜ‡Ω¥‡Ωñ‡ºã ‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã ‡Ω°‡Ωº‡Ωë‡ºç"
ids = tok.encode(test)
print("IDs:", ids)
print("Tokens:", tok.convert_ids_to_tokens(ids))
print("token len:", len(ids))


IDs: [50258, 50363, 52134, 51866, 52161, 51866, 220, 52010, 51866, 220, 51979, 51868, 50257]
Tokens: [None, None, '‡Ωñ‡æ±‡ΩÑ', '‡ºã', '‡ΩÜ‡Ω¥‡Ωñ', '‡ºã', 'ƒ†', '‡Ω¶‡Ω∫‡Ωò‡Ω¶', '‡ºã', 'ƒ†', '‡Ω°‡Ωº‡Ωë', '‡ºç', '<|endoftext|>']
token len: 13
