In [None]:
def transcribe_tibetan_audio(audio_path, checkpoint_dir="/workspace/whisper-small-v2/checkpoint-4000"):
    from transformers import WhisperForConditionalGeneration, WhisperProcessor
    import torchaudio
    import torch
    
    # Load processor and model
    processor = WhisperProcessor.from_pretrained(checkpoint_dir, language="Tibetan", task="transcribe")
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load and process audio
    waveform, sr = torchaudio.load(audio_path)
    inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)
    
    # Generate transcription
    pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
    text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
    
    return text

# Usage
result = transcribe_tibetan_audio("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")
print("Transcription:", result)

Transcription: ཨེ་དེ་ནས་བླ་མའི་རྣལ་འབྱོར་གྱི་སྒོ་ནས་རང་གིས་གང་ཤེས་ཤེས་ཞུ་དགོས་ཀྱི་ཡོད་རེད་ཟེར།


: 

In [None]:
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizerFast,
    WhisperFeatureExtractor,
    WhisperProcessor
)

model = WhisperForConditionalGeneration.from_pretrained("...")  # your trained weights
tokenizer = WhisperTokenizerFast.from_pretrained("...")         # has your added tokens
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
processor = WhisperProcessor(feature_extractor, tokenizer)

# (important if you added tokens after init)
model.resize_token_embeddings(len(tokenizer))

# 1) Save locally (same dir)
save_dir = "whisper_custom"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)   # <-- this writes tokenizer + feature extractor
# (tokenizer.save_pretrained(save_dir) would also work, but processor is preferred)

# 2) Push the whole folder to the Hub (same repo)
model.push_to_hub("your-username/your-repo")
processor.push_to_hub("your-username/your-repo")


In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Paths
checkpoint_dir = "/workspace/whisper-small-v2/checkpoint-4000"

# Load model + processor
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)
processor = WhisperProcessor.from_pretrained(
    "/workspace/whisper-small-v2/checkpoint-4000", language="Tibetan", task="transcribe"
)

model.resize_token_embeddings(len(tokenizer))

model.generation_config.no_repeat_ngram_size = 2
model.generation_config.length_penalty = -1.0
model.generation_config.num_beams = 3   
# Your HF repo name (e.g., username/model_name)
repo_name = "ganga4364/whisper-small-latin-added-tibetan-checkpoint-4000"

# Push to hub
model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ganga4364/whisper-small-latin-added-tibetan-checkpoint-4000/commit/94d0be4ace10839a74357a73a394ace0dd092590', commit_message='Upload processor', commit_description='', oid='94d0be4ace10839a74357a73a394ace0dd092590', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ganga4364/whisper-small-latin-added-tibetan-checkpoint-4000', endpoint='https://huggingface.co', repo_type='model', repo_id='ganga4364/whisper-small-latin-added-tibetan-checkpoint-4000'), pr_revision=None, pr_num=None)

In [26]:
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

checkpoint = "/workspace/whisper-small-v2/checkpoint-4000"

# Load processor with Tibetan setup
processor = WhisperProcessor.from_pretrained(
    checkpoint,
    language="Tibetan",
    task="transcribe"
)

# Load model
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
# Set forced_decoder_ids from processor
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="Tibetan",
    task="transcribe"
)

# Build pipeline
generator = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0
)

result = generator(
    "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav",
    generate_kwargs={"num_beams": 4, "max_length": 225}
)
print(result["text"])


Device set to use cuda:0





In [None]:
from transformers import pipeline
from transformers import pipeline, WhisperProcessor

generator = pipeline(
    task="automatic-speech-recognition",
    model="/workspace/whisper-small-v2/checkpoint-4000",   
    device=0
  
)

# Example transcription
result = generator("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")
print(result["text"])


Device set to use cuda:0





In [18]:
import torch
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# -------------------------------
# 1. Load model + processor from HF
# -------------------------------
repo_name = "ganga4364/whisper-small-latin-added-tibetan-checkpoint-4000"  # your HF repo

#processor = WhisperProcessor.from_pretrained(repo_name, language="Tibetan", task="transcribe")
processor = WhisperProcessor.from_pretrained(repo_name)
model = WhisperForConditionalGeneration.from_pretrained(repo_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -------------------------------
# 2. Load audio file
# -------------------------------
audio_path = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"
waveform, sr = torchaudio.load(audio_path)

# Resample if needed
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    sr = 16000

# -------------------------------
# 3. Preprocess
# -------------------------------
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(device)

# -------------------------------
# 4. Run inference
# -------------------------------
with torch.no_grad():
    pred_ids = model.generate(
        inputs["input_features"],
        num_beams=4,
        max_length=225
    )

# Decode prediction
text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
print("Transcription:", text)


Transcription: ཨེ་དེ་ནས་བླ་མའི་རྣལ་འབྱོར་གྱི་སྒོ་ནས་རང་གིས་གང་ཤེས་ཤེས་ཞུ་དགོས་ཀྱི་ཡོད་རེད་ཟེར།


In [9]:
from transformers import WhisperForConditionalGeneration, PreTrainedTokenizerFast, WhisperFeatureExtractor, WhisperProcessor
import torchaudio, torch

# Reload processor + model from checkpoint
checkpoint_dir = "/workspace/stt-whisper/whisper-small-tibetan-wylie/checkpoint-4000"
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Tibetan", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load audio
waveform, sr = torchaudio.load("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")

# Preprocess
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)

# Generate transcription
pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
print("Transcription:", text)


Transcription:  e de nas bla ma'i rnam rgyor gyi sgo nas rang gi gang shes shes zhu dgos kyi yod red zer


In [24]:
from transformers import WhisperForConditionalGeneration, PreTrainedTokenizerFast, WhisperFeatureExtractor, WhisperProcessor
import torchaudio, torch

# Reload processor + model from checkpoint
checkpoint_dir = "/workspace/whisper-small-v2/checkpoint-4000"
processor = WhisperProcessor.from_pretrained("/workspace/whisper-small-v2/checkpoint-4000", language="Tibetan", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("/workspace/whisper-small-v2/checkpoint-4000")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load audio
waveform, sr = torchaudio.load("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")

# Preprocess
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)

# Generate transcription
pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
print("Transcription:", text)


Transcription: ཨེ་དེ་ནས་བླ་མའི་རྣལ་འབྱོར་གྱི་སྒོ་ནས་རང་གིས་གང་ཤེས་ཤེས་ཞུ་དགོས་ཀྱི་ཡོད་རེད་ཟེར།


In [None]:
from transformers import WhisperForConditionalGeneration, PreTrainedTokenizerFast, WhisperFeatureExtractor, WhisperProcessor
import torchaudio, torch

# Reload processor + model from checkpoint
checkpoint_dir = "/workspace/whisper-small-v2/checkpoint-3000"
processor = WhisperProcessor.from_pretrained("/workspace/whisper-small-v2/checkpoint-3000", language="Tibetan", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("/workspace/whisper-small-v2/checkpoint-3000")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load audio
waveform, sr = torchaudio.load("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")

# Preprocess
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)

# Generate transcription
pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
print("Transcription:", text)


In [None]:
from transformers import WhisperForConditionalGeneration, PreTrainedTokenizerFast, WhisperFeatureExtractor, WhisperProcessor
import torchaudio, torch

# Reload processor + model from checkpoint
checkpoint_dir = "/workspace/whisper-small-v2/checkpoint-3000"
processor = WhisperProcessor.from_pretrained("/workspace/whisper-small-v2/checkpoint-3000", language="Tibetan", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("/workspace/whisper-small-v2/checkpoint-3000")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load audio
waveform, sr = torchaudio.load("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")

# Preprocess
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)

# Generate transcription
pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
print("Transcription:", text)


In [24]:
! pip install librosa

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 KB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting audioread>=2.1.9
  Downloading audioread-3.0.1-py3-none-any.whl (23 kB)
Collecting numba>=0.51.0
  Downloading numba-0.62.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scipy>=1.6.0
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting soxr>=0.3.2
  Downloading soxr-1.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [25]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
import librosa

# --- 1. Load your fine-tuned model and processor from the Hub ---
# Replace "your-username/your-fine-tuned-whisper-model" with your actual repo name.
model_id = "ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the model and send it to the desired device
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)

# Load the processor (this includes your custom tokenizer)
# This is the key step: it fetches the correct tokenizer you trained with.
processor = AutoProcessor.from_pretrained(model_id)


# --- 2. Prepare your audio file ---
# Load an audio file. Librosa will automatically resample to the target rate.
# Make sure your audio file is in a format librosa can handle (e.g., .wav, .mp3, .flac).
audio_path = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"
# Whisper was trained on 16kHz audio. It's crucial to resample your audio to this rate.
speech_array, sampling_rate = librosa.load(audio_path, sr=16000)


# --- 3. Run Inference ---
# Process the raw audio to create input features
input_features = processor(
    speech_array,
    sampling_rate=16000,
    return_tensors="pt"
).input_features.to(device)

# Generate token IDs
# You can specify the language and task if your model is multilingual
# For example: forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features)


# --- 4. Decode the token IDs to text ---
# Use the processor's batch_decode method to convert token IDs back to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print("Transcription:", transcription[0])

ValueError: Unrecognized configuration class <class 'transformers.models.wav2vec2.configuration_wav2vec2.Wav2Vec2Config'> for this kind of AutoModel: AutoModelForSpeechSeq2Seq.
Model type should be one of DiaConfig, GraniteSpeechConfig, KyutaiSpeechToTextConfig, MoonshineConfig, Pop2PianoConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SpeechEncoderDecoderConfig, Speech2TextConfig, SpeechT5Config, WhisperConfig.

In [31]:
import torch
from transformers import AutoModelForCTC, AutoProcessor
import librosa

# --- 1. Load your fine-tuned model and processor from the Hub ---
# The model ID is correct, but we need to use the right AutoClass.
model_id = "ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# CORRECT: Use AutoModelForCTC for Wav2Vec2-based models.
model = AutoModelForCTC.from_pretrained(model_id).to(device)

# AutoProcessor is smart and will load the correct Wav2Vec2Processor.
processor = AutoProcessor.from_pretrained(model_id)


# --- 2. Prepare your audio file (This part was already correct) ---
audio_path = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"
speech_array, sampling_rate = librosa.load(audio_path, sr=16000)


# --- 3. Run Inference (This part needs to be changed for CTC) ---
# Process the audio file.
inputs = processor(
    speech_array, 
    sampling_rate=16000, 
    return_tensors="pt"
).to(device)

# Get the logits from the model's forward pass.
# CORRECT: CTC models don't use .generate(). You get logits directly.
with torch.no_grad():
    logits = model(**inputs).logits


# --- 4. Decode the token IDs to text (This part also changes for CTC) ---
# CORRECT: Take the argmax of the logits to get the most likely token IDs.
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the IDs to text using the processor.
# The Wav2Vec2 processor's batch_decode handles the CTC logic (removing blanks and repeats).
transcription = processor.batch_decode(predicted_ids)

print("Transcription:", transcription)

Transcription: ['3']


In [None]:
import torch
import torchaudio
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizerFast, WhisperFeatureExtractor

# -------------------------------
# 1. Load model + processor from HF
# -------------------------------
repo_name = "ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000"  # your HF repo

# 1. Load your custom tokenizer (as WhisperTokenizerFast)
from transformers import (
    WhisperTokenizerFast,
    WhisperFeatureExtractor,
    WhisperProcessor,
    WhisperForConditionalGeneration
)

# 1. Load your custom tokenizer (as WhisperTokenizerFast)
tokenizer = WhisperTokenizerFast.from_pretrained(
    "/workspace/data/whisper_tokenizer_latin_added_tibetan",
    language="bo",   # Tibetan
    task="transcribe"
)

# 2. Load Whisper feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# 3. Combine into a processor
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# 4. Load model weights from Hugging Face Hub
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# 5. Resize embeddings if vocab changed
model.resize_token_embeddings(len(tokenizer))

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -------------------------------
# 2. Load audio file
# -------------------------------
audio_path = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"
waveform, sr = torchaudio.load(audio_path)

# Resample if needed
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    sr = 16000

# -------------------------------
# 3. Preprocess
# -------------------------------
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(device)

# -------------------------------
# 4. Run inference
# -------------------------------
with torch.no_grad():
    pred_ids = model.generate(
        inputs["input_features"],
        num_beams=4,
        max_length=225
    )

# Decode prediction
text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
print("Transcription:", text)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'WhisperTokenizerFast'.


You are using a model of type wav2vec2 to instantiate a model of type whisper. This is not supported for all configurations of models and can yield errors.
Some weights of WhisperForConditionalGeneration were not initialized from the model checkpoint at ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000 and are newly initialized: ['model.decoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.layer_norm.bias', 'model.decoder.layer_norm.weight', 'model.decoder.layers.0.encoder_attn.k_proj.weight', 'model.decoder.layers.0.encoder_attn.out_proj.bias', 'model.decoder.layers.0.encoder_attn.out_proj.weight', 'model.decoder.layers.0.encoder_attn.q_proj.bias', 'model.decoder.layers.0.encoder_attn.q_proj.weight', 'model.decoder.layers.0.encoder_attn.v_proj.bias', 'model.decoder.layers.0.encoder_attn.v_proj.weight', 'model.decoder.layers.0.encoder_attn_layer_norm.bias', 'model.decoder.layers.0.encoder_attn_layer_norm.weight', 'model.decoder

Transcription: иваемиваемиваемиваемиваем trou trou trou trou trou trou disguise disguise disguise disguise disguise disguise disguise disguise et et et et et et et et et et et et et et etaltresaltresaltresaltresaltresaltres disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguisealtresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltres disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguisealtresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltresaltres disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise disguise di

In [5]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# -------------------------------
# 1. Load model + processor
# -------------------------------
# Replace with the model you want, e.g. facebook/wav2vec2-large-960h
MODEL_ID = "/workspace/stt-whisper/whisper-small-Latin-Added-Tibetan/checkpoint-5000"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# -------------------------------
# 2. Load audio file
# -------------------------------
filename = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"   # path to your test wav file
speech, sr = torchaudio.load(filename)

# Convert to mono if stereo
if speech.shape[0] > 1:
    speech = torch.mean(speech, dim=0, keepdim=True)

# Resample to 16k if needed
if sr != 16000:
    resampler = torchaudio.transforms.Resample(sr, 16000)
    speech = resampler(speech)
    sr = 16000

# -------------------------------
# 3. Preprocess
# -------------------------------
inputs = processor(speech.squeeze().numpy(), sampling_rate=sr, return_tensors="pt", padding=True)

# -------------------------------
# 4. Run inference
# -------------------------------
with torch.no_grad():
    logits = model(inputs.input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)

# -------------------------------
# 5. Decode to text
# -------------------------------
transcription = processor.batch_decode(predicted_ids)
print("Transcription:", transcription[0])


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [33]:
import torch
import librosa
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizerFast,
    WhisperFeatureExtractor,
    WhisperProcessor
)

# --- 1. Define All Component Paths and Device ---
# Path to the directory where your trained model weights are saved.
MODEL_PATH = "ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000"

# Path to your custom tokenizer files.
TOKENIZER_PATH = "/workspace/data/whisper_tokenizer_latin_added_tibetan" 

# Path to the original model for the feature extractor.
FEATURE_EXTRACTOR_PATH = "openai/whisper-small" 

# Path to the audio file you want to transcribe.
AUDIO_FILE_PATH = "/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"  # 👈 Replace with your audio file

# Set the device to GPU if available, otherwise CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# --- 2. Load Model and Processor Components Separately ---
print(f"Loading model from: {MODEL_PATH}")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)

print(f"Loading custom tokenizer from: {TOKENIZER_PATH}")
tokenizer = WhisperTokenizerFast.from_pretrained(TOKENIZER_PATH)

print(f"Loading feature extractor from: {FEATURE_EXTRACTOR_PATH}")
feature_extractor = WhisperFeatureExtractor.from_pretrained(FEATURE_EXTRACTOR_PATH)

# Combine the manually loaded components into a processor
print("Combining components into a WhisperProcessor...")
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# --- 3. Prepare the Audio Input (This part is the same) ---
print(f"Loading and resampling audio from: {AUDIO_FILE_PATH}")
try:
    # librosa.load will resample to 16kHz for you if you specify sr=16000
    speech_array, sampling_rate = librosa.load(AUDIO_FILE_PATH, sr=16000)
except Exception as e:
    print(f"Error loading audio file: {e}")
    exit()

# --- 4. Run Inference (This part is the same) ---
print("Running inference...")
# Process the audio array to create input features
inputs = processor(
    speech_array,
    sampling_rate=16000,
    return_tensors="pt"
).to(device)

# Set the generation configuration for Tibetan transcription
forced_decoder_ids = processor.get_decoder_prompt_ids(language="bo", task="transcribe")

# Generate the token IDs
predicted_ids = model.generate(
    inputs["input_features"],
    forced_decoder_ids=forced_decoder_ids
)

# --- 5. Decode the Output (This part is the same) ---
# Decode the generated token IDs back to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print("\n--- Transcription ---")
print(transcription[0])

Loading model from: ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000


You are using a model of type wav2vec2 to instantiate a model of type whisper. This is not supported for all configurations of models and can yield errors.


Some weights of WhisperForConditionalGeneration were not initialized from the model checkpoint at ganga4364/Garchen_Rinpoche-whisper_latin_tibetan_added_on_uni_Checkpoint-4000 and are newly initialized: ['model.decoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.layer_norm.bias', 'model.decoder.layer_norm.weight', 'model.decoder.layers.0.encoder_attn.k_proj.weight', 'model.decoder.layers.0.encoder_attn.out_proj.bias', 'model.decoder.layers.0.encoder_attn.out_proj.weight', 'model.decoder.layers.0.encoder_attn.q_proj.bias', 'model.decoder.layers.0.encoder_attn.q_proj.weight', 'model.decoder.layers.0.encoder_attn.v_proj.bias', 'model.decoder.layers.0.encoder_attn.v_proj.weight', 'model.decoder.layers.0.encoder_attn_layer_norm.bias', 'model.decoder.layers.0.encoder_attn_layer_norm.weight', 'model.decoder.layers.0.fc1.bias', 'model.decoder.layers.0.fc1.weight', 'model.decoder.layers.0.fc2.bias', 'model.decoder.layers.0.fc2.weight', 'model.decoder.layers.0.fi

Loading custom tokenizer from: /workspace/data/whisper_tokenizer_latin_added_tibetan
Loading feature extractor from: openai/whisper-small
Combining components into a WhisperProcessor...
Loading and resampling audio from: /workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav
Running inference...


ValueError: The following `model_kwargs` are not used by the model: ['forced_decoder_ids'] (note: typos in the generate arguments will also show up in this list)

In [2]:
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC

#load the model with best checkpoint
model = Wav2Vec2ForCTC.from_pretrained("/workspace/stt-whisper/whisper-small-tibetan-wylie/checkpoint-4000")
#load the base model processor since it is same for base model and finetuned model
#processor = Wav2Vec2Processor.from_pretrained("/workspace/stt-whisper/whisper-small-tibetan-wylie/checkpoint-4000")


You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /workspace/stt-whisper/whisper-small-tibetan-wylie/checkpoint-4000 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.layer_norm.bias', 'wav2vec2.encoder.layer_norm.weight', 'wav2vec2.encoder.layers.0.attention.k_proj.bias', 'wav2vec2.encoder.layers.0.attention.k_proj.weight', 'wav2vec2.encoder.layers.0.attention.out_proj.bias', 'wav2vec2.encoder.layers.0.attention.out_proj.weight', 'wav2vec2.encoder.layers.0.attention.q_proj.bias', 'wav2vec2.encoder.layers.0.attention.q_proj.weight', 'wav2vec2.encoder.layers.0.attention.v_proj.bias', 'wav2vec2.encoder.layers.0.attention.v_proj.weight', 'wav2vec2.encoder.layers.0.feed_forward.intermediate_dense.bias', 'wav2vec2.encoder.layers.0.feed_forward.intermediate_dense.weight', 'wav2v

In [4]:
processor = Wav2Vec2Processor.from_pretrained("openai/whisper-small", language="Tibetan", task="transcribe")


In [None]:
# HF token removed for security


In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
model_name = "ganga4364/Garchen_rinpoche_whisper_generic_on_wylie_checkpoint-4000"
model.push_to_hub(    model_name)
processor.push_to_hub(model_name)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ganga4364/Garchen_rinpoche_whisper_generic_on_wylie_checkpoint-4000/commit/1ea13d0e3c10243639b7206d2cda53b8b83d3eed', commit_message='Upload processor', commit_description='', oid='1ea13d0e3c10243639b7206d2cda53b8b83d3eed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ganga4364/Garchen_rinpoche_whisper_generic_on_wylie_checkpoint-4000', endpoint='https://huggingface.co', repo_type='model', repo_id='ganga4364/Garchen_rinpoche_whisper_generic_on_wylie_checkpoint-4000'), pr_revision=None, pr_num=None)

In [1]:
# finetune_whisper.py

import os
import torch
import logging
from datasets import load_from_disk
from transformers import (
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    WhisperProcessor,
)
import wandb
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# -------------------------------
# 0. Configure Logging
# -------------------------------
# Creates a log directory and sets up logging to both a file and the console.
log_dir = "./logs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(log_dir, "train.log"),
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)

# Also print logs to the console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console.setFormatter(formatter)
logging.getLogger("").addHandler(console)

logging.info("🚀 Starting Whisper fine-tuning script")

# -------------------------------
# 1. Init W&B
# -------------------------------
# Logs into Weights & Biases and initializes a new project run.
# Make sure to replace the key or log in via the command line beforehand.
wandb.login(key="cd3943d98b7ac4e0a0abd24721f4016a7942166f")
wandb.init(
    project="stt-for-tibetan-language",
    entity="stt-for-tibet"
)

# -------------------------------
# 2. Load preprocessed dataset
# -------------------------------
# Loads the datasets you created in the previous preprocessing step.
data_dir = "/workspace/data/processed_tibetan"
train_dataset = load_from_disk(os.path.join(data_dir, "train"))
val_dataset = load_from_disk(os.path.join(data_dir, "validation"))
logging.info(f"Loaded datasets: train={len(train_dataset)}, val={len(val_dataset)}")

# -------------------------------
# 3. Load model + processor
# -------------------------------
# Loads your custom processor and the pre-trained Whisper model.

processor = WhisperProcessor.from_pretrained("/workspace/data/whisper_tibet_tokenizer")

# Explicitly set the max target positions to prevent CUDA errors
#config = WhisperConfig.from_pretrained("openai/whisper-small", max_target_positions=1024)
# ✅ Load normally, Whisper already has 448 positions built-in
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# **IMPORTANT**: Manually re-initialize the output projection layer to match the new
# tokenizer's vocabulary size. This is a more robust way to avoid vocab size mismatches.

import torch.nn as nn

new_vocab_size = len(processor.tokenizer)
new_proj_out = nn.Linear(model.config.d_model, new_vocab_size)
model.proj_out = new_proj_out

# Also update the model's config to reflect the new vocab size
model.config.vocab_size = new_vocab_size
#model.resize_token_embeddings(len(processor.tokenizer))


logging.info(f"Tokenizer vocab size: {len(processor.tokenizer)}")
logging.info(f"Model output projection layer size: {model.proj_out.out_features}")

# Configure model for training
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False  # Disable cache for gradient checkpointing

# -------------------------------
# 4. Data collator
# -------------------------------
# This class handles padding for both audio features and text labels in each batch.
# It ensures all sequences in a batch have the same length for efficient processing.
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Pad audio inputs
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad text labels
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        # Replace padding token id's with -100 so they are ignored in the loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# -------------------------------
# 5. Metrics
# -------------------------------
# Defines the function to compute Word Error Rate (WER) and Character Error Rate (CER).
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with the pad token id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER and CER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    logging.info(f"Eval Metrics -> WER: {wer:.4f}, CER: {cer:.4f}")
    return {"wer": wer, "cer": cer}

# -------------------------------
# 6. Training arguments
# -------------------------------
# Configures all hyperparameters and settings for the training process.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tibetan-ft",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    save_strategy="steps",
    predict_with_generate=True,
    logging_dir="./logs/tensorboard", # This requires `tensorboard` to be installed
    logging_strategy="steps",
    logging_steps=50,
    save_steps=500,
    eval_steps=10,
    max_steps=8000,
    learning_rate=1e-5,
    warmup_steps=500,
    gradient_checkpointing=False, # Disable to resolve backward pass error
    fp16=torch.cuda.is_available(),
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

# -------------------------------
# 7. Trainer
# -------------------------------
# Initializes the Seq2SeqTrainer with the model, datasets, collator, and metrics.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# -------------------------------
# 8. Train
# -------------------------------
# Starts the fine-tuning process.
logging.info("🔥 Starting training loop")
trainer.train()
logging.info("✅ Training complete")

# -------------------------------
# 9. Save model
# -------------------------------
# Saves the final best model and processor to a specified directory.
output_model_dir = "./whisper-tibetan-ft/checkpoint-final"
trainer.save_model(output_model_dir)
processor.save_pretrained(output_model_dir)
logging.info(f"✅ Fine-tuning complete. Model + processor saved to {output_model_dir}")

2025-09-23 14:01:23,679 - INFO - 🚀 Starting Whisper fine-tuning script
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mganga2000[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-09-23 14:01:24,654 - INFO - Loaded datasets: train=3071, val=543
2025-09-23 14:01:25,096 - INFO - Tokenizer vocab size: 1259
2025-09-23 14:01:25,098 - INFO - Model output projection layer size: 1259
  trainer = Seq2SeqTrainer(
2025-09-23 14:01:26,019 - INFO - 🔥 Starting training loop
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:94: operator(): block: [0,0,0], thread: [96,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:94: operator(): block: [0,0,0], thread: [97,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:94: operator(): block: [0,0,0], thread: [98,0,0] Assertion `-sizes[i] <= ind

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [7]:
from transformers import WhisperForConditionalGeneration, PreTrainedTokenizerFast, WhisperFeatureExtractor, WhisperProcessor
import torchaudio, torch

# Reload processor + model from checkpoint
checkpoint_dir = "/workspace/stt-whisper/whisper-small-tibetan-wylie/checkpoint-4000"
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Tibetan", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load audio
waveform, sr = torchaudio.load("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav")

# Preprocess
inputs = processor(waveform.squeeze(), sampling_rate=sr, return_tensors="pt").to(model.device)

# Generate transcription
pred_ids = model.generate(inputs["input_features"], num_beams=4, max_length=225)
text = processor.tokenizer.decode(pred_ids[0], skip_special_tokens=True)
print("Transcription:", text)


`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logit

Transcription:  e de nas bla ma'i rnam rgyor gyi sgo nas rang gi gang shes shes zhu dgos kyi yod red zer
