In [None]:
from transformers import pipeline

# Hub repo id of the Whisper checkpoint to load.
repo = "ganga4364/whisper-small-tibetan-wylie-checkpoint-4000"

# Build the speech-recognition pipeline. Device placement is delegated to
# `device_map="auto"`; language and task are forwarded to every generate call.
asr = pipeline(
    task="automatic-speech-recognition",
    model=repo,
    generate_kwargs=dict(language="bo", task="transcribe"),
    device_map="auto",
)

# Transcribe a single local clip and show the raw pipeline output.
print(
    asr("/home/gangagyatso/Desktop/work/stt-model-document/STT_GR_0001_0010_93600_to_106200.wav")
)


In [None]:
import torch
from transformers import pipeline, WhisperTokenizerFast, WhisperFeatureExtractor

# Model weights are pulled from this repo; the tokenizer is loaded separately below.
repo_model = "ganga4364/Garchen_rinpoche_whisper_generic_on_wylie_checkpoint-4000"
# NOTE(review): this looks like a placeholder path — point it at the real tokenizer before running.
repo_or_path_tokenizer = "/path/or/repo/with/your/custom_tokenizer"

tokenizer = WhisperTokenizerFast.from_pretrained(
    repo_or_path_tokenizer, language="Tibetan", task="transcribe"
)
# The feature extractor is taken from the base openai/whisper-small checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# Tokenizer and feature extractor are passed explicitly instead of being
# resolved from the model repo.
asr = pipeline(
    task="automatic-speech-recognition",
    model=repo_model,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    generate_kwargs=dict(language="bo", task="transcribe"),
    device=-1 if not torch.cuda.is_available() else 0,  # GPU 0 when CUDA is available, else CPU
)
print(asr("/workspace/data/wav_16k/STT_GR_0001_0002_17400_to_21800.wav"))


In [None]:
from transformers import pipeline

MODEL_ID = "ganga4364/whisper-small-tibetan-wylie-checkpoint-4000"  # HF repo with model + custom tokenizer

asr = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    device_map="auto",          # "cuda:0" or "cpu" if you prefer
    # Timestamps are a pipeline-level option, not a generate() kwarg: passed
    # here, the pipeline's post-processing also knows to return the timestamped
    # "chunks". Inside `generate_kwargs` that step never sees the request.
    return_timestamps=True,
    # Everything below is forwarded to model.generate().
    generate_kwargs={
        "language": "bo",       # or "Tibetan"
        "task": "transcribe",
        "num_beams": 4,
        "max_new_tokens": 225,
        # "condition_on_previous_text": False,  # often better for long chunks
    },
)

# Placeholder path: replace with a real audio file before running.
out = asr("path/to/audio_16k.wav")
print(out)


In [None]:
import time

# Wall-clock time is used only for the human-readable start/end stamps. The
# elapsed duration comes from time.perf_counter(), a monotonic clock that is
# not affected by system clock adjustments, and it brackets only the inference
# call so the print statements are not included in the measurement.
start = time.time()
print(f"[TIMER] Inference start: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start))}")

timer_start = time.perf_counter()
out = asr("path/to/audio_16k.wav")  # your pipeline or model.generate() call
elapsed = time.perf_counter() - timer_start

end = time.time()
print(f"[TIMER] Inference end  : {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end))}")
print(f"[TIMER] Elapsed (s)    : {elapsed:.3f}")


In [None]:
import torch
from transformers import (
    pipeline,
    WhisperForConditionalGeneration,
    WhisperFeatureExtractor,
    WhisperTokenizerFast,   # or PreTrainedTokenizerFast if that's what you saved
)

# NOTE(review): both values look like placeholders — replace before running.
MODEL_ID = "your-username/your-whisper-model"     # model repo (weights/config)
TOKENIZER_PATH = "/path/to/your/custom_tokenizer" # local dir or another HF repo

# 1) Load model
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)

# 2) Load your custom tokenizer (with added tokens already saved)
tokenizer = WhisperTokenizerFast.from_pretrained(
    TOKENIZER_PATH,
    language="Tibetan",   # important so special lang tokens resolve correctly
    task="transcribe"
)

# 3) Load a feature extractor (usually from the base Whisper)
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# 4) Build pipeline by passing tokenizer + feature_extractor explicitly
asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
    # Timestamps are a pipeline-level option, not a generate() kwarg: passed
    # here, the pipeline's post-processing also knows to return the timestamped
    # "chunks". Inside `generate_kwargs` that step never sees the request.
    return_timestamps=True,
    # Everything below is forwarded to model.generate().
    generate_kwargs={
        "language": "bo",    # or "Tibetan"
        "task": "transcribe",
        "num_beams": 4,
        "max_new_tokens": 225,
    },
)

# Placeholder path: replace with a real audio file before running.
out = asr("path/to/audio_16k.wav")
print(out)


In [None]:
import time

# Wall-clock time is used only for the human-readable start/end stamps. The
# elapsed duration comes from time.perf_counter(), a monotonic clock that is
# not affected by system clock adjustments, and it brackets only the inference
# call so the print statements are not included in the measurement.
start = time.time()
print(f"[TIMER] Inference start: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start))}")

timer_start = time.perf_counter()
out = asr("path/to/audio_16k.wav")  # your pipeline or model.generate() call
elapsed = time.perf_counter() - timer_start

end = time.time()
print(f"[TIMER] Inference end  : {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end))}")
print(f"[TIMER] Elapsed (s)    : {elapsed:.3f}")


In [None]:
def transcribe_batch(audio_paths, model, processor):
    """Transcribe a batch of audio files with a single ``model.generate`` call.

    Each file is loaded with ``torchaudio.load``, downmixed to mono, and
    resampled to 16 kHz when its native rate differs. The batch is then run
    through ``processor`` and ``model.generate`` and decoded to text.

    Args:
        audio_paths: Iterable of audio file paths readable by ``torchaudio.load``.
        model: Object exposing ``device`` and ``generate`` (presumably a
            Whisper conditional-generation model — confirm against the caller).
        processor: Callable whose result supports ``.to(device)`` and has an
            ``"input_features"`` entry; must also provide ``batch_decode``.

    Returns:
        list: One decoded transcript per input path, in input order. An empty
        batch returns ``[]`` without touching the model.
    """
    target_sr = 16000

    audio_paths = list(audio_paths)
    if not audio_paths:
        # Nothing to load; avoid handing an empty batch to the processor.
        return []

    resamplers = {}  # one Resample transform per distinct source rate
    waveforms = []
    for path in audio_paths:
        waveform, sr = torchaudio.load(path)
        # torchaudio.load yields (channels, time) by default. Averaging over
        # the channel axis turns stereo into one 1-D signal; for mono input it
        # is equivalent to dropping the channel dimension.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)
        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(sr, target_sr)
            waveform = resamplers[sr](waveform)
        waveforms.append(waveform.numpy())

    # Pad to the feature extractor's fixed window ("max_length"). Whisper's
    # encoder expects fixed-length mel input, so padding only to the longest
    # clip in the batch is not accepted for short clips.
    inputs = processor(
        waveforms,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding="max_length",
    ).to(model.device)

    # Beam search with repetition controls; the whole batch is decoded at once.
    pred_ids = model.generate(
        inputs["input_features"],
        num_beams=4,
        max_length=225,
        length_penalty=1.0,
        repetition_penalty=1.5,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )

    return processor.batch_decode(pred_ids, skip_special_tokens=True)


def process_inference(batch):
    """Add Tibetan and Wylie transcripts for every audio path in ``batch``.

    Reads ``batch["path"]``, transcribes it once with the module-level
    Tibetan model/processor pair and once with the Wylie pair, stores the
    results under ``"tibetan_transcript"`` and ``"wylie_transcript"``, and
    returns the same ``batch`` object.
    """
    audio_paths = batch["path"]

    tibetan_texts = transcribe_batch(audio_paths, model_tibetan, processor_tibetan)
    batch["tibetan_transcript"] = tibetan_texts

    wylie_texts = transcribe_batch(audio_paths, model_wylie, processor_wylie)
    batch["wylie_transcript"] = wylie_texts

    return batch

# -------------------------------
# Parallel init
# -------------------------------
