In [None]:
from pathlib import Path
from typing import Dict, Any

import librosa
import numpy as np
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

In [2]:
# Load audio with librosa and resample it to 16,000 Hz, the standard sample rate for CrisperWhisper

def load_audio(path: str, target_sr: int = 16000):
    """Read an audio file as mono samples at ``target_sr`` Hz.

    The file is decoded at its native sample rate and is resampled only when
    that rate differs from ``target_sr``.

    Args:
        path: Location of the audio file, handed straight to ``librosa.load``.
        target_sr: Sample rate, in Hz, that the returned samples should have.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    samples, native_sr = librosa.load(path, sr=None, mono=True)
    if native_sr == target_sr:
        # Already at the requested rate: return the decoded samples untouched.
        return samples, native_sr
    resampled = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)
    return resampled, target_sr

In [3]:
# Create audio chunks of a specified length in seconds with some overlap

def chunk_audio(audio: "np.ndarray", sr: int, chunk_s: int, overlap_s: float = 0.5):
    """Split ``audio`` into fixed-length windows that overlap their neighbours.

    Consecutive windows start ``chunk_s - overlap_s`` seconds apart, so each
    window shares ``overlap_s`` seconds with the one before it. The last
    window is cut at the end of the audio and may be shorter than ``chunk_s``.

    Args:
        audio: 1-D sequence of samples (anything supporting ``len`` and slicing).
        sr: Sample rate of ``audio`` in Hz.
        chunk_s: Window length in seconds.
        overlap_s: Seconds shared between consecutive windows.

    Returns:
        A list of ``(samples, start_time_seconds)`` tuples; empty when
        ``audio`` is empty.

    Raises:
        ValueError: If the window length or the hop between windows is less
            than one sample (for example ``overlap_s >= chunk_s``). Such
            settings previously made the loop run forever.
    """
    chunk_len = int(chunk_s * sr)
    step = int((chunk_s - overlap_s) * sr)
    if chunk_len <= 0:
        raise ValueError(
            f"chunk_s={chunk_s!r} at sr={sr!r} gives a window of {chunk_len} samples; "
            "it must be at least 1"
        )
    if step <= 0:
        # A non-positive hop would never advance through the audio.
        raise ValueError(
            f"overlap_s={overlap_s!r} must be smaller than chunk_s={chunk_s!r} "
            "by at least one sample"
        )

    total = len(audio)
    chunks = []
    for start in range(0, total, step):
        end = min(start + chunk_len, total)
        chunks.append((audio[start:end], start / sr))  # (samples, start_time_seconds)
        if end == total:
            # This window already reaches the end; a further window would
            # only repeat its tail.
            break
    return chunks

In [4]:
# Format pipeline outputs into readable verbatim transcript with word level timestamps

# TODO: step through this in the debugger once the pipeline is implemented and remove any branches that turn out to be unnecessary
def _word_and_start(entry):
    """Pull ``(text, start_time)`` out of one word entry; ``start_time`` may be None."""
    if isinstance(entry, dict):
        text = entry.get("word") or entry.get("text")
        span = entry.get("timestamp") or entry.get("times") or entry.get("start_end")
        if isinstance(span, (list, tuple)) and len(span) >= 2:
            return text, span[0]
        return text, None
    if isinstance(entry, (list, tuple)) and len(entry) >= 3:  # [word, start, end]
        return entry[0], entry[1]
    return str(entry), None  # fallback: treat the entry itself as the word


def format_verbatim_output(hf_out: Dict[str, Any]):
    """Render word-level ASR output as one ``[start] word`` line per word.

    Two layouts of ``hf_out["chunks"]`` are accepted:

    * flat: every chunk is itself a word, e.g.
      ``{"text": " hi", "timestamp": (0.5, 0.9)}``;
    * nested: every chunk carries a list of words under ``"words"`` (or
      ``"timestamps"``), each word being a dict, a ``[word, start, end]``
      sequence, or any other object (rendered with ``str``).

    Args:
        hf_out: Mapping with an optional ``"chunks"`` list in one of the
            layouts above.

    Returns:
        The words joined by newlines. A word with a known start time is
        written as ``[<start with 3 decimals>] <word>``, otherwise as the bare
        word. Surrounding whitespace is stripped from each word, and entries
        without any text are skipped.
    """
    lines = []
    for chunk in hf_out.get("chunks", []):
        words = chunk.get("words") or chunk.get("timestamps")
        if not words:
            # Flat layout: the chunk itself is the word entry.
            words = [chunk]
        for entry in words:
            word_text, start_ts = _word_and_start(entry)
            if isinstance(word_text, str):
                word_text = word_text.strip()
            if not word_text:
                # Nothing to print; avoids emitting a literal "None" line.
                continue
            if start_ts is None:
                lines.append(f"{word_text}")
            else:
                lines.append(f"[{start_ts:.3f}] {word_text}")
    return "\n".join(lines)

In [5]:
# Report whether CUDA is usable and which torch build is installed, one value per line.
print(torch.cuda.is_available(), torch.__version__, sep="\n")

False
2.8.0+cpu


In [6]:
# Default to CPU / float32; switch to the first GPU in half precision when CUDA is available.
device = "cpu"
torch_dtype = torch.float32
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
model_id = "nyrahealth/CrisperWhisper"

# Load the checkpoint and move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline that returns word-level timestamps.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,  # on model card
    batch_size=16,  # on model card
    return_timestamps="word",
    device=0 if device.startswith("cuda") else -1,  # GPU index 0, or -1 for CPU
)

Device set to use cpu


In [None]:
def _shift_span(span, offset):
    """Shift a ``(start, end)`` pair by ``offset``; None endpoints stay None.

    Returns None when ``span`` is not a list/tuple with at least two items.
    """
    if not (isinstance(span, (list, tuple)) and len(span) >= 2):
        return None
    return tuple(None if t is None else t + offset for t in span[:2])


def _shift_word(word, offset):
    """Return ``word`` with its timestamps moved ``offset`` seconds later.

    Dict entries and ``[word, start, end]`` sequences are understood; any
    other shape is returned unchanged.
    """
    if isinstance(word, dict):
        span = _shift_span(
            word.get("timestamp") or word.get("times") or word.get("start_end"),
            offset,
        )
        if span is None:
            return word
        return {**word, "timestamp": span}
    if isinstance(word, (list, tuple)) and len(word) >= 3:
        start, end = _shift_span(word[1:3], offset)
        return [word[0], start, end, *word[3:]]
    return word


def _segment_words(hf_out, offset):
    """Collect the words of one pipeline result, timestamps shifted by ``offset``."""
    if "chunks" not in hf_out:
        if "timestamps" in hf_out:
            return [_shift_word(w, offset) for w in hf_out["timestamps"]]
        # No word-level data at all: keep the segment text as a single entry.
        text = (hf_out.get("text") or "").strip()
        return [{"word": text}] if text else []

    words = []
    for chunk in hf_out["chunks"]:
        nested = chunk.get("words") or chunk.get("timestamps")
        if nested:
            words.extend(_shift_word(w, offset) for w in nested)
            continue
        # Flat layout: the chunk itself is one word with "text" and "timestamp".
        text = (chunk.get("text") or "").strip()
        if not text:
            continue
        entry = {"word": text}
        span = _shift_span(chunk.get("timestamp"), offset)
        if span is not None:
            entry["timestamp"] = span
        words.append(entry)
    return words


def transcribe_audio(
    audio_path: str,
    asr_pipeline,
    chunk_length_s: int = 30,
    overlap_s: float = 0.5,
) -> str:
    """Transcribe an audio file segment by segment and format the result.

    The audio is loaded at 16 kHz, cut into overlapping segments with
    ``chunk_audio``, and each segment is passed to ``asr_pipeline``. Word
    timestamps, which the pipeline reports relative to the segment, are
    shifted by the segment's start time so they refer to the whole file.

    Args:
        audio_path: Path of the audio file to transcribe.
        asr_pipeline: Callable taking ``{"array": ..., "sampling_rate": ...}``
            and returning a dict with ``"chunks"`` (flat word chunks or chunks
            with nested ``"words"``), ``"timestamps"``, or only ``"text"``.
        chunk_length_s: Segment length in seconds.
        overlap_s: Seconds shared by consecutive segments.

    Returns:
        The transcript produced by ``format_verbatim_output``.
    """
    audio, sr = load_audio(audio_path, target_sr=16000)
    segments = chunk_audio(audio, sr, chunk_s=chunk_length_s, overlap_s=overlap_s)

    # One nested chunk per audio segment: {"timestamp": ..., "words": [...]}.
    # NOTE: words spoken inside the overlap between two segments can be
    # recognised in both; no de-duplication is attempted here.
    combined = {"chunks": []}
    for samples, offset in segments:
        hf_out = asr_pipeline({"array": samples, "sampling_rate": sr})
        combined["chunks"].append(
            {"timestamp": (offset, None), "words": _segment_words(hf_out, offset)}
        )

    return format_verbatim_output(combined)

In [None]:
audio_file = r"data/audio/..."
output_txt = r"data/WER0/..."
PREVIEW_LINES = 50  # number of transcript lines shown in the preview

# Create the output folder before the slow transcription step, so a missing
# directory cannot cost us the finished transcript at save time.
Path(output_txt).parent.mkdir(parents=True, exist_ok=True)

print("Transcribing:", audio_file)
transcript_text = transcribe_audio(audio_file, asr_pipe, chunk_length_s=30, overlap_s=0.5)

# Show the first PREVIEW_LINES lines; flag truncation only when lines were left out.
transcript_lines = transcript_text.splitlines()
print("---- Transcript preview ----")
print("\n".join(transcript_lines[:PREVIEW_LINES]))
if len(transcript_lines) > PREVIEW_LINES:
    print("... (truncated)")

# Save the full transcript.
with open(output_txt, "w", encoding="utf-8") as f:
    f.write(transcript_text)

print(f"Saved full transcript to: {output_txt}")