# WhisperX


In [None]:
# Imports
import torch
import whisperx

: 

In [3]:
def transcribe_with_whisperx(audio_file, model_name="large-v2", language_code="pa"):
    """
    Transcribe an audio file with WhisperX, then align it to word level.

    Args:
      audio_file (str): Path to your audio (e.g., .wav, .mp3); handed to
                        ``whisperx.load_audio``.
      model_name (str): Whisper model variant, e.g., "tiny", "base", "medium", "large-v2".
      language_code (str): Language code used to select the *alignment* model
                          (e.g., "en", "hi", "es"). It is not passed to the
                          transcription call, which runs without an explicit language.
                          NOTE(review): the default "pa" may not have an alignment
                          model in WhisperX -- confirm against the WhisperX docs.

    Returns:
      dict: ``{"language": ..., "segments": ...}`` where "language" is taken from
      the transcription result and "segments" from the alignment result.
    """
    on_gpu = torch.cuda.is_available()
    device = "cuda" if on_gpu else "cpu"
    # float16 is the library default but is only usable on GPU; the CPU backend
    # rejects it, so fall back to int8 there (the setting the WhisperX README
    # recommends for CPU runs).
    compute_type = "float16" if on_gpu else "int8"

    # 1) Load the main Whisper model
    print(f"Loading Whisper model {model_name} on {device}...")
    model = whisperx.load_model(model_name, device=device, compute_type=compute_type)

    # 2) Transcribe the audio
    print(f"Transcribing {audio_file}...")
    audio = whisperx.load_audio(audio_file)
    # batch_size=16 is an example; adjust as needed for your GPU/CPU capabilities
    transcription_result = model.transcribe(audio, batch_size=16)

    # Only the "segments" and "language" keys of transcription_result are used below.

    # 3) Load alignment model
    #    Make sure the language_code is supported by the alignment model in WhisperX
    print(f"Loading alignment model for language={language_code}...")
    alignment_model, metadata = whisperx.load_align_model(
        language_code=language_code,
        device=device
    )

    # 4) Align the transcribed segments to get word-level timestamps
    print("Aligning results...")
    aligned_result = whisperx.align(
        transcription_result["segments"],  # the segments from transcription
        alignment_model,
        metadata,
        audio,
        device=device
    )

    return {
        "language": transcription_result["language"],
        "segments": aligned_result["segments"]
    }

: 

In [4]:

if __name__ == "__main__":
    # Demo driver: transcribe one Punjabi recording and print segment- and
    # word-level timings.
    # NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
    AUDIO_FILE =  "/home/arjun/naren/subtitle_alignment/data/Punjabi/Audio/AniBook Videos/Abdul_Kalam,_Missile_Man_Punjabi.wav"
    # If trying Punjabi, you might pass language_code="hi" or "en" to see if alignment is possible.
    # Officially, WhisperX might not have a dedicated 'pa' alignment model yet.

    # For best results, pick a code that is closest to or partially supports your language.
    # Or skip alignment if your language is not supported (only segment-level times).
    language_code_for_alignment = "hi"

    result = transcribe_with_whisperx(
        AUDIO_FILE,
        model_name="tiny",
        language_code=language_code_for_alignment
    )

    print("\n=== Transcription & Alignment Results ===")
    print(f"Detected language: {result['language']}")
    print("Segments (with word-level timestamps):")
    for seg in result["segments"]:
        seg_start = seg["start"]
        seg_end = seg["end"]
        seg_text = seg["text"]
        print(f"[{seg_start:.2f}s - {seg_end:.2f}s]: {seg_text}")

        # Word-level details. The aligner can return words without "start"/"end"
        # (e.g. tokens it could not match against the alignment model's
        # characters), so read the timings defensively instead of indexing.
        for w in seg.get("words", []):
            w_text = w["word"]
            w_start = w.get("start")
            w_end = w.get("end")
            if w_start is None or w_end is None:
                print(f"    -> {w_text} [no timestamp]")
            else:
                print(f"    -> {w_text} [{w_start:.2f}s - {w_end:.2f}s]")


Loading Whisper model tiny on cuda...


config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../miniconda3/envs/alignment/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
Transcribing /home/arjun/naren/subtitle_alignment/data/Punjabi/Audio/AniBook Videos/Abdul_Kalam,_Missile_Man_Punjabi.wav...


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

