In [1]:
import os
import whisper
import torchaudio
import torch
from whisper.audio import log_mel_spectrogram, pad_or_trim

# Pick the compute device: use CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [2]:
# Root folder of the corpus that is scanned for audio files.
audio_root = "amicorpus"

# Folder passed to Whisper as its download root for model weights.
custom_model_dir = "./models/whisper"

# Load the Whisper "medium" model onto the selected device.
model = whisper.load_model(
    "medium",
    device=device,
    download_root=custom_model_dir,
)

In [3]:
# Report the CUDA memory PyTorch currently holds, converted from bytes
# by dividing by 2**20 (the same divisor as 1024**2).
allocated = torch.cuda.memory_allocated() / (1 << 20)
reserved = torch.cuda.memory_reserved() / (1 << 20)
print(f"{allocated:.2f} MB allocated, {reserved:.2f} MB reserved")

2914.90 MB allocated, 4378.00 MB reserved


In [4]:
def transcribe_audio(audio_path, chunk_duration_sec=30):
    """
    Transcribe long audio using OpenAI Whisper with manual chunking.

    The file is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz and cut into consecutive, non-overlapping windows of
    ``chunk_duration_sec`` seconds. Every window is zero-padded to Whisper's
    fixed 30-second input length, decoded independently with the module-level
    ``model`` (English, half precision only when the model is on CUDA), and the
    non-empty texts are joined with single spaces.

    Windows are cut at fixed positions, so a word that straddles a boundary
    may be split between two chunks.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.
        chunk_duration_sec: Window length in seconds. Must be positive and at
            most 30, because the Whisper encoder accepts exactly 30 s of audio.

    Returns:
        str: The concatenated transcript ("" when the file has no samples or
        nothing was recognised).

    Raises:
        ValueError: If ``chunk_duration_sec`` is not within (0, 30] seconds.
    """
    target_sample_rate = 16000  # Whisper operates on 16 kHz audio
    max_chunk_sec = 30          # fixed input length of the Whisper encoder

    chunk_size = int(target_sample_rate * chunk_duration_sec)
    if chunk_size < 1 or chunk_size > target_sample_rate * max_chunk_sec:
        raise ValueError(
            f"chunk_duration_sec must be in (0, {max_chunk_sec}], "
            f"got {chunk_duration_sec!r}"
        )

    # torchaudio returns a (channels, frames) tensor.
    waveform, sample_rate = torchaudio.load(audio_path)

    # Down-mix to mono by averaging the channels. squeeze() only removed the
    # channel axis for single-channel files; for stereo input the first axis
    # stayed the channel axis and the chunking below sliced channels.
    waveform = waveform.mean(dim=0)

    # Resample to 16000 Hz
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    total_samples = waveform.shape[0]

    # Half precision is only usable on the GPU; decoding fp16 input with a
    # CPU-resident float32 model fails. The options are loop-invariant.
    on_cuda = model.device.type == "cuda"
    options = whisper.DecodingOptions(language="en", fp16=on_cuda)

    transcriptions = []

    for start in range(0, total_samples, chunk_size):
        # Slicing past the end is safe; the last chunk is simply shorter.
        chunk = waveform[start:start + chunk_size]

        # Always pad to Whisper's default 30 s length (not to chunk_size):
        # the encoder rejects spectrograms of any other length.
        chunk = pad_or_trim(chunk)

        # Convert to log-Mel spectrogram with the number of mel bins this
        # model was built for, then move it to the model's device.
        mel = log_mel_spectrogram(chunk, n_mels=model.dims.n_mels).to(model.device)

        # Decode
        result = whisper.decode(model, mel, options)

        if isinstance(result, list):
            text = " ".join(r.text.strip() for r in result)
        else:
            text = result.text.strip()

        # Skip empty results so silent chunks do not create double spaces.
        if text:
            transcriptions.append(text)

    # Release cached GPU blocks once per file instead of once per chunk.
    if on_cuda:
        torch.cuda.empty_cache()

    return " ".join(transcriptions)

In [5]:
# Walk through all audio files under audio_root and write one transcript
# (.txt with the same base name) next to each .wav file.
for root, dirs, files in os.walk(audio_root):
    # Sorting in place makes os.walk descend in a stable, reproducible order.
    dirs.sort()
    for file in sorted(files):
        # Match the extension case-insensitively so ".WAV" is not skipped.
        if not file.lower().endswith(".wav"):
            continue

        audio_path = os.path.join(root, file)
        txt_path = os.path.splitext(audio_path)[0] + ".txt"

        # Skip if transcript already exists
        if os.path.exists(txt_path):
            print(f"Skipping {audio_path} — transcript already exists.")
            continue

        print(f"Transcribing {audio_path}...")
        # Write to a temporary file first and rename it afterwards, so an
        # interrupted run never leaves a truncated .txt that the exists-check
        # above would skip on the next run.
        tmp_path = txt_path + ".part"
        try:
            transcript = transcribe_audio(audio_path)
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(transcript)
            os.replace(tmp_path, txt_path)
            print(f"Saved transcript to {txt_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Failed to process {audio_path}: {e}")

Skipping amicorpus\ES2009a\ES2009a.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2009b\ES2009b.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2009c\ES2009c.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2009d\ES2009d.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2010a\ES2010a.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2010b\ES2010b.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2010c\ES2010c.Mix-Headset.wav — transcript already exists.
Transcribing amicorpus\ES2010d\ES2010d.Mix-Headset.wav...
Saved transcript to amicorpus\ES2010d\ES2010d.Mix-Headset.txt
Skipping amicorpus\ES2011a\ES2011a.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2011b\ES2011b.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2011c\ES2011c.Mix-Headset.wav — transcript already exists.
Skipping amicorpus\ES2011d\ES2011d.Mix-Headset.wav — transcript already exists.
