In [1]:
from huggingface_hub import login
import os

# Retrieve the token from Colab secrets
hf_token = os.getenv("HF_TOKEN")

# Log in to Hugging Face
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
import gradio as gr
import soundfile as sf
import numpy as np
import librosa
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor
)

In [5]:
def load_whisper_model(model_name="openai/whisper-base"):
    try:
        # Determine device
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

        # Load model and processor
        model = WhisperForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # Force float32 to avoid dtype issues
            low_cpu_mem_usage=True
        )
        processor = WhisperProcessor.from_pretrained(model_name)

        # Move model to device
        model.to(device)

        return model, processor, device

    except Exception as e:
        print(f"Model loading error: {e}")
        return None, None, None

In [7]:
# Global model loading
MODEL_NAME = "openai/whisper-base"
MODEL, PROCESSOR, DEVICE = load_whisper_model(MODEL_NAME)

In [8]:
def prepare_audio(audio_path):
    try:
        # Use librosa to load and resample audio
        waveform, original_sample_rate = librosa.load(audio_path, sr=None)

        # Resample to 16kHz if necessary
        if original_sample_rate != 16000:
            waveform = librosa.resample(
                waveform,
                orig_sr=original_sample_rate,
                target_sr=16000
            )

        # Ensure mono audio
        if waveform.ndim > 1:
            waveform = waveform.mean(axis=1)

        print(f"Audio prepared - Sample Rate: 16000, Length: {len(waveform)}")
        return waveform, 16000

    except Exception as e:
        print(f"Audio preparation error: {e}")
        return None, None

In [9]:
def transcribe_audio(audio_path):
    try:
        # Prepare audio with guaranteed 16kHz sampling rate
        waveform, sample_rate = prepare_audio(audio_path)

        if waveform is None:
            return "Failed to load audio file"

        # Prepare inputs with explicit dtype handling
        inputs = PROCESSOR(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt"
        )

        # Ensure inputs are on the correct device and dtype
        input_features = inputs.input_features.to(DEVICE, dtype=torch.float32)

        # Generate transcription
        generated_ids = MODEL.generate(input_features)

        # Decode the transcription
        transcription = PROCESSOR.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]

        return transcription

    except Exception as e:
        print(f"Transcription error: {e}")
        return f"Error during transcription: {str(e)}"

In [10]:
def process_audio(audio):
    try:
        # Handle different input types
        if audio is None:
            return "No audio file provided"

        # If audio is a numpy array, save to file
        if isinstance(audio, np.ndarray):
            temp_audio_path = "temp_audio.wav"
            # Resample to 16kHz if necessary
            if audio.size > 0:
                # Determine original sample rate (assume it might be different)
                original_sr = 44100  # Common sample rate, can be adjusted
                resampled_audio = librosa.resample(
                    audio,
                    orig_sr=original_sr,
                    target_sr=16000
                )
                sf.write(temp_audio_path, resampled_audio, 16000)
            else:
                return "Empty audio input"
            audio = temp_audio_path

        # Transcribe the audio
        transcription = transcribe_audio(audio)

        return transcription

    except Exception as e:
        return f"Processing error: {str(e)}"

In [11]:
def create_gradio_interface():
    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload or Record Audio"
            )
        ],
        outputs=[
            gr.Textbox(
                label="Transcription",
                lines=5
            )
        ],
        title="🎙️ Whisper Transcription",
        description="Upload an audio file or record audio to transcribe",
        examples=[
            ["path/to/sample1.mp3"],
            ["path/to/sample2.wav"]
        ]
    )

    return interface

In [12]:
def main():
    # Check if model is loaded successfully
    if MODEL is None or PROCESSOR is None:
        print("Failed to load Whisper model. Please check your setup.")
        return

    # Create and launch interface
    interface = create_gradio_interface()
    interface.launch(debug=True)

# Installation Requirements
# !pip install -U transformers torch torchaudio gradio soundfile librosa

# Run the main function
if __name__ == "__main__":
    main()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://edb8e6347931509212.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Audio prepared - Sample Rate: 16000, Length: 176128


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Audio prepared - Sample Rate: 16000, Length: 159744
Audio prepared - Sample Rate: 16000, Length: 175211
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://edb8e6347931509212.gradio.live
