In [None]:
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Load pre-trained model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def transcribe_wav2vec(audio_path):
    # Load audio
    audio, rate = librosa.load(audio_path, sr=16000)
    input_values = tokenizer(audio, return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]
    print("Transcription:", transcription.lower())
    return transcription.lower()

# Example usage
transcribe_wav2vec("harvard.wav")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transcription: the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle tastes fine with ham takos al pastore are my favorite a zestful food is the hot cross bun


'the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle tastes fine with ham takos al pastore are my favorite a zestful food is the hot cross bun'

In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [4]:
# Let's bring in the libraries we need for audio processing and the UI
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr

# Load the pre-trained Wav2Vec2 model and tokenizer from Hugging Face
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Function to transcribe audio to text using Wav2Vec2
def transcribe_wav2vec(audio_input, progress=gr.Progress()):
    # Check if we got an audio input
    if audio_input is None:
        return None, "Please upload an audio file or record something."

    try:
        # Show a progress message to let the user know we're working
        progress(0.1, desc="Processing audio...")

        # Load the audio file and ensure it's at 16kHz sample rate
        audio, rate = librosa.load(audio_input, sr=16000)

        # Convert audio to input tensors for the model
        input_values = tokenizer(audio, return_tensors="pt", padding="longest").input_values

        # Run the model without gradient calculations for efficiency
        with torch.no_grad():
            logits = model(input_values).logits

        # Get the most likely transcription from the model's output
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode the IDs into human-readable text
        transcription = tokenizer.batch_decode(predicted_ids)[0]

        # Return the transcription in lowercase for consistency
        return transcription.lower(), None

    except Exception as e:
        # If something goes wrong, let the user know
        return None, f"Oops, something went wrong: {str(e)}"

# Function to provide a sample audio path for testing
def load_sample_audio():
    # Return a placeholder path (replace with actual path if you have a sample)
    return "harvard.wav"  # Replace with a real sample audio file path if available

# Custom CSS for a clean, professional, and aesthetic look
custom_css = """
body {
    font-family: 'Arial', sans-serif;
}
.gr-button {
    border-radius: 12px !important;
    padding: 12px 24px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
}
.gr-button:hover {
    opacity: 0.9 !important;
    transform: scale(1.02) !important;
}
.gr-textbox, .gr-audio {
    border-radius: 12px !important;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important;
    padding: 15px !important;
}
.card {
    background: #ffffff !important;
    border-radius: 16px !important;
    box-shadow: 0 6px 12px rgba(0, 0, 0, 0.15) !important;
    padding: 20px !important;
    margin-bottom: 20px !important;
}
.header {

    color: white !important;
    padding: 20px !important;
    border-radius: 12px !important;
    text-align: center !important;
}
"""

# Build the Gradio interface with a modern, user-friendly design
with gr.Blocks(theme=gr.themes.Monochrome(), css=custom_css) as interface:
    # Add a stylish header to welcome users
    gr.Markdown(
        """
        <div class="header">
            <h1>🎙️ Speech-Recognization-System</h1>
            <p>Upload an audio file or record your voice to get an accurate transcription using Wav2Vec2 AI.</p>
        </div>
        """
    )

    with gr.Row():
        # Input section for audio
        with gr.Column(scale=3):
            with gr.Group(elem_classes="card"):
                gr.Markdown("### 🎵 Audio Input")
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    sources=["upload", "microphone"],
                    type="filepath"
                )
                with gr.Row():
                    sample_button = gr.Button("Load Sample Audio", variant="secondary")
                    clear_button = gr.Button("Clear", variant="secondary")

        # Output section for transcription
        with gr.Column(scale=2):
            with gr.Group(elem_classes="card"):
                gr.Markdown("### 📜 Transcription")
                transcription_output = gr.Textbox(
                    label="Transcribed Text",
                    placeholder="Your transcription will appear here...",
                    lines=8,
                    max_lines=12,
                    interactive=False
                )
                error_output = gr.Textbox(
                    label="Status",
                    placeholder="Status messages will appear here...",
                    interactive=False,
                    visible=False
                )

    # Summarize button outside the cards for prominence
    transcribe_button = gr.Button("Transcribe", variant="primary")

    # Connect buttons to functions
    transcribe_button.click(
        fn=transcribe_wav2vec,
        inputs=[audio_input],
        outputs=[transcription_output, error_output]
    )
    sample_button.click(
        fn=load_sample_audio,
        inputs=None,
        outputs=[audio_input]
    )
    clear_button.click(
        fn=lambda: (None, None, None),
        inputs=None,
        outputs=[audio_input, transcription_output, error_output]
    )

# Launch the Gradio interface
if __name__ == "__main__":
    interface.launch()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7dc67f52bc8cd9c989.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
