# Wav2Vec2 Automatic Transcription

In [None]:
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

The `Wav2Vec2Processor` can be used for tokenization as well as feature extraction, depending on how its `__call__` method is invoked. More info [here](https://huggingface.co/transformers/model_doc/wav2vec2.html#transformers.Wav2Vec2Processor.__call__).

In [None]:
# Load the pretrained CTC model together with its matching processor.
# Both come from the same checkpoint, so the id is spelled out only once.
CHECKPOINT = "facebook/wav2vec2-large-robust-ft-libri-960h"
processor = Wav2Vec2Processor.from_pretrained(CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(CHECKPOINT)

In [None]:
# define speech-to-text function for wav2vec2 
def asr_transcript(audio_file):
    """Transcribe an audio file to lower-cased text with Wav2Vec2.

    The file is read block by block through ``librosa.stream``; every block
    is run through the model on its own and the per-block transcriptions are
    concatenated in order.

    Args:
        audio_file: Path of the audio file to transcribe. The audio is
            assumed to be sampled at 16 kHz: the frame size used for
            streaming and the ``sampling_rate`` handed to the processor are
            both hard-coded to 16000 and nothing here resamples.

    Returns:
        The transcript as one string, where each block's text is lower-cased
        and followed by ``". "``. An empty string if the stream yields no
        blocks.
    """
    # 20 non-overlapping frames (hop == frame length) of 16000 samples each,
    # i.e. 20-second blocks provided the file really is 16 kHz.
    stream = librosa.stream(
        audio_file, block_length=20, frame_length=16000, hop_length=16000
    )

    pieces = []
    for speech in stream:
        if speech.ndim > 1:
            # Multi-channel blocks are laid out as (channels, samples) by
            # librosa, so the down-mix has to reduce over axis 0. The
            # previous `speech[:, 0] + speech[:, 1]` added the first two
            # samples of each channel instead of the channels themselves.
            # NOTE(review): librosa.stream defaults to mono=True, so this
            # branch should only run if that default is overridden - confirm.
            speech = speech.mean(axis=0)

        input_values = processor(
            speech, sampling_rate=16_000, return_tensors="pt"
        ).input_values
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        pieces.append(transcription.lower() + ". ")

    # Collect and join once instead of growing a string inside the loop.
    return "".join(pieces)


In [None]:
# Build the web UI: an audio-upload widget feeds the file path to
# `asr_transcript`, and the returned text is shown in a textbox.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces look like the older
# Gradio component API - confirm against the installed Gradio version.
audio_input = gr.inputs.Audio(label="Upload Audio File", type="filepath")
transcript_output = gr.outputs.Textbox(
    label="Automatically Transcribed by facebook/wav2vec2-large-robust-ft-libri-960h"
)

gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=audio_input,
    outputs=transcript_output,
    title="Automatic Transcription with Wav2Vec2",
    description="Upload a flac/wav audio with a sampling rate of 16kHz",
)

# Start the local Gradio server for the interface.
gradio_ui.launch()