In [1]:
# !pip install -q insanely-fast-whisper
# !pip install -q flash-attn --no-build-isolation
# !pip install gradio

In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from optimum.bettertransformer import BetterTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Choose the execution target once: first CUDA GPU in half precision when
# available, otherwise CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

In [4]:
# Hub identifier used for both the model weights and the matching processor.
model_id = "distil-whisper/distil-medium.en"

# Load the seq2seq speech model in the dtype chosen for the current device.
# Optional accelerations are intentionally left off here: the original note
# says Flash Attention 2 (`use_flash_attention_2=True`) is not supported on
# Colab, and the BetterTransformer alternative (`model.to_bettertransformer()`)
# is likewise not applied.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor handed to the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Speech-recognition pipeline built around the already-loaded model and the
# tokenizer / feature extractor taken from its processor.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
    batch_size=16,
    chunk_length_s=15,  # chunked decoding for long-form transcription
    max_new_tokens=128,
)

In [9]:
import gradio as gr
from transformers import pipeline
import numpy as np

def transcribe(filepath):
    """Transcribe the audio file at ``filepath`` using the module-level ``pipe``.

    Args:
        filepath: Path to an audio file, or ``None`` when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string when
        ``filepath`` is ``None``.
    """
    # The Gradio audio input yields None when the form is submitted without a
    # recording or upload; forwarding that to the pipeline would raise, so
    # answer with an empty transcript instead.
    if filepath is None:
        return ""
    return pipe(filepath)["text"]


import gradio as gr

# Single-form UI: one audio input (file upload or microphone recording, passed
# to the callback as a file path) mapped to a plain-text transcript.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", sources=["upload", "microphone"]),
    outputs="text",
    title="My Audio Transcription App Powered by Distill Whisper",
    description="Start recording",
)

# Start the local web server for the interface.
demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.




In [None]:
import gradio as gr
from transformers import pipeline
import numpy as np


def transcribe(filepath):
    """Transcribe the audio file at ``filepath`` using the module-level ``pipe``.

    Args:
        filepath: Path to an audio file, or ``None`` when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or an empty string when
        ``filepath`` is ``None``.
    """
    # The Gradio audio input yields None when the form is submitted without an
    # upload; forwarding that to the pipeline would raise, so answer with an
    # empty transcript instead.
    if filepath is None:
        return ""
    return pipe(filepath)["text"]


def transcribe_streaming(stream, new_chunk):
    """Append one microphone chunk to the running stream and re-transcribe it.

    Args:
        stream: Audio accumulated by earlier calls as a 1-D float32 array, or
            ``None`` on the first call.
        new_chunk: ``(sampling_rate, samples)`` tuple for the newest chunk.

    Returns:
        ``(stream, text)``: the updated accumulated audio and the ``"text"``
        entry produced by the module-level ``pipe`` for that whole stream.
    """
    sr, y = new_chunk

    # Multi-channel chunks arrive as (samples, channels); the pipeline is fed a
    # single 1-D signal, so average the channels down to mono first.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Peak-normalise the chunk. An empty chunk has no peak and a silent chunk
    # has peak 0; dividing by that would raise or fill the stream with NaN,
    # so such chunks are appended unscaled.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, pipe({"sampling_rate": sr, "raw": stream})["text"]


import gradio as gr

# Outer container that hosts the two tabs built below.
demo = gr.Blocks()

# Microphone tab: streams audio chunks into `transcribe_streaming`; the
# "state" input/output pair carries the accumulated audio between calls.
mic_transcribe = gr.Interface(
    fn=transcribe_streaming,
    inputs=["state", gr.Audio(streaming=True, sources="microphone")],
    outputs=["state", "text"],
    live=True,
    title="My Audio Transcription App Powered by Distill Whisper",
    description="Start recording",
)


# File tab: hands the uploaded file's path to `transcribe`.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", sources="upload"),
    outputs=gr.Textbox(),
    title="My Audio Transcription App Powered by Distill Whisper",
    description="Upload an audio file",
)


# Shut down any Gradio servers still running from earlier cell executions.
gr.close_all()

with demo:
    gr.TabbedInterface(
        interface_list=[file_transcribe, mic_transcribe],
        tab_names=["Transcribe Audio File", "Transcribe Microphone"],
    )

demo.launch(debug=True)


Closing server running on port: 7860
Closing server running on port: 7860
Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


