In [3]:
from moviepy.audio.io.AudioFileClip import AudioFileClip
import uuid
from pathlib import Path

In [None]:
# Pinned dependencies for this notebook, written as comma-separated
# `name==version` entries with exactly one pin per package.
# accelerate, fastapi, moviepy, pydantic and torch match the versions reported
# by the `uv pip show` cell in this notebook.
# NOTE(review): the `uv pip show` output is cut off before uvicorn, so the
# uvicorn pin keeps the later of the two conflicting entries — confirm against
# the environment.
packages = """
accelerate==1.8.1,
moviepy==2.2.1,
torch==2.7.1,
torchaudio==2.7.1,
transformers==4.53.0,
pydantic-settings==2.9.0,
fastapi==0.115.14,
pydantic==2.11.7,
uvicorn==0.35.0
"""


In [6]:
# Shell escape: report the installed version and metadata of the main
# dependencies so the pins used by this notebook can be checked.
!uv pip show torch torchaudio transformers accelerate moviepy pydantic uvicorn fastapi

Name: accelerate
Version: 1.8.1
Location: /home/azureuser/whisper/.venv/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by:
---
Name: fastapi
Version: 0.115.14
Location: /home/azureuser/whisper/.venv/lib/python3.10/site-packages
Requires: pydantic, starlette, typing-extensions
Required-by:
---
Name: moviepy
Version: 2.2.1
Location: /home/azureuser/whisper/.venv/lib/python3.10/site-packages
Requires: decorator, imageio, imageio-ffmpeg, numpy, pillow, proglog, python-dotenv
Required-by:
---
Name: pydantic
Version: 2.11.7
Location: /home/azureuser/whisper/.venv/lib/python3.10/site-packages
Requires: annotated-types, pydantic-core, typing-extensions, typing-inspection
Required-by: fastapi
---
Name: torch
Version: 2.7.1
Location: /home/azureuser/whisper/.venv/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu

In [52]:
# Extract the audio track of the sample video and load it into memory as MP3
# bytes (`audio_bts`). The uuid suffix keeps repeated runs from colliding on
# the intermediate file name.
out_file = f"test_audio-{uuid.uuid4()}.mp3"
try:
    # The context manager closes the clip (and its underlying reader) even
    # when encoding fails, instead of leaving it open for the kernel lifetime.
    with AudioFileClip("test_audio.mp4") as audio:
        audio.write_audiofile(out_file)
    audio_bts = Path(out_file).read_bytes()
finally:
    # The MP3 is only a hand-off file; remove it on success and on failure.
    Path(out_file).unlink(missing_ok=True)



MoviePy - Writing audio in test_audio-e011ebfb-4bd7-4735-bec5-338a17ab1c12.mp3


chunk:   0%|                                                             | 0/706 [00:00<?, ?it/s, now=None]

                                                                                                           

MoviePy - Done.




In [25]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Run on the first GPU in half precision when CUDA is present; otherwise fall
# back to the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint and move it to the selected device in one expression.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor that the
# pipeline needs.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline that also returns chunk timestamps.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)

Device set to use cuda:0


In [37]:
import io
import torchaudio

In [38]:
# Function to process bytes
def bts_to_np(audio_bytes: bytes, target_sample_rate: int = 16000):
    """Decode encoded audio bytes into a mono, 1-D numpy waveform.

    Args:
        audio_bytes: Contents of an audio file that ``torchaudio.load`` can
            decode from a file-like object.
        target_sample_rate: Sample rate (Hz) of the returned waveform.
            Defaults to 16000 because Whisper expects 16kHz audio.

    Returns:
        A 1-D numpy array holding the (down-mixed, resampled) waveform.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
    """
    if not audio_bytes:
        raise ValueError("audio_bytes is empty; nothing to decode")

    # Convert bytes to waveform using torchaudio
    with io.BytesIO(audio_bytes) as audio_file:
        waveform, sample_rate = torchaudio.load(audio_file)

    # Convert to mono first so that only a single channel has to be resampled.
    # Averaging and resampling are both linear, so the order does not change
    # the result beyond floating-point rounding.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Bring the audio to the requested rate (Whisper expects 16kHz audio).
    if sample_rate != target_sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_sample_rate
        )

    # Drop only the leading channel axis: a bare squeeze() would also collapse
    # a one-sample clip into a 0-d array instead of a 1-D waveform.
    waveform_np = waveform.squeeze(0).numpy()
    return waveform_np

In [54]:
# Decode the in-memory MP3 bytes into the numpy waveform passed to the pipeline.
waveform_np = bts_to_np(audio_bts)

In [55]:
# Transcribe the decoded waveform; `pipe` was built with return_timestamps=True.
result = pipe(waveform_np)



In [57]:
# Keep only the transcript string from the pipeline result.
transcription = result["text"]

In [58]:
# Display the raw transcript (note the leading space in the output).
transcription

' Then we need to create a ScreenPipe MCP server so we can query the transcription. And we need to try out the GitHub MCP server for creating tickets. And lastly, we need to try to run the end-to-end flow. Thank you.'

In [64]:
# Transcribe again, this time handing the pipeline a file path instead of an
# in-memory array. This rebinds `result`.
# NOTE(review): no cell visible here creates "test_audio.mp3" (the uuid-named
# MP3 written earlier is deleted after it is read) — confirm the file exists.
result = pipe("test_audio.mp3")



In [65]:
# Display the full pipeline result: the transcript under 'text' plus the
# timestamped segments under 'chunks'.
result

{'text': ' Then we need to create a ScreenPipe MCP server so we can query the transcription. And we need to try out the GitHub MCP server for creating tickets. And lastly, we need to try to run the end-to-end flow. Thank you.', 'chunks': [{'timestamp': (0.0, 10.0), 'text': ' Then we need to create a ScreenPipe MCP server so we can query the transcription.'}, {'timestamp': (10.0, 18.0), 'text': ' And we need to try out the GitHub MCP server for creating tickets.'}, {'timestamp': (18.0, 23.0), 'text': ' And lastly, we need to try to run the end-to-end flow.'}, {'timestamp': (30.0, 59.98), 'text': ' Thank you.'}]}

In [72]:
# Scratch copy of the transcript string from the file-path run.
x = result["text"]

In [73]:
# Display the scratch transcript string.
x

' Then we need to create a ScreenPipe MCP server so we can query the transcription. And we need to try out the GitHub MCP server for creating tickets. And lastly, we need to try to run the end-to-end flow. Thank you.'

In [76]:
# Final transcript: strip the surrounding whitespace (the raw text starts with
# a space, as the earlier outputs show).
transcription = result["text"].strip()

In [77]:
# Display the cleaned transcript.
transcription

'Then we need to create a ScreenPipe MCP server so we can query the transcription. And we need to try out the GitHub MCP server for creating tickets. And lastly, we need to try to run the end-to-end flow. Thank you.'

In [None]:


# Sanity-check the pipeline on a hosted sample: load the "validation" split of
# the "clean" configuration of distil-whisper/librispeech_long.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# Take the "audio" entry of the first row.
# NOTE(review): presumably a dict carrying the decoded array and its sampling
# rate — confirm against the dataset schema.
sample = dataset[0]["audio"]

# Transcribe the sample; this rebinds `result` from the earlier cells.
result = pipe(sample)
print(result["text"])
