In [1]:
import whisper

In [2]:
model = whisper.load_model("base")
model

100%|███████████████████████████████████████| 139M/139M [00:12<00:00, 12.0MiB/s]


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-5): 6 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=512, out_features=512, bias=True)
          (key): Linear(in_features=512, out_features=512, bias=False)
          (value): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (attn_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (mlp_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((512,), eps=1e-05,

In [None]:
import whisper
import sounddevice as sd
import numpy as np
import queue
import threading

# Constants
SAMPLE_RATE = 16000
BLOCK_SIZE = 4000  # ~0.25s of audio

# Load Whisper model (base/small/medium/large)
model = whisper.load_model("base")

# Queue to hold audio blocks
q = queue.Queue()

# Audio callback
def callback(indata, frames, time, status):
    if status:
        print(status)
    q.put(indata.copy())

# Transcribe audio chunks
def transcribe():
    print("Listening...")
    buffer = np.zeros((0, 1), dtype=np.float32)

    while True:
        audio_chunk = q.get()
        buffer = np.concatenate((buffer, audio_chunk))

        # If buffer is larger than 5 seconds, transcribe the latest 5 seconds
        if len(buffer) >= SAMPLE_RATE * 5:
            segment = buffer[-SAMPLE_RATE*5:]  # Last 5 seconds
            audio_np = np.squeeze(segment)

            # Run Whisper
            result = model.transcribe(audio_np, fp16=False, language='en')
            print("➤", result['text'].strip())

# Start audio stream
stream = sd.InputStream(callback=callback, channels=1, samplerate=SAMPLE_RATE, blocksize=BLOCK_SIZE)
with stream:
    transcribe()


In [36]:
import gc
gc.collect()

3537

In [35]:
import whisper
import numpy as np
import time
from pydub import AudioSegment

# Load Whisper model
model = whisper.load_model("base")

# Load full MP3 and convert to mono 16kHz
audio = AudioSegment.from_file("testing_medium.mp3")
audio = audio.set_channels(1).set_frame_rate(16000)

chunk_duration_ms = 30000  # 5 seconds
num_chunks = len(audio) // chunk_duration_ms

print(f"🔊 Total audio length: {len(audio)/1000:.2f} seconds")
print("🎧 Streaming...")

for i in range(num_chunks):
    chunk = audio[i * chunk_duration_ms: (i + 1) * chunk_duration_ms]
    
    # Convert to numpy float32 array
    samples = np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0

    # Transcribe using Whisper
    result = model.transcribe(samples, fp16=False, language="en")
    print(f"🕒 Chunk {i+1}: {result['text'].strip()}")

    time.sleep(5)  # simulate real-time delay


🔊 Total audio length: 740.46 seconds
🎧 Streaming...
🕒 Chunk 1: A few years ago, I broke into my own house. I had just driven home. It was around midnight in the dead of Montreal winner. I'd been visiting my friend Jeff across town. And the thermometer on the front porch read minus 40 degrees. And don't bother asking if that sells...
🕒 Chunk 2: use her Fahrenheit minus 40 is where the two scales meet. It was very cold. And as I stood on the front porch fumbling in my pockets, I found I didn't have my keys. In fact, I could see them through the window, lying on the dining room table where I had left them. So I quickly ran around and tried all the other doors and windows and they were locked tight. I thought about calling a locksmith, at least I had my cell phone, but at midnight, it could take a while for a locksmith to show up, and it was cold.
🕒 Chunk 3: I couldn't go back to my friend Jeff's house for the night because I had an early flight to Europe the next morning and I needed to g

In [32]:
import whisper
model = whisper.load_model("base")

# Pass file path
result = model.transcribe("testing.mp3", language="en")
print("📝 Transcript:", result['text'])

📝 Transcript:  If you want to change the world, start off by making your bed. If you make your bed every morning, you will have accomplished the first task of the day. It will give you a small, since a pride, and it will encourage you to do another task, and another, and another. And by the end of the day, that one task completed will have turned into many tasks completed. And your bed will also reinforce the fact that the little things in life matter. If you can't do the little things right, you'll never be able to do the big things right. And if by chance you have a miserable day, you will come home to a bed that is made. That you made, and a made bed gives you encouragement that tomorrow will be better. I've been a Navy SEAL for 36 years. Every morning in SEAL training, my instructors, who at the time were all Vietnam veterans, would show up in my barritch room, and the first thing they'd do was inspect my bed. If you did it right, the corners would be square, the covers would be pu

KeyboardInterrupt: 

In [31]:
import gc
gc.collect()

6135

In [None]:
import whisper

model = whisper.load_model("base")
audio = whisper.load_audio("testing.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)


Detected language: en
If you want to change the world, start off by making your bed. If you make your bed every morning, you will have accomplished the first task of the day. It will give you a small, since a pride, and it will encourage you to do another task, and another, and another. And by the end of the day, that one task completed will have turned into many tasks completed.


In [13]:
import whisper

model = whisper.load_model("base")
audio = whisper.load_audio("testing_medium.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)


Detected language: en
A few years ago, I broke into my own house. I had just driven home. It was around midnight in the dead of Montreal winner. I'd been visiting my friend Jeff across town. And the thermometer


In [14]:
import whisper

model = whisper.load_model("base")
audio = whisper.load_audio("testing_large.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)


Detected language: en
Change the world. It's 24. It's an honor to present to you the one and only, the goat, the legend, Graham Weaver.


In [21]:
audio = whisper.load_audio("testing_large.mp3")
len(audio)

32106324

In [22]:
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

AssertionError: incorrect audio shape

In [20]:
len(audio)

480000

In [16]:
result.text

"Change the world. It's 24. It's an honor to present to you the one and only, the goat, the legend, Graham Weaver."