<a href="https://colab.research.google.com/github/Vaibhavs10/optimise-my-whisper/blob/main/insanely_fast_whisper_fp16_sdpa_spec_dec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Insanely Fast Whisper: A journey to build the fastest possible transcription with Whisper 🔥

By: [Vaibhav (VB) Srivastav](https://twitter.com/reach_vb)

## fp16 + SDPA + Speculative Decoding

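This notebook combines half-precision (fp16) weights, scaled dot-product attention (SDPA), and speculative decoding with a Distil-Whisper assistant model to speed up Whisper inference.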

### Set up our inference environment 🧑‍💻

In [1]:
!pip install -q --upgrade transformers accelerate torch ipython-autotime


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Setting up the utilities to track time taken by each step ⏳

In [2]:
# Load the ipython-autotime extension; once loaded, a "time: ..." line is
# reported after each executed cell.
%load_ext autotime


time: 0 ns (started: 2024-11-27 17:15:05 +05:30)


### Necessary imports 🔧




In [3]:
import torch

time: 2.14 s (started: 2024-11-27 17:15:05 +05:30)


In [4]:
from transformers import AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoProcessor, pipeline

  from .autonotebook import tqdm as notebook_tqdm


time: 5.94 s (started: 2024-11-27 17:15:07 +05:30)


### Define device and datatype 🔉

In [5]:
# Use the first CUDA GPU with half precision when available; otherwise fall
# back to the CPU with full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)
print(device, torch_dtype)

cuda:0 torch.float16
time: 31 ms (started: 2024-11-27 17:15:13 +05:30)


### Set up the assistant model 🐐

In [None]:
# Checkpoint of the assistant model that is later handed to the pipeline
# through `generate_kwargs` for speculative decoding.
assistant_model_id = "distil-whisper/distil-large-v3"

assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",  # scaled dot-product attention
)
# Move the weights to the selected device. As the last expression of the cell,
# the returned model is also displayed as the cell output.
assistant_model.to(device)

### Set up the primary model 💎

In [None]:
# Checkpoint of the primary model; the same id is reused later to load the
# matching processor.
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",  # scaled dot-product attention
)
# Move the weights to the selected device. As the last expression of the cell,
# the returned model is also displayed as the cell output.
model.to(device)

### Load the processor and initialise the speech recognition pipeline ⚡

In [None]:
# The processor supplies the tokenizer and feature extractor of the primary checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    # Model and its pre/post-processing components.
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Generation settings: the assistant model enables speculative decoding.
    max_new_tokens=128,
    generate_kwargs={"assistant_model": assistant_model},
    # Placement and precision, matching the values chosen earlier.
    torch_dtype=torch_dtype,
    device=device,
)

### Define an audio sample to test on 👇

In [None]:
# Path of the audio file to transcribe (relative path, resolved against the
# current working directory).
sample = "test 2.mp3"

### Transcribe away! 💪

In [None]:
# Run the speech recognition pipeline on the sample file; the result is read
# through its "text" key in the next cell.
result = pipe(sample)

In [None]:
# Show only the transcribed text from the pipeline result.
print(result["text"])

In [None]:
import whisper

# NOTE(review): this rebinds `model`, which earlier in the notebook referred to
# the Hugging Face primary model. Cells that use `model` now depend on the
# order in which they are executed.
model = whisper.load_model("large")

In [None]:
# Transcribe the same audio file as the pipeline run by reusing `sample`
# instead of repeating the hard-coded path.
transcription = model.transcribe(sample)

print(transcription["text"])

In [None]:
import wave
import pyaudio
import os
from colorama import Fore, Style
import whisper
import torch

print(f"CUDA Available: {torch.cuda.is_available()}")

# Console colour codes (from colorama) used when echoing transcriptions.
RESET_COLOR = Style.RESET_ALL
NEON_GREEN = Fore.GREEN + Style.BRIGHT

# Recording settings shared by the functions below.
sample_rate = 48000  # sampling rate used for the input stream and the WAV header
chunk_length = 2  # seconds of audio captured per chunk
def transcribe_chunk(model, chunk_file):
    """Return the text that ``model`` transcribes from the audio file ``chunk_file``.

    ``model`` must provide ``transcribe(path)`` returning a mapping that has a
    ``"text"`` entry; that entry is returned unchanged.
    """
    return model.transcribe(chunk_file)["text"]

def record_chunk(p, stream, file_path, chunk_length=1):
    """Record ``chunk_length`` seconds from ``stream`` into a WAV file.

    Args:
        p: PyAudio instance, used to look up the byte width of 16-bit samples.
        stream: Open input stream; read in blocks of 1024 frames.
        file_path: Destination path of the WAV file (overwritten if present).
        chunk_length: Length of the recording in seconds.

    Returns:
        None. If reading from the stream fails, the error is printed and the
        function returns early without writing any file, so callers should
        check that ``file_path`` exists.
    """
    frames_per_read = 1024
    # Number of fixed-size reads needed to cover `chunk_length` seconds at the
    # module-level `sample_rate`.
    n_reads = int(sample_rate / frames_per_read * chunk_length)
    frames = []
    for _ in range(n_reads):
        try:
            frames.append(stream.read(frames_per_read))
        except Exception as e:
            print(f"Error reading audio stream: {e}")
            return
    # The context manager closes the file even if writing the header or the
    # frames raises.
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

def main():
    """Record microphone audio in chunks, transcribe each chunk and print it.

    Loops until interrupted (Ctrl+C / kernel interrupt). However the loop ends,
    the accumulated transcription is printed and appended to ``log.txt``, and
    the audio stream and the PyAudio instance are released.
    """
    # Choose your model settings
    model_size = "medium.en"
    model = whisper.load_model(model_size, device="cuda" if torch.cuda.is_available() else "cpu")
    p = pyaudio.PyAudio()
    stream = None  # stays None if opening the stream fails, so cleanup can skip it
    accumulated_transcription = ""  # Initialize an empty string to accumulate transcriptions
    chunk_file = "temp_chunk.wav"
    try:
        # Opened inside the try block so that `p` is terminated even if this fails.
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate, input=True, frames_per_buffer=1024)
        while True:
            record_chunk(p, stream, chunk_file, chunk_length)
            # record_chunk writes no file when reading from the stream failed.
            if os.path.exists(chunk_file):
                try:
                    transcription = transcribe_chunk(model, chunk_file)
                finally:
                    # Remove the temporary chunk even if transcription raised.
                    os.remove(chunk_file)
                print(NEON_GREEN + transcription + RESET_COLOR)
                # Append the new transcription to the accumulated transcription
                accumulated_transcription += transcription + " "
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        try:
            print("LOG: " + accumulated_transcription)
            # Persist the transcript on every exit path, not only on Ctrl+C.
            # Explicit UTF-8 avoids encoding errors for non-ASCII text when the
            # platform default encoding is narrower.
            with open("log.txt", "a", encoding="utf-8") as log_file:
                log_file.write(accumulated_transcription)
        finally:
            # Release audio resources even if printing or logging failed.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

In [None]:
!pip3 install faster-whisper