In [None]:
#| default_exp live

# 🎤 LiveTranscriber — Real-time Speech Transcription with VAD

This notebook implements a **live voice-to-text system** that:
- Listens to your microphone input in real-time  
- Uses **Silero VAD** (Voice Activity Detection) to detect when you're speaking  
- Automatically splits audio into *utterances* based on pauses  
- Transcribes each utterance using **Faster-Whisper** (OpenAI Whisper variant)  
- Streams the transcribed text chunks as they become available

It’s optimized for local apps (like TUIs or assistants) that need responsive, chunked speech transcription.


## 📦 Imports and Device Selection

We import only the necessary libraries:
- `numpy`, `torch`, `pyaudio` for audio and model operations  
- `faster_whisper` for the actual speech-to-text model  
- `asyncio` for non-blocking background processing  
- `logging` for debug output  

We also define a small helper to pick the **best available compute device**:
- CUDA (GPU)
- MPS (Apple Silicon)
- or CPU (fallback)


In [None]:
#| export
import logging
import asyncio
from typing import Optional, Callable
from queue import Queue

import numpy as np
import pyaudio
import torch
from faster_whisper import WhisperModel

def get_device(force_cpu: bool = False) -> str:
    """Pick the best available torch device.

    Preference order is CUDA, then Apple-Silicon MPS, then CPU.

    Args:
        force_cpu: When True, skip detection entirely and return ``"cpu"``.

    Returns:
        One of ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if force_cpu:
        return "cpu"
    if torch.cuda.is_available():
        return "cuda"

    # `torch.backends.mps` is absent from older torch builds; look it up
    # defensively so those builds fall through to CPU instead of raising
    # AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        try:
            torch.mps.empty_cache()
        except Exception as exc:
            # Best effort only: a failed cache flush must not block device selection.
            logging.debug(f"torch.mps.empty_cache() failed: {exc}")
        return "mps"
    return "cpu"

def load_silero_vad():
    """Fetch the Silero VAD model through ``torch.hub``.

    Returns:
        The loaded model, or ``None`` (after logging a warning) if loading
        fails for any reason.
    """
    hub_kwargs = dict(
        repo_or_dir='snakers4/silero-vad',
        model='silero_vad',
        force_reload=False,
        onnx=False,
    )
    try:
        # torch.hub returns a (model, utils) pair; only the model is needed here.
        vad_model, _unused_utils = torch.hub.load(**hub_kwargs)
    except Exception as err:
        logging.warning(f"Failed to load Silero VAD: {err}")
        return None
    return vad_model

  from .autonotebook import tqdm as notebook_tqdm


## 🗣️ Silero VAD (Voice Activity Detection)

This small helper loads the **Silero VAD** model from `torch.hub`.

Silero VAD is a lightweight neural model that outputs the **probability of speech** (0–1).  
We’ll use it to decide when the user is talking or has paused — so we can send only meaningful speech chunks to Whisper.

If Silero fails to load (e.g., offline), we just return `None` and handle it later.


# 🎙️ LiveTranscriber — Real-time Speech-to-Text

The `LiveTranscriber` class captures live audio from your microphone, detects when you’re speaking using **Silero VAD**, and transcribes each spoken sentence using **Faster-Whisper** once you pause.

It runs asynchronously, making it ideal for real-time interfaces like TUIs or assistants.

---

### ⚙️ Parameters
- **`model_id`** – Whisper model to use (e.g. `"tiny"`, `"base"`, `"small"`).  
- **`language`** – Language code for transcription (default `"en"`).  
- **`force_cpu`** – Force CPU usage even if GPU is available.  
- **`on_transcript`** – Callback called with each transcribed text chunk.  
- **`vad_threshold`** – Silero confidence threshold (0.0–1.0, higher = stricter).  
- **`min_speech_duration_ms`** – Minimum length of speech to count as valid.  
- **`min_silence_duration_ms`** – How long silence must last before starting transcription.

---

### 🧠 Main Methods
- **`start()`** — Begins microphone capture and transcription loop (async).  
- **`stop()`** — Gracefully stops audio processing.  
- **`process_audio()`** — Runs continuously, detecting speech/silence and triggering transcription.  
- **`_transcribe_chunk()`** — Uses Whisper to transcribe one full utterance.  
- **`_detect_speech_silero()`** — Returns `True` if Silero VAD detects speech in the current chunk.  

---

### 🔄 Summary
- Audio is streamed in 32 ms chunks (512 samples at 16 kHz).  
- Each chunk is passed to Silero VAD → speech or silence.  
- When silence lasts long enough, the buffered audio is sent to Whisper.  
- The result is sent to your `on_transcript` callback.


In [None]:
#| export

class LiveTranscriber:
    """Live microphone transcription for TUI apps (PyAudio + Silero VAD + Faster-Whisper).

    Audio is captured in 512-sample chunks at 16 kHz. Every chunk is classified
    by Silero VAD as speech or silence. An utterance starts with the first
    speech chunk and ends once ``min_silence_duration_ms`` of consecutive
    silence has accumulated; the buffered utterance is then transcribed and the
    text is handed to ``on_transcript``.

    Args:
        model_id: Faster-Whisper model size (``"tiny"``, ``"base"``, ...), a
            CTranslate2 model repo/path, or an ``"openai/whisper-<size>"`` id
            (mapped to ``<size>``).
        language: Language code passed to Whisper.
        force_cpu: Skip GPU detection and run on CPU.
        on_transcript: Sync or async callable invoked with each non-empty
            transcribed utterance.
        vad_threshold: Silero speech probability above which a chunk counts
            as speech.
        min_speech_duration_ms: Utterances shorter than this are discarded.
        min_silence_duration_ms: Trailing silence that ends an utterance.

    Raises:
        RuntimeError: If the Silero VAD model cannot be loaded.
    """

    def __init__(
        self,
        model_id: str = "openai/whisper-base",
        language: str = "en",
        force_cpu: bool = False,
        on_transcript: Optional[Callable[[str], None]] = None,
        vad_threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 500,
    ):
        self.logger = logging.getLogger(__name__)
        self.on_transcript = on_transcript

        self.model_id = model_id
        self.language = language

        # Fixed 16 kHz sample rate (required by Silero + Whisper)
        self.sample_rate = 16000

        # Device + ASR model. Faster-Whisper runs on CTranslate2, which has no
        # MPS backend, so an "mps" pick is downgraded to CPU for the ASR model.
        detected_device = get_device(force_cpu=force_cpu)
        self.device = self._whisper_device(detected_device)
        if self.device != detected_device:
            self.logger.info(
                f"Device '{detected_device}' is not supported by Faster-Whisper; using '{self.device}'"
            )
        self.transcribe_model = WhisperModel(
            self._normalize_model_id(self.model_id),
            device=self.device,
            # int8 keeps CPU inference fast; float16 is used on CUDA.
            compute_type="int8" if self.device == "cpu" else "float16",
        )

        # Load Silero VAD
        self.vad_threshold = vad_threshold
        self.silero_model = load_silero_vad()
        if self.silero_model is None:
            raise RuntimeError("Silero VAD failed to load. Cannot continue.")

        # Thresholds (in samples)
        self.min_speech_samples = int(self.sample_rate * min_speech_duration_ms / 1000)
        self.min_silence_samples = int(self.sample_rate * min_silence_duration_ms / 1000)

        # Buffers and state
        self.audio_queue: "Queue[np.ndarray]" = Queue()
        self.is_running = False

        self.is_speech_active = False
        # Chunks of the utterance in progress; joined once when the utterance
        # ends, so buffering cost stays linear in the utterance length.
        self._speech_chunks = []
        self.silence_counter = 0

        self.logger.info(f"Initialized LiveTranscriber (model={model_id}, device={self.device}, sample_rate=16kHz, VAD=Silero)")

    @staticmethod
    def _normalize_model_id(model_id: str) -> str:
        """Map ``"openai/whisper-<size>"`` to the ``<size>`` name Faster-Whisper expects.

        The ``openai/whisper-*`` Hub repos hold Transformers checkpoints, not
        CTranslate2 conversions, so they cannot be loaded directly. Any other
        id is returned unchanged.
        """
        prefix = "openai/whisper-"
        if model_id.startswith(prefix):
            return model_id[len(prefix):]
        return model_id

    @staticmethod
    def _whisper_device(device: str) -> str:
        """Return a device string CTranslate2 accepts (``"mps"`` becomes ``"cpu"``)."""
        return "cpu" if device == "mps" else device

    @property
    def speech_buffer(self) -> np.ndarray:
        """Samples buffered for the utterance in progress (empty when idle)."""
        if not self._speech_chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(self._speech_chunks)

    @speech_buffer.setter
    def speech_buffer(self, samples: np.ndarray) -> None:
        self._speech_chunks = [samples] if len(samples) else []

    def _reset_utterance(self) -> None:
        """Clear the per-utterance state so the next speech chunk starts fresh."""
        self.is_speech_active = False
        self._speech_chunks = []
        self.silence_counter = 0

    def _detect_speech_silero(self, audio_chunk: np.ndarray) -> bool:
        """Return True if speech detected; False on low prob or on error."""
        try:
            audio_tensor = torch.from_numpy(audio_chunk).float()
            prob = self.silero_model(audio_tensor, self.sample_rate).item()
            return prob > self.vad_threshold
        except Exception as e:
            self.logger.warning(f"Silero VAD error: {e}")
            return False

    def _transcribe_chunk(self, audio_data: np.ndarray) -> str:
        """Transcribe one buffered utterance and return its text (may be empty)."""
        segments, _info = self.transcribe_model.transcribe(
            audio_data,
            language=self.language,
            beam_size=1,
            condition_on_previous_text=False,
            # Second VAD pass inside Faster-Whisper, on top of the Silero
            # gating already done in process_audio().
            vad_filter=True,
            vad_parameters=dict(
                threshold=0.4,
                # Convert the sample-based threshold back to milliseconds.
                min_speech_duration_ms=self.min_speech_samples * 1000 // self.sample_rate,
                max_speech_duration_s=float("inf"),
                min_silence_duration_ms=200,
            ),
        )
        return " ".join(s.text.strip() for s in segments).strip()

    def audio_callback(self, in_data, frame_count, time_info, status):
        """Called automatically by PyAudio for each audio frame."""
        if status:
            self.logger.debug(f"Audio callback status: {status}")
        # int16 PCM -> float32 scaled by 1/32768
        audio = np.frombuffer(in_data, dtype=np.int16).astype(np.float32) / 32768.0
        self.audio_queue.put(audio)
        return in_data, pyaudio.paContinue

    async def _emit_transcript(self, utterance: np.ndarray) -> None:
        """Transcribe ``utterance`` off the event loop and forward non-empty text."""
        text = await asyncio.to_thread(self._transcribe_chunk, utterance)
        if not text or not self.on_transcript:
            return
        if asyncio.iscoroutinefunction(self.on_transcript):
            await self.on_transcript(text)
        else:
            self.on_transcript(text)

    async def process_audio(self):
        """Process queued audio in real-time with VAD chunking.

        Runs until ``is_running`` becomes False. Silence is ignored while no
        utterance is active; once one is active, silence is buffered too and
        counted until it reaches ``min_silence_samples``.
        """
        while self.is_running:
            if self.audio_queue.empty():
                await asyncio.sleep(0.01)
                continue

            chunk = self.audio_queue.get()
            if self._detect_speech_silero(chunk):
                if not self.is_speech_active:
                    self.is_speech_active = True
                    self._speech_chunks = [chunk]
                else:
                    self._speech_chunks.append(chunk)
                # Any speech chunk restarts the end-of-utterance silence count.
                self.silence_counter = 0
                continue

            if not self.is_speech_active:
                continue

            self.silence_counter += len(chunk)
            self._speech_chunks.append(chunk)
            if self.silence_counter < self.min_silence_samples:
                continue

            # Utterance finished: join the chunks once and reset state before
            # the (slow) transcription runs.
            utterance = np.concatenate(self._speech_chunks)
            self._reset_utterance()
            if len(utterance) >= self.min_speech_samples:
                await self._emit_transcript(utterance)

    async def start(self):
        """Start recording and run the transcription loop until ``stop()`` is called."""
        self.is_running = True
        audio = pyaudio.PyAudio()
        try:
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=512,  # 32 ms at 16 kHz
                stream_callback=self.audio_callback,
            )
            stream.start_stream()
            try:
                await self.process_audio()
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            # Also covers failures while opening the stream, so the flag never
            # claims a capture is running when it is not.
            self.is_running = False
            audio.terminate()

    def stop(self):
        """Ask the processing loop to exit; ``start()`` then closes the stream."""
        self.is_running = False

## 🧪 Live Transcription Test

This test starts a 10-second live recording session using the `LiveTranscriber`.  
Speak naturally in short sentences — each pause will automatically trigger a transcription.  
Each transcribed chunk is printed as soon as it’s ready, and all results are shown at the end.


In [None]:
#| eval: false
import asyncio

all_chunks = []

def handle_transcript_chunk(text: str):
    """Print a finished transcription chunk and record it in ``all_chunks``.

    Whitespace-only chunks are ignored; the text is stored unmodified.
    """
    if not text.strip():
        return
    print(f"\n[TRANSCRIBED] {text}")
    all_chunks.append(text)

async def test_live_transcription(duration_seconds: int = 10):
    """Run a live transcription session for ``duration_seconds`` and print the results.

    A ``LiveTranscriber`` is started as a background task, left running for
    the requested time, then stopped and cancelled. Every chunk collected in
    ``all_chunks`` is printed as a numbered list afterwards.
    """
    print("üé§ Speak in short sentences; pauses will trigger transcription.")
    settings = dict(
        model_id="tiny",
        language="en",
        on_transcript=handle_transcript_chunk,
        vad_threshold=0.5,
        min_speech_duration_ms=250,
        min_silence_duration_ms=500,
    )
    transcriber = LiveTranscriber(**settings)
    capture_task = asyncio.create_task(transcriber.start())
    try:
        await asyncio.sleep(duration_seconds)
    finally:
        # Ask the loop to exit, give it a moment to wind down, then cancel
        # whatever is still pending.
        transcriber.stop()
        await asyncio.sleep(0.3)
        capture_task.cancel()
        try:
            await capture_task
        except asyncio.CancelledError:
            pass

    print("\nüìù Full transcript:")
    for index, chunk_text in enumerate(all_chunks, start=1):
        print(f"{index}. {chunk_text}")

# Top-level `await` relies on the notebook's running event loop (IPython autoawait).
await test_live_transcription(10)


üé§ Speak in short sentences; pauses will trigger transcription.


Using cache found in /home/jens/.cache/torch/hub/snakers4_silero-vad_master
ALSA lib pcm_dsnoop.c:567:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave



[TRANSCRIBED] Hello everyone.

[TRANSCRIBED] Today I will be teaching you about something.

[TRANSCRIBED] I don't know what that is.

üìù Full transcript:
1. Hello everyone.
2. Today I will be teaching you about something.
3. I don't know what that is.
