In [None]:
#| default_exp transcription

In [None]:
# | export

import os
import time
import wave
from pathlib import Path
from typing import Optional, Union

from faster_whisper import WhisperModel
from rich.console import Console

# Module-level rich Console shared by this module for formatted terminal
# output (e.g. the invalid-model warning printed during model selection).
console = Console()


def _is_cuda_available() -> bool:
    """Report whether PyTorch can see a CUDA device.

    Returns:
        The result of ``torch.cuda.is_available()``, or ``False`` when an
        ``ImportError`` is raised (for example because ``torch`` is not
        installed).
    """
    try:
        # Imported lazily so the module still works without PyTorch installed.
        import torch as _torch

        cuda_ready = _torch.cuda.is_available()
    except ImportError:
        cuda_ready = False
    return cuda_ready

## AI Transcription Subsystem

The `WhisperTranscriber` class is the core of our application. It uses OpenAI's Whisper model (loaded through the `faster_whisper` library), a state-of-the-art speech recognition system trained on massive multilingual datasets, to convert audio into accurate text transcriptions.

### Model Architecture

Whisper employs a transformer-based architecture that has been trained on roughly 680,000 hours of multilingual audio data. The model supports multiple languages and can handle a range of audio conditions, including background noise, different speakers, and varying recording quality.

### Performance Characteristics

- **Accuracy**: 97%+ word accuracy (i.e. a word error rate below 3%) on clean English speech
- **Speed**: Real-time or faster processing with GPU acceleration
- **Languages**: 99 languages supported with automatic language detection
- **Robustness**: Handles diverse audio conditions and speaker variations

### Integration Strategy

The transcriber integrates seamlessly with our recording system through a standardized interface that abstracts the complexity of the underlying AI model while providing comprehensive error handling and user feedback.

In [None]:
# | export

class WhisperTranscriber:
    """Speech-to-text transcriber backed by a ``faster_whisper`` ``WhisperModel``.

    The model name and language can be passed explicitly or supplied through
    the ``HNS_WHISPER_MODEL`` and ``HNS_LANG`` environment variables.

    Attributes:
        model_name: Validated model identifier (always one of ``VALID_MODELS``).
        language: Language code forwarded to the model, or ``None``, in which
            case no ``language`` argument is passed to the model.
        model: The loaded ``WhisperModel`` instance.
    """

    VALID_MODELS = [
        # English-only models (optimized for English speech)
        "tiny.en",      # ~39 MB, fastest English-only model
        "base.en",      # ~74 MB, good balance for English
        "small.en",     # ~244 MB, higher accuracy for English
        "medium.en",    # ~769 MB, best accuracy for English

        # Multilingual models (support 99+ languages)
        "tiny",         # ~39 MB, fastest multilingual model
        "base",         # ~74 MB, good balance for all languages
        "small",        # ~244 MB, higher accuracy multilingual
        "medium",       # ~769 MB, best accuracy multilingual
        "large-v1",     # ~1550 MB, previous large model
        "large-v2",     # ~1550 MB, improved large model
        "large-v3",     # ~1550 MB, latest large model
        "large",        # ~1550 MB, alias for large-v3

        # Distilled models (faster, smaller versions)
        "distil-large-v2",      # ~760 MB, distilled large model
        "distil-medium.en",     # ~590 MB, distilled English medium
        "distil-small.en",      # ~660 MB, distilled English small
        "distil-large-v3",      # ~760 MB, distilled large v3
        "distil-large-v3.5",    # ~760 MB, latest distilled large

        # Turbo models (fastest performance)
        "large-v3-turbo",       # ~1550 MB, turbo-optimized large
        "turbo",                # ~1550 MB, alias for large-v3-turbo
    ]

    def __init__(self, model_name: Optional[str] = None, language: Optional[str] = None):
        """Select a model, resolve the language and load the model.

        Args:
            model_name: Name of the Whisper model to use (e.g. "base", "small").
                If falsy, the ``HNS_WHISPER_MODEL`` environment variable is
                used, falling back to "base".
            language: Target language code (e.g. "en", "es", "fr"). If falsy,
                the ``HNS_LANG`` environment variable is used; if that is also
                unset, no language is passed to the model.

        Raises:
            RuntimeError: If the model cannot be loaded.
        """
        self.model_name = self._get_model_name(model_name)
        self.language = language or os.environ.get("HNS_LANG")
        self.model = self._load_model()

    def _get_audio_duration(self, audio_file_path: Union[Path, str]) -> Optional[float]:
        """Return the duration of a WAV file in seconds.

        This is a best-effort helper: it never raises for unreadable input.

        Args:
            audio_file_path: Path to a file readable by the ``wave`` module.

        Returns:
            ``frames / sample_rate`` as a float, or ``None`` if the file is
            missing, is not a valid WAV file, or reports a zero sample rate.
        """
        try:
            with wave.open(str(audio_file_path), "rb") as audio_file:
                return audio_file.getnframes() / float(audio_file.getframerate())
        except (OSError, EOFError, ValueError, ZeroDivisionError, wave.Error):
            # Missing/unreadable file, truncated or non-WAV data, or a header
            # with a zero frame rate: the duration is simply unknown.
            return None

    def _get_model_name(self, model_name: Optional[str]) -> str:
        """Resolve and validate the model name.

        Precedence: explicit ``model_name``, then the ``HNS_WHISPER_MODEL``
        environment variable, then "base". An empty environment variable is
        treated the same as an unset one.

        Args:
            model_name: Requested model name, or ``None``/empty to use the
                environment variable or the default.

        Returns:
            The requested name if it is in ``VALID_MODELS``; otherwise "base"
            (after printing a warning listing the valid names).
        """
        model = model_name or os.environ.get("HNS_WHISPER_MODEL") or "base"

        if model in self.VALID_MODELS:
            return model

        console.print(f"⚠️ [bold yellow]Invalid model '{model}', using 'base' instead[/bold yellow]")
        console.print(f"    [dim]Available models: {', '.join(self.VALID_MODELS)}[/dim]")
        return "base"

    def _load_model(self) -> "WhisperModel":
        """Instantiate the ``WhisperModel`` for ``self.model_name``.

        Uses ``device="cuda"`` with ``compute_type="float16"`` when
        ``_is_cuda_available()`` reports CUDA, otherwise ``device="cpu"`` with
        ``compute_type="int8"``.

        Returns:
            The loaded model.

        Raises:
            RuntimeError: If device detection or model construction fails; the
                original exception is attached as ``__cause__``.
        """
        try:
            # Auto-detect available device
            device = "cuda" if _is_cuda_available() else "cpu"
            compute_type = "float16" if device == "cuda" else "int8"

            return WhisperModel(self.model_name, device=device, compute_type=compute_type)
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {e}") from e

    def transcribe(self, audio_source: Union[Path, str]) -> tuple[str, float]:
        """Transcribe an audio file to text using the loaded model.

        Args:
            audio_source: Path to the audio file; it is passed to the model as
                a string.

        Returns:
            A ``(text, elapsed_seconds)`` tuple: the non-empty segment texts
            joined with single spaces, and the time spent transcribing.

        Raises:
            RuntimeError: If the model fails or no speech is detected; the
                underlying exception is attached as ``__cause__``.
        """
        transcribe_kwargs = {
            "beam_size": 5,
            "vad_filter": True,
            "vad_parameters": {"min_silence_duration_ms": 500, "speech_pad_ms": 400, "threshold": 0.5},
        }

        # Only pass a language when one was configured.
        if self.language:
            transcribe_kwargs["language"] = self.language

        try:
            # perf_counter is monotonic, so the measurement is not affected by
            # system clock adjustments.
            start_time = time.perf_counter()

            segments, _ = self.model.transcribe(str(audio_source), **transcribe_kwargs)
            # Iterate inside the try: faster-whisper documents ``segments`` as
            # a lazy generator, so decoding errors can surface here.
            stripped = (segment.text.strip() for segment in segments)
            full_transcription = " ".join(text for text in stripped if text)
            if not full_transcription:
                raise ValueError("No speech detected in audio")

            elapsed_total = time.perf_counter() - start_time
            return full_transcription, elapsed_total
        except Exception as e:
            raise RuntimeError(f"Transcription failed: {e}") from e

