# Task 14.5 Solutions: Audio Transcription

This notebook contains solutions to the exercises in the Audio Transcription notebook.

---

## Challenge Solution: Streaming Transcription System

The challenge was to create a real-time streaming transcription system that processes audio in chunks.

### Approach
We implement a `StreamingTranscriber` class that:
1. **Chunks Audio**: Splits audio into fixed-size chunks (e.g., 5 seconds)
2. **Maintains Context**: Keeps overlap between chunks to avoid cutting words
3. **Processes Incrementally**: Transcribes each chunk and yields results immediately
4. **Tracks Statistics**: Monitors real-time factor to ensure we can keep up

### Key Design Decisions
- **Chunk Size**: 5 seconds balances latency vs context (longer = more accurate but higher latency)
- **Overlap**: 0.5 second overlap prevents word truncation at chunk boundaries
- **Buffering**: Carries the last 0.5 seconds of each chunk forward and prepends it to the next chunk; a trailing fragment shorter than 0.5 seconds is skipped

### Real-Time Factor (RTF)
RTF = processing_time / audio_duration
- RTF < 1.0: Faster than real-time (can handle live streaming)
- RTF > 1.0: Slower than real-time (will fall behind)

On DGX Spark with Whisper large-v3, we typically achieve an RTF of 0.3-0.5 (roughly 2-3x faster than real-time).

In [None]:
import torch
import numpy as np
import gc
import time
from typing import List, Generator

def clear_gpu_memory():
    """Run Python garbage collection, then empty the CUDA cache when CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        # No CUDA device: there is no cache to release.
        return
    torch.cuda.empty_cache()

In [None]:
# Load the Whisper large-v3 processor and model.
# Both are bound as module-level names (`processor`, `model`) that later cells use.
from transformers import WhisperProcessor, WhisperForConditionalGeneration

print("Loading Whisper...")
# The processor is used later to build input features and to decode generated token ids.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
# NOTE(review): the weights are loaded as bfloat16 and moved to "cuda"; tensors passed
# to the model need a matching dtype and device.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3",
    torch_dtype=torch.bfloat16  # Optimized for Blackwell
).to("cuda")
print("Loaded!")

In [None]:
class StreamingTranscriber:
    """
    Real-time streaming transcription system.

    Processes audio in chunks and provides incremental transcription,
    simulating a real-time streaming scenario. Transcription is delegated to
    the module-level Whisper ``processor`` and ``model`` objects, which must
    be loaded before the first non-empty chunk is processed.

    Example:
        >>> transcriber = StreamingTranscriber(chunk_size_seconds=5.0)
        >>> for chunk in audio_chunks:
        ...     text = transcriber.process_chunk(chunk)
        ...     print(text)
        >>> final = transcriber.get_full_transcript()
    """

    def __init__(
        self,
        chunk_size_seconds: float = 5.0,
        sample_rate: int = 16000,
        overlap_seconds: float = 0.5
    ):
        """
        Initialize the streaming transcriber.

        Args:
            chunk_size_seconds: Size of each audio chunk to process
            sample_rate: Audio sample rate (must be 16000 for Whisper)
            overlap_seconds: Overlap between chunks to maintain context
                (0 disables the overlap)

        Raises:
            ValueError: If the sample rate is not positive, the overlap is
                negative, or the chunk size is shorter than one sample.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")
        if overlap_seconds < 0:
            raise ValueError(f"overlap_seconds must be >= 0, got {overlap_seconds}")

        self.chunk_size = int(chunk_size_seconds * sample_rate)
        if self.chunk_size < 1:
            # A zero-sample chunk would make stream_audio() iterate with step 0.
            raise ValueError(
                f"chunk_size_seconds must cover at least one sample, got {chunk_size_seconds}"
            )
        self.overlap_size = int(overlap_seconds * sample_rate)
        self.sample_rate = sample_rate

        # Not used by any method of this class; kept so existing code that
        # reads the attribute keeps working.
        self.audio_buffer = np.array([], dtype=np.float32)

        # Non-empty transcriptions, in the order they were produced
        self.transcriptions: List[str] = []

        # Tail of the previous chunk, prepended to the next one for context
        self.previous_overlap = np.array([], dtype=np.float32)

        # Counts every processed chunk, including those that produced no text
        self.chunks_processed = 0

        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    @staticmethod
    def _prepare_audio(audio_chunk: np.ndarray) -> np.ndarray:
        """Return the chunk as float32, peak-normalized if it exceeds [-1, 1]."""
        audio = np.asarray(audio_chunk, dtype=np.float32)
        if audio.size == 0:
            return audio
        # Use the absolute peak so a chunk whose loudest sample is negative
        # is normalized too.
        peak = float(np.abs(audio).max())
        if peak > 1.0:
            audio = audio / peak
        return audio

    def _tail_for_overlap(self, audio_chunk: np.ndarray) -> np.ndarray:
        """Return a copy of the samples to carry over into the next chunk."""
        if self.overlap_size <= 0:
            # audio_chunk[-0:] would be the whole chunk, not an empty tail.
            return np.array([], dtype=np.float32)
        # Slicing past the start yields the whole (short) chunk. Copy so later
        # in-place changes to the caller's buffer cannot alter the saved context.
        return audio_chunk[-self.overlap_size:].copy()

    def _transcribe_chunk(self, audio_chunk: np.ndarray) -> str:
        """
        Transcribe a single audio chunk.

        Args:
            audio_chunk: Audio data for this chunk

        Returns:
            Transcription text for this chunk ("" for an empty chunk)
        """
        audio = self._prepare_audio(audio_chunk)
        if audio.size == 0:
            return ""

        # Pass the configured sample rate so the feature extractor can check
        # it, and cast to the model's own dtype so the features match the
        # loaded weights.
        input_features = processor(
            audio,
            sampling_rate=self.sample_rate,
            return_tensors="pt"
        ).input_features.to(model.device, dtype=model.dtype)

        with torch.inference_mode():
            predicted_ids = model.generate(
                input_features,
                max_new_tokens=128  # Shorter for real-time
            )

        transcription = processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        return transcription.strip()

    def process_chunk(self, audio_chunk: np.ndarray) -> str:
        """
        Process a single audio chunk and return its transcription.

        The tail of the previous chunk is prepended for context. Statistics
        count only the new audio, not the re-used overlap. An empty chunk is
        ignored and leaves all state untouched.

        Args:
            audio_chunk: Audio data for this chunk

        Returns:
            Transcription for this chunk
        """
        if len(audio_chunk) == 0:
            return ""

        start_time = time.perf_counter()

        # Add overlap from previous chunk for context
        if len(self.previous_overlap) > 0:
            audio_with_context = np.concatenate([self.previous_overlap, audio_chunk])
        else:
            audio_with_context = audio_chunk

        transcription = self._transcribe_chunk(audio_with_context)

        if transcription:
            self.transcriptions.append(transcription)

        # Save overlap for next chunk
        self.previous_overlap = self._tail_for_overlap(audio_chunk)

        # Update stats
        self.chunks_processed += 1
        self.total_audio_duration += len(audio_chunk) / self.sample_rate
        self.total_processing_time += time.perf_counter() - start_time

        return transcription

    def stream_audio(self, audio: np.ndarray) -> Generator[str, None, None]:
        """
        Stream through audio and yield transcriptions.

        Args:
            audio: Full audio to process

        Yields:
            Transcription for each chunk
        """
        min_samples = self.sample_rate * 0.5  # Less than 0.5 seconds is skipped

        for start in range(0, len(audio), self.chunk_size):
            chunk = audio[start:start + self.chunk_size]

            # Only the final chunk can be shorter than chunk_size
            if len(chunk) < min_samples:
                continue

            yield self.process_chunk(chunk)

    def get_full_transcript(self) -> str:
        """
        Get the combined transcript from all chunks.

        Returns:
            Full transcript as a single string
        """
        # Simple joining - in production, you'd use more sophisticated merging
        return " ".join(self.transcriptions)

    def get_stats(self) -> dict:
        """
        Get processing statistics.

        Returns:
            Dictionary with total audio duration, total processing time,
            real-time factor, number of chunks processed, and whether
            processing was faster than the audio duration
        """
        return {
            'total_audio_duration': self.total_audio_duration,
            'total_processing_time': self.total_processing_time,
            # The floor on the denominator avoids dividing by zero before any audio
            'real_time_factor': self.total_processing_time / max(self.total_audio_duration, 0.001),
            'chunks_processed': self.chunks_processed,
            'is_real_time': self.total_processing_time < self.total_audio_duration
        }

    def reset(self):
        """Reset the transcriber state."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self.previous_overlap = np.array([], dtype=np.float32)
        self.transcriptions = []
        self.chunks_processed = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

# Status message printed once the class definition above has executed
print("StreamingTranscriber class ready!")

In [None]:
# Test with synthetic audio (tones - will produce minimal transcription)

def generate_test_audio(duration: float, sample_rate: int = 16000) -> np.ndarray:
    """Return `duration` seconds of a two-tone (440 Hz + 880 Hz) float32 test signal."""
    num_samples = int(sample_rate * duration)
    timeline = np.linspace(0, duration, num_samples, endpoint=False)

    # Two equal-amplitude sine tones, mixed by simple addition
    tone_a4 = 0.3 * np.sin(2 * np.pi * 440 * timeline)
    tone_a5 = 0.3 * np.sin(2 * np.pi * 880 * timeline)

    mixed = tone_a4 + tone_a5
    return mixed.astype(np.float32)

# Generate 30 seconds of test audio.
# `test_audio` is a module-level name consumed by the streaming test further down.
test_audio = generate_test_audio(30.0)
# The divisor 16000 matches generate_test_audio's default sample_rate.
print(f"Generated {len(test_audio) / 16000:.1f} seconds of test audio")

In [None]:
# Run the streaming simulation over the synthetic audio and report per-chunk stats
transcriber = StreamingTranscriber(chunk_size_seconds=5.0, overlap_seconds=0.5)

divider = "=" * 50

print("Streaming transcription simulation:")
print(divider)

# Initialised up front so the counter exists even if no chunk is yielded
chunk_num = 0
for chunk_num, transcription in enumerate(transcriber.stream_audio(test_audio), start=1):
    # Stats are cumulative, so each report covers all chunks seen so far
    stats = transcriber.get_stats()
    print(f"\nChunk {chunk_num}:")
    print(f"  Transcription: '{transcription}'")
    print(f"  Audio: {stats['total_audio_duration']:.1f}s | Processing: {stats['total_processing_time']:.1f}s")
    print(f"  Real-time factor: {stats['real_time_factor']:.2f}x")

# Summary over the whole run
final_stats = transcriber.get_stats()
print("\n" + divider)
print("FINAL STATISTICS")
print(divider)
print(f"Total audio duration: {final_stats['total_audio_duration']:.1f}s")
print(f"Total processing time: {final_stats['total_processing_time']:.1f}s")
print(f"Real-time factor: {final_stats['real_time_factor']:.2f}x")
print(f"Is real-time capable: {final_stats['is_real_time']}")
print(f"\nFull transcript: '{transcriber.get_full_transcript()}'")

In [None]:
# Meeting Minutes Generator (using pre-made transcript)
# A hand-written transcript stands in for real transcription output, so this part
# of the notebook does not depend on the audio pipeline above.

sample_meeting_transcript = """
Good morning everyone. Let's start the weekly status meeting. 

First, the development team update. We've completed the user authentication module 
and it's now in testing. John, can you share the timeline for the next sprint?

Sure. We're planning to start the dashboard redesign next Monday. The estimated 
completion is two weeks. We'll need design assets from the UX team by Friday.

Great. Sarah, what about the marketing side?

We're preparing for the product launch on January 15th. The press release is ready, 
and we've scheduled social media posts. Budget approval is pending from finance.

Okay, let's note that as an action item - follow up with finance about the budget.

Any blockers? No? Alright, meeting adjourned. Next meeting is Wednesday at 10 AM.
"""

class MeetingMinutesGenerator:
    """
    Generate meeting minutes from transcripts using an LLM.

    The LLM and its tokenizer are loaded lazily on the first call that needs
    them, so constructing the generator is cheap.
    """

    # Keys of the dict returned by generate_minutes(), in output order
    SECTION_KEYS = ('summary', 'action_items', 'key_decisions', 'next_steps')

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-7B-Instruct",
        max_new_tokens: int = 200
    ):
        """
        Initialize the generator without loading any model.

        Args:
            model_id: Hugging Face model identifier of the chat LLM to use
            max_new_tokens: Maximum number of tokens generated per section
        """
        self.model_id = model_id
        self.max_new_tokens = max_new_tokens
        self.llm = None
        self.tokenizer = None

    def load_llm(self):
        """Load the tokenizer and LLM if they are not loaded yet."""
        # getattr keeps this working even if the attribute was removed with
        # `del` (as a cleanup cell might do) rather than reset to None.
        if getattr(self, "llm", None) is not None:
            return

        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("Loading LLM...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.llm = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        print("LLM loaded!")

    @staticmethod
    def _build_prompts(transcript: str) -> dict:
        """Return one user prompt per minutes section, keyed by section name."""
        return {
            'summary': f"Summarize this meeting in 2-3 sentences:\n\n{transcript}",
            'action_items': f"Extract all action items from this meeting as a numbered list:\n\n{transcript}",
            'key_decisions': f"What key decisions were made in this meeting?\n\n{transcript}",
            'next_steps': f"What are the next steps mentioned in this meeting?\n\n{transcript}"
        }

    def _ask(self, prompt: str) -> str:
        """Send one prompt to the loaded LLM and return only the newly generated text."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant that analyzes meeting transcripts."},
            {"role": "user", "content": prompt}
        ]

        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(text, return_tensors="pt").to(self.llm.device)

        with torch.inference_mode():
            outputs = self.llm.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                # Request sampling explicitly so the temperature setting does
                # not depend on the model's default generation config.
                do_sample=True,
                temperature=0.7
            )

        # The output sequence starts with the prompt tokens; drop them
        prompt_length = inputs.input_ids.shape[1]
        return self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    def generate_minutes(self, transcript: str) -> dict:
        """
        Generate meeting minutes for a transcript.

        Args:
            transcript: Full meeting transcript text

        Returns:
            Dictionary with the keys 'summary', 'action_items',
            'key_decisions' and 'next_steps', each mapping to the LLM's
            response. For a blank transcript every value is "" and the LLM
            is not loaded.
        """
        if not transcript or not transcript.strip():
            # Nothing to analyze: skip loading the model and prompting it
            # with an empty meeting.
            return {key: "" for key in self.SECTION_KEYS}

        self.load_llm()

        prompts = self._build_prompts(transcript)
        return {key: self._ask(prompts[key]) for key in self.SECTION_KEYS}

# Build the minutes from the sample transcript, then print each section in a fixed order
generator = MeetingMinutesGenerator()
minutes = generator.generate_minutes(sample_meeting_transcript)

print("\n" + "=" * 60)
print("MEETING MINUTES")
print("=" * 60)

# (heading line, key in `minutes`) pairs, in display order
sections = (
    ("\n--- Summary ---", 'summary'),
    ("\n--- Action Items ---", 'action_items'),
    ("\n--- Key Decisions ---", 'key_decisions'),
    ("\n--- Next Steps ---", 'next_steps'),
)
for heading, key in sections:
    print(heading)
    print(minutes[key])

In [None]:
# Cleanup
# Drop the notebook's references to the Whisper objects so their memory can be reclaimed.
del model, processor
# Reset the LLM handles to None instead of deleting the attributes: load_llm()
# tests `self.llm is not None`, which raises AttributeError once the attribute
# itself is gone, leaving the generator unusable.
generator.llm = None
generator.tokenizer = None
clear_gpu_memory()
print("Solutions notebook complete!")