In [3]:
import numpy as np
import sounddevice as sd
import torch
import transformers
from scipy.signal import resample

In [9]:
import queue

import numpy as np
import sounddevice as sd
import torch
import transformers
from scipy.signal import resample

class LiveTranscriber:
    """
    Record microphone audio in fixed-length chunks and transcribe each chunk
    with a Whisper model.

    The sounddevice callback only copies each incoming block onto an internal
    queue. Resampling, feature extraction and generation run in the loop
    inside ``start_transcription``, so the audio callback returns quickly and
    recording is not stalled while the model is busy.
    """

    def __init__(self, model_name="openai/whisper-small", device=None):
        """
        Initialize the live transcription system.

        :param model_name: Hugging Face model for transcription
        :param device: Compute device (cuda/cpu); auto-detected when None
        """
        # Determine optimal device and dtype
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        # Explicitly use float32 to avoid type conversion issues
        self.torch_dtype = torch.float32

        print(f"Using device: {self.device}")
        print(f"Using dtype: {self.torch_dtype}")

        # Load Whisper model and processor
        self.processor = transformers.WhisperProcessor.from_pretrained(model_name)
        self.model = transformers.WhisperForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=self.torch_dtype
        ).to(self.device)

        # Ensure model is in evaluation mode
        self.model.eval()

        # Audio stream configurations
        self.target_sample_rate = 16000  # Most ML models prefer 16kHz
        self.chunk_duration = 5  # Seconds of audio to transcribe at once
        self.chunk_samples = self.target_sample_rate * self.chunk_duration

        # Get system default sample rate
        self.system_sample_rate = sd.query_devices(kind='input')['default_samplerate']
        print(f"System Sample Rate: {self.system_sample_rate}")

        # Hand-off buffer between the audio callback (producer) and the
        # transcription loop (consumer). Unbounded: if inference is slower
        # than real time the backlog grows instead of audio being dropped.
        self._audio_queue = queue.Queue()

    def audio_callback(self, indata, frames, time, status):
        """
        Stream callback: queue one block of captured audio for transcription.

        No model work happens here; the block is only down-mixed, copied and
        enqueued so that this function returns quickly.

        :param indata: Input audio data, shaped (frames, channels)
        :param frames: Number of frames
        :param time: Timestamp
        :param status: Stream status flags (e.g. input overflow)
        """
        if status:
            # Status flags are reported but the block is still kept;
            # returning here would discard a whole chunk of speech.
            print(f"Stream error: {status}")

        samples = np.asarray(indata)
        if samples.ndim > 1:
            # Average the channels to get mono. The result is a new array,
            # so it stays valid after the caller reuses the input buffer.
            mono = samples.mean(axis=1)
        else:
            mono = samples.copy()

        self._audio_queue.put_nowait(mono)

    def _transcribe_chunk(self, audio):
        """
        Run one mono audio chunk through the model.

        :param audio: 1-D audio array sampled at ``self.system_sample_rate``
        :return: Decoded transcription text
        """
        # Resample to 16kHz if needed
        if self.system_sample_rate != self.target_sample_rate:
            audio = self._resample(audio, self.system_sample_rate, self.target_sample_rate)

        # Prepare audio for model
        input_features = self.processor(
            audio,
            sampling_rate=self.target_sample_rate,
            return_tensors="pt"
        ).input_features.to(device=self.device, dtype=self.torch_dtype)

        # Transcribe
        with torch.no_grad():
            predicted_ids = self.model.generate(input_features)

        return self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    def _resample(self, audio, orig_sr, target_sr):
        """
        Resample audio to target sample rate

        :param audio: Input audio array
        :param orig_sr: Original sample rate
        :param target_sr: Target sample rate
        :return: Resampled audio (empty array when the input is empty)
        """
        # Round instead of truncating so float error in the rate ratio
        # cannot cost a sample.
        num_samples = int(round(len(audio) * target_sr / orig_sr))
        if num_samples <= 0:
            return np.zeros(0, dtype=np.asarray(audio).dtype)

        # Use scipy's resample function
        return resample(audio, num_samples)

    def start_transcription(self):
        """
        Start live audio transcription with keyboard interrupt support

        Opens the input stream, then repeatedly takes queued chunks and
        prints their transcription until interrupted with Ctrl+C.
        """
        print(f"Starting transcription on {self.device}. Speak now...")
        print("Press Ctrl+C to stop transcription.")

        # Start from an empty queue so audio left over from a previous run
        # is not transcribed.
        self._audio_queue = queue.Queue()

        try:
            # Open audio stream
            with sd.InputStream(
                samplerate=self.system_sample_rate,
                channels=1,  # Mono
                dtype='float32',
                callback=self.audio_callback,
                blocksize=int(self.system_sample_rate * self.chunk_duration)
            ):
                while True:
                    try:
                        # Timeout keeps the loop responsive to Ctrl+C while
                        # waiting for the next chunk.
                        chunk = self._audio_queue.get(timeout=1.0)
                    except queue.Empty:
                        continue

                    transcription = self._transcribe_chunk(chunk)
                    print(f"Transcription: {transcription}")

        except KeyboardInterrupt:
            print("\nTranscription stopped by user.")

In [10]:
def main():
    """Create a LiveTranscriber with default settings and start transcribing."""
    LiveTranscriber().start_transcription()


# Entry point when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Using device: cuda
Using dtype: torch.float32
System Sample Rate: 44100.0
Starting transcription on cuda. Speak now...
Press Ctrl+C to stop transcription.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  Hello, how was this going on?
Transcription:  Hello, is the audio available?
Transcription:  Is it live or is it good? Is it bad? How fast is it?
Transcription:  it? Oh, until I speak it would not be possible to
Transcription:  Don't give the transcription until I end it.
Transcription:  Or does it have a length of time limit till which if I don't speak it won't give any line?
Transcription:  Okay, okay, then what to do?
Transcription:  I
Transcription:  Hmm.
Transcription:  चेरी अगे

Transcription stopped by user.
