In [None]:
import pyaudio
import webrtcvad
import numpy as np
class AudioRecorder:
    """Record a single utterance from the microphone, gated by WebRTC VAD.

    Audio is captured as mono signed 16-bit PCM. Recording begins with the
    first frame the VAD classifies as speech and ends once enough consecutive
    non-speech frames have followed it.
    """

    def __init__(self):
        # Capture settings: mono, signed 16-bit samples at 16 kHz.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        # 320 samples at 16 kHz is a 20 ms frame; WebRTC VAD expects frames
        # of 10, 20, or 30 ms.
        self.CHUNK = 320
        # Not read by any method of this class; the stop condition in
        # record_audio() is driven by its silence_timeout argument instead.
        self.SILENCE_THRESHOLD = 2

        # PyAudio handle used to open the input stream.
        self.audio = pyaudio.PyAudio()

        # Voice-activity detector; mode ranges from 0 (least aggressive)
        # to 3 (most aggressive).
        detector = webrtcvad.Vad()
        detector.set_mode(1)
        self.vad = detector

    def is_speech(self, frame):
        """Return the VAD verdict for one frame of raw audio bytes.

        Any exception raised by the detector is printed and reported as
        "not speech" (False) instead of propagating.
        """
        try:
            verdict = self.vad.is_speech(frame, self.RATE)
        except Exception as err:
            print(f"Error processing frame: {err}")
            return False
        return verdict

    def record_audio(self, silence_timeout=2):
        """Capture frames from the first speech frame until sustained silence.

        Args:
            silence_timeout: seconds of consecutive non-speech audio that end
                the recording once it has started.

        Returns:
            List of raw frame byte strings. Trailing silent frames are
            included. The list is empty if the loop was interrupted with
            Ctrl-C before any speech was detected.
        """
        captured = []
        quiet_run = 0
        started = False

        mic = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )

        print("Listening for speech...")

        try:
            while True:
                chunk = mic.read(self.CHUNK, exception_on_overflow=False)
                voiced = self.is_speech(chunk)

                # Nothing is kept until the first speech frame arrives.
                if not (voiced or started):
                    continue

                if voiced:
                    if not started:
                        print("Speech detected - Recording started.")
                        started = True
                    captured.append(chunk)
                    quiet_run = 0
                    continue

                # Silent frame inside an active recording: keep it and count it.
                quiet_run += 1
                captured.append(chunk)

                # Number of CHUNK-sized frames that fit in silence_timeout seconds.
                if quiet_run > (silence_timeout * self.RATE) // self.CHUNK:
                    print("Silence detected - Recording stopped.")
                    break

        except KeyboardInterrupt:
            print("\nRecording interrupted by user")
        finally:
            # Always release the input stream, however the loop ended.
            mic.stop_stream()
            mic.close()

        return captured

    def get_audio_data(self, frames):
        """Join recorded frames into one int16 numpy array.

        Returns None (after printing a message) when ``frames`` is empty or
        the bytes cannot be interpreted as int16 samples.
        """
        if not frames:
            print("No audio frames to process")
            return None

        try:
            return np.frombuffer(b''.join(frames), dtype=np.int16)
        except Exception as err:
            print(f"Error processing audio data: {err}")
            return None

    def cleanup(self):
        """Terminate the PyAudio instance created in __init__."""
        self.audio.terminate()


In [None]:
# Record one utterance, then transcribe it with Whisper on the CPU.
recorder = AudioRecorder()
try:
    frames = recorder.record_audio()
    audio_data = recorder.get_audio_data(frames)
    if audio_data is not None and audio_data.size > 0:
        # Summarise the capture instead of dumping the raw sample array.
        print(f"Captured {audio_data.size} samples ({audio_data.size / recorder.RATE:.2f} s)")

        # get_audio_data() returns int16 PCM, but the speech-recognition
        # pipeline expects a float waveform, so scale the samples into
        # [-1.0, 1.0) before handing them over.
        waveform = audio_data.astype(np.float32) / 32768.0

        from transformers import pipeline
        asr = pipeline("automatic-speech-recognition",
                       model="openai/whisper-medium",
                       device='cpu')
        # Pass the capture rate explicitly so the pipeline does not have to
        # assume the rate of a bare array.
        transcription = asr({"sampling_rate": recorder.RATE, "raw": waveform})
        print("Transcription:", transcription['text'])
finally:
    # Release PyAudio even if recording or transcription failed.
    recorder.cleanup()

In [None]:
import pyaudio
import webrtcvad
import numpy as np
from transformers import pipeline
import librosa
import soundfile as sf
import io

class AudioRecorder:
    """Record one utterance with WebRTC VAD gating and transcribe it with Whisper.

    Audio is captured as mono float32 samples. Each frame is converted to
    16-bit PCM for the VAD only; the float32 data is kept for transcription.

    Args:
        model_name: Hugging Face model id for the speech-recognition pipeline.
        device: device string passed to the pipeline (defaults to 'cuda').
    """

    # Sample rate the waveform is brought to before it is sent to Whisper.
    WHISPER_RATE = 16000

    def __init__(self, model_name="openai/whisper-medium", device='cuda'):
        # Capture settings: mono float32 at 16 kHz.
        self.FORMAT = pyaudio.paFloat32
        self.CHANNELS = 1
        self.RATE = 16000
        # 320 samples at 16 kHz is a 20 ms frame.
        self.CHUNK = 320
        # Not read by any method of this class; record_audio() uses its
        # silence_timeout argument for the stop condition.
        self.SILENCE_THRESHOLD = 2

        # PyAudio handle used to open the input stream.
        self.audio = pyaudio.PyAudio()

        # Voice-activity detector (mode 1).
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(1)

        # Whisper speech-recognition pipeline.
        self.asr = pipeline("automatic-speech-recognition",
                            model=model_name,
                            device=device)

    def is_speech(self, frame):
        """Return the VAD verdict for one float32 frame.

        The frame is converted to 16-bit PCM bytes for the VAD. Samples
        outside [-1.0, 1.0] are clipped first so they saturate instead of
        wrapping around in the int16 conversion. Any exception is printed
        and reported as "not speech" (False).
        """
        try:
            samples = np.frombuffer(frame, dtype=np.float32)
            frame_int16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
            return self.vad.is_speech(frame_int16, self.RATE)
        except Exception as e:
            print(f"Error processing frame: {e}")
            return False

    def record_audio(self, silence_timeout=2):
        """Capture frames from the first speech frame until sustained silence.

        Args:
            silence_timeout: seconds of consecutive non-speech audio that end
                the recording once it has started.

        Returns:
            List of raw float32 frame byte strings, trailing silence
            included. Empty if interrupted with Ctrl-C before any speech.
        """
        frames = []
        recording = False
        silent_chunks = 0

        stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK
        )

        print("Listening for speech...")

        try:
            while True:
                frame = stream.read(self.CHUNK, exception_on_overflow=False)

                if self.is_speech(frame):
                    if not recording:
                        print("Speech detected - Recording started.")
                        recording = True
                    frames.append(frame)
                    silent_chunks = 0
                elif recording:
                    silent_chunks += 1
                    frames.append(frame)

                    # Number of CHUNK-sized frames in silence_timeout seconds.
                    if silent_chunks > (silence_timeout * self.RATE) // self.CHUNK:
                        print("Silence detected - Recording stopped.")
                        break

        except KeyboardInterrupt:
            print("\nRecording interrupted by user")
        finally:
            # Always release the input stream, however the loop ended.
            stream.stop_stream()
            stream.close()

        return frames

    def prepare_audio_for_whisper(self, frames):
        """Join frames into a peak-normalised float32 waveform at WHISPER_RATE.

        Returns:
            1-D float32 numpy array, or None (after printing a message) when
            there is nothing to process or conversion fails. Entirely silent
            audio is returned unscaled rather than divided by a zero peak.
        """
        if not frames:
            print("No audio frames to process")
            return None

        try:
            audio_data = np.frombuffer(b''.join(frames), dtype=np.float32)
            if audio_data.size == 0:
                print("No audio frames to process")
                return None

            # Peak-normalise; skip the division when the peak is zero so
            # silent input does not turn into NaNs.
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak
            else:
                # np.frombuffer returns a read-only view; hand back a real array.
                audio_data = audio_data.copy()

            # Bring the waveform to the rate Whisper is fed at.
            if self.RATE != self.WHISPER_RATE:
                audio_data = librosa.resample(
                    audio_data, orig_sr=self.RATE, target_sr=self.WHISPER_RATE
                )

            return audio_data

        except Exception as e:
            print(f"Error processing audio data: {e}")
            return None

    def transcribe_audio(self, audio_data):
        """Transcribe a waveform produced by prepare_audio_for_whisper().

        The waveform is declared to the pipeline at WHISPER_RATE, the rate
        prepare_audio_for_whisper() resamples to.

        Returns:
            The transcription text, or None when ``audio_data`` is None or
            empty, or when the pipeline raises (the error is printed).
        """
        try:
            if audio_data is not None and len(audio_data) > 0:
                transcription = self.asr(
                    {"sampling_rate": self.WHISPER_RATE, "raw": audio_data}
                )
                return transcription['text']
            return None
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None

    def cleanup(self):
        """Terminate the PyAudio instance created in __init__."""
        self.audio.terminate()


In [None]:
import asyncio
from langchain_ollama import ChatOllama
from queue import Queue
import re

class AsyncSentenceQueue:
    """Buffer streamed text and release it one sentence at a time.

    Text fragments are accumulated until a run of '.', '!' or '?' appears;
    everything up to and including that run is queued as one sentence.
    """

    def __init__(self):
        self.current_sentence = ""      # text received but not yet terminated
        self.queue = asyncio.Queue()    # completed sentences awaiting a consumer

    async def put(self, text: str):
        """Append ``text`` to the buffer and queue every sentence it completes."""
        self.current_sentence += text

        # With a single capture group, re.split returns
        # [body, terminator, body, terminator, ..., tail].
        pieces = re.split(r'([.!?]+)', self.current_sentence)
        tail = pieces[-1]

        for body, terminator in zip(pieces[0:-1:2], pieces[1::2]):
            candidate = body + terminator
            if candidate.strip():  # skip anything that is only whitespace
                await self.queue.put(candidate)

        # Whatever follows the last terminator stays buffered.
        self.current_sentence = tail

    async def get(self):
        """Wait for and return the next queued sentence."""
        return await self.queue.get()

    def task_done(self):
        """Tell the underlying queue that one retrieved sentence was handled."""
        self.queue.task_done()

    async def finish(self):
        """Queue the buffered remainder, if it is more than whitespace."""
        leftover = self.current_sentence
        if not leftover.strip():
            return
        await self.queue.put(leftover)
        self.current_sentence = ""

async def generate_text(text: str, sentence_queue: AsyncSentenceQueue,
                        model_name: str = "llama3.2:1b"):
    """Stream a model response for ``text`` into ``sentence_queue``.

    Args:
        text: prompt sent to the chat model.
        sentence_queue: queue that receives the streamed content; its
            finish() is awaited once the stream ends so trailing text is
            flushed.
        model_name: Ollama model to use.

    Raises:
        asyncio.CancelledError: re-raised after printing a notice when the
            task is cancelled mid-stream.
    """
    model = ChatOllama(model=model_name)
    try:
        # Use the async streaming API: iterating the synchronous stream()
        # inside a coroutine would block the event loop while waiting for
        # each chunk, starving the other tasks.
        async for chunk in model.astream(text):
            await sentence_queue.put(chunk.content)
        await sentence_queue.finish()  # Queue any remaining text
    except asyncio.CancelledError:
        print("\nOutput generation cancelled.")
        raise

async def display_queue(sentence_queue: AsyncSentenceQueue):
    """Print queued sentences one by one, pausing briefly after each.

    Runs until cancelled; on cancellation a notice is printed and the
    CancelledError is re-raised.
    """
    pause_seconds = 0.5  # artificial delay so the queuing effect is visible
    try:
        while True:
            next_sentence = await sentence_queue.get()
            print(next_sentence, end='', flush=True)
            await asyncio.sleep(pause_seconds)
            # Mark the sentence handled only after the pause has elapsed.
            sentence_queue.task_done()
    except asyncio.CancelledError:
        print("\nDisplay task cancelled.")
        raise

async def main(prompt="tell me a story", run_seconds=5):
    """Run the generator and display tasks for a while, then cancel both.

    Args:
        prompt: text passed to generate_text().
        run_seconds: how long to let the tasks run before cancelling them.

    Raises:
        asyncio.CancelledError: re-raised (after printing a notice) when this
            coroutine is itself cancelled. The child tasks are cancelled and
            awaited in that case too, so they are never left running.
    """
    sentence_queue = AsyncSentenceQueue()

    # Producer and consumer run concurrently on the same queue.
    generator_task = asyncio.create_task(generate_text(prompt, sentence_queue))
    display_task = asyncio.create_task(display_queue(sentence_queue))

    try:
        await asyncio.sleep(run_seconds)
        print("\nCancelling tasks...")
    except asyncio.CancelledError:
        print("Main task cancelled.")
        raise
    finally:
        # Runs on both the normal and the cancelled path. cancel() is a
        # no-op for a task that has already finished.
        generator_task.cancel()
        display_task.cancel()
        results = await asyncio.gather(
            generator_task, display_task, return_exceptions=True
        )
        # CancelledError is not an Exception subclass, so only genuine task
        # failures are reported here instead of being silently discarded.
        for result in results:
            if isinstance(result, Exception):
                print(f"Task failed: {result!r}")

In [None]:
def main():
    """Record one utterance, transcribe it, and print the outcome.

    The recorder's cleanup() is always called, whether or not recording or
    transcription succeeds.
    """
    recorder = AudioRecorder()
    try:
        # Capture, then convert the captured frames for Whisper.
        captured_frames = recorder.record_audio()
        waveform = recorder.prepare_audio_for_whisper(captured_frames)

        # Nothing usable was recorded: there is nothing to transcribe.
        if waveform is None:
            return

        text = recorder.transcribe_audio(waveform)
        if not text:
            print("No transcription available")
        else:
            print("Transcription:", text)
    finally:
        recorder.cleanup()

# Entry point: call main() when this code runs with __name__ set to "__main__".
# NOTE(review): an earlier cell in this notebook also defines an async `main`;
# the name resolves to whichever definition was executed most recently.
if __name__ == "__main__":
    main()