### Text Stopping

In [3]:
import asyncio
from langchain_ollama import ChatOllama

model = ChatOllama(model="llama3.2:1b")

# Convert a regular generator to an asynchronous generator
async def async_stream(generator):
    for item in generator:
        yield item
        await asyncio.sleep(0)  # Allows control back to the event loop for async compatibility

async def generate_text(text: str):
    try:
        stream = async_stream(model.stream(text))
        async for line in stream:
            print(line.content, end='', flush=True)
    except asyncio.CancelledError:
        print("Output generation cancelled.")

async def main():
    task = asyncio.create_task(generate_text("tell me a story"))
    await asyncio.sleep(2)  # Wait for 2 seconds before canceling the task
    print("Cancelling the task...")
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        print("Task was cancelled successfully.")

# Directly await in Jupyter notebook
await main()


Once upon a time, in a small village nestled in the rolling hills of Tuscany, there was a tiny shop called "La Bottega dei Sogni" (The Shop of Dreams). It was owned by an elderly woman named SignCancelling the task...
Output generation cancelled.


### Speech to Text

In [30]:
import numpy as np
import sounddevice as sd
from transformers import pipeline
import time

# Load the ASR model
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small",device='cpu')

# Recording parameters
sample_rate = 16000
pause_threshold = 0.5  # seconds of silence to consider as a pause
energy_threshold = 0.02  # threshold for audio energy
max_duration = 3  

def record_and_transcribe():
    print("Listening...")
    audio_buffer = []  # Buffer to hold recorded audio
    start_time = time.time()  # Record the start time

    while True:
        # Record a short chunk of audio
        audio_chunk = sd.rec(int(sample_rate * 0.5), samplerate=sample_rate, channels=1, dtype='float32')
        sd.wait()  # Wait for the recording to finish

        # Calculate the energy of the audio chunk
        audio_energy = np.mean(np.abs(audio_chunk))

        # Append the audio chunk to the buffer only if it exceeds the energy threshold
        if audio_energy > energy_threshold:
            audio_buffer.append(audio_chunk.flatten())

        # Check if the recording duration has exceeded max_duration
        if time.time() - start_time > max_duration:
            break

        # Check if the last chunk was above the threshold
        if len(audio_buffer) > 0:
            last_energy = np.mean(np.abs(audio_buffer[-1]))
            if last_energy < energy_threshold:
                pause_start = time.time()  # Start counting silence duration

                # Continue checking for silence for pause_threshold seconds
                while True:
                    audio_chunk = sd.rec(int(sample_rate * 0.5), samplerate=sample_rate, channels=1, dtype='float32')
                    sd.wait()  # Wait for the recording to finish
                    
                    audio_energy = np.mean(np.abs(audio_chunk))
                    
                    if audio_energy > energy_threshold:
                        print("Speaker resumed, continuing recording.")
                        audio_buffer.append(audio_chunk.flatten())
                        break
                    
                    if time.time() - pause_start >= pause_threshold:
                        print("Pause detected, stopping recording.")
                        break

    # Convert the audio buffer to a single numpy array
    if audio_buffer:  # Ensure there's audio data to process
        audio_data = np.concatenate(audio_buffer)

        # Transcribe the recorded audio
        transcription = asr(audio_data)
        print("Transcription:", transcription['text'])
    else:
        print("No speech detected during the recording.")

if __name__ == "__main__":
    record_and_transcribe()


Listening...
Transcription:  Hello.


In [32]:
import numpy as np
import sounddevice as sd
from transformers import pipeline
import time
from collections import deque
from threading import Thread, Event
import queue

class AudioProcessor:
    def __init__(self, 
                 sample_rate=16000,
                 chunk_duration=0.1,  # Smaller chunks for faster response
                 pause_threshold=0.5,
                 energy_threshold=0.02,
                 max_duration=3):
        
        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        self.chunk_size = int(sample_rate * chunk_duration)
        self.pause_threshold = pause_threshold
        self.energy_threshold = energy_threshold
        self.max_duration = max_duration
        
        # Use queue for thread-safe audio processing
        self.audio_queue = queue.Queue()
        self.stop_recording = Event()
        
        # Initialize ASR model only once
        self.asr = pipeline("automatic-speech-recognition", 
                          model="openai/whisper-small",
                          device='cpu')
        
        # Use deque with maxlen for automatic memory management
        max_chunks = int(max_duration / chunk_duration)
        self.audio_buffer = deque(maxlen=max_chunks)

    def _calculate_energy(self, audio_chunk):
        # Vectorized energy calculation
        return np.mean(np.abs(audio_chunk))

    def _record_audio(self):
        """Record audio in a separate thread"""
        with sd.InputStream(samplerate=self.sample_rate,
                          channels=1,
                          dtype=np.float32,
                          blocksize=self.chunk_size,
                          callback=self._audio_callback):
            self.stop_recording.wait()

    def _audio_callback(self, indata, frames, time_info, status):
        """Callback for audio stream processing"""
        if status:
            print(f'Error: {status}')
        self.audio_queue.put(indata.copy())

    def _process_audio_chunk(self, audio_chunk):
        """Process a single chunk of audio data"""
        energy = self._calculate_energy(audio_chunk)
        if energy > self.energy_threshold:
            self.audio_buffer.append(audio_chunk.flatten())
            return True
        return False

    def record_and_transcribe(self):
        """Main method to record and transcribe audio"""
        print("Listening...")
        
        # Start recording thread
        recording_thread = Thread(target=self._record_audio)
        recording_thread.start()
        
        start_time = time.time()
        silence_start = None
        
        try:
            while True:
                # Check max duration
                if time.time() - start_time > self.max_duration:
                    break

                # Get audio chunk from queue with timeout
                try:
                    audio_chunk = self.audio_queue.get(timeout=0.1)
                except queue.Empty:
                    continue

                # Process the chunk
                has_speech = self._process_audio_chunk(audio_chunk)
                
                # Pause detection logic
                if not has_speech:
                    if silence_start is None:
                        silence_start = time.time()
                    elif time.time() - silence_start >= self.pause_threshold:
                        break
                else:
                    silence_start = None

        finally:
            # Clean up
            self.stop_recording.set()
            recording_thread.join()

        # Process recorded audio
        if len(self.audio_buffer) > 0:
            # Efficient concatenation of all audio chunks
            audio_data = np.concatenate(self.audio_buffer)
            
            # Transcribe
            transcription = self.asr(audio_data)
            print("Transcription:", transcription['text'])
            return transcription['text']
        else:
            print("No speech detected during the recording.")
            return ""

if __name__ == "__main__":
    processor = AudioProcessor()
    processor.record_and_transcribe()

Listening...
Transcription:  Hello, hello, hello.


### Text to speech

In [20]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import sounddevice as sd

class TTSEngine:
    def __init__(self):
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.speaker_embeddings = torch.tensor(
            load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7]["xvector"]
        ).unsqueeze(0)
        
    def speak(self, text):
        inputs = self.processor(text=text, return_tensors="pt")
        speech = self.model.generate_speech(
            inputs["input_ids"], 
            self.speaker_embeddings, 
            vocoder=self.vocoder
        )
        audio_data = speech.numpy() / np.max(np.abs(speech.numpy()))
        sd.play(audio_data, samplerate=16500)
        sd.wait()

# Usage
if __name__ == "__main__":
    tts = TTSEngine()
    tts.speak("Once upon a time, in a small village nestled between two great mountains, there lived a young girl named Aria.")

In [39]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import sounddevice as sd

class TTSEngine:
    # Different speaker indices from CMU Arctic dataset
    VOICE_TYPES = {
        'bdl': 0,    # Male voice (BDL) - Deep broadcast voice
        'rms': 1,    # Male voice (RMS) - Professional narrative voice
        'jmk': 2,    # Male voice (JMK) - Clear articulate voice
        'awb': 3,    # Male voice (AWB) - Scottish accent
        'ksp': 4,    # Male voice (KSP) - Energetic voice
        'rxr': 5,    # Male voice (RXR) - Deeper resonant voice
        'aew': 6,    # Male voice (AEW) - Natural conversational voice
        'fem': 7     # Male voice (FEM) - Smooth tenor voice
    }
    
    def __init__(self):
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        self.current_voice = 'bdl'
        self.set_voice(self.current_voice)
        
    def set_voice(self, voice_type):
        if voice_type not in self.VOICE_TYPES:
            raise ValueError(f"Voice type must be one of: {list(self.VOICE_TYPES.keys())}")
        voice_idx = self.VOICE_TYPES[voice_type]
        self.speaker_embeddings = torch.tensor(
            self.embeddings_dataset[voice_idx]["xvector"]
        ).unsqueeze(0)
        self.current_voice = voice_type
        
    def speak(self, text, voice_type=None):
        if voice_type:
            self.set_voice(voice_type)
            
        inputs = self.processor(text=text, return_tensors="pt")
        speech = self.model.generate_speech(
            inputs["input_ids"], 
            self.speaker_embeddings, 
            vocoder=self.vocoder
        )
        audio_data = speech.numpy() / np.max(np.abs(speech.numpy()))
        sd.play(audio_data, samplerate=17000)
        sd.wait()

# Usage example
if __name__ == "__main__":
    tts = TTSEngine()
    
    # Test all voices with the same text
    test_text = """Once upon a time, in a small village nestled between two great mountains, there lived a young girl named Aria."""
    tts.speak(test_text, "rms")

In [22]:
tts.speak(test_text, "fem")

In [23]:
tts.speak(test_text, "ksp")

In [24]:
tts.speak(test_text, "rxr")

In [25]:
tts.speak(test_text, "bdl")

In [26]:
tts.speak(test_text, "rms")