In [1]:
import logging
import os

# Move the kernel's working directory to the project root (two levels above the
# notebook), or to TARGET_DIRECTORY when that environment variable is set.
#
# The directory the kernel started in is remembered the first time this cell
# runs, so re-running the cell resolves "../.." from the same place instead of
# climbing two more levels on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

try:
    os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))
    target_directory = os.getenv(
        "TARGET_DIRECTORY", os.getcwd()
    )  # Use environment variable if available
    # isdir (not exists): a path that points at a regular file cannot be
    # entered, so report it through the same "does not exist" branch.
    if os.path.isdir(target_directory):
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info(f"Successfully changed directory to: {os.getcwd()}")
    else:
        logging.error(f"Directory does not exist: {target_directory}")
except Exception as e:
    # Best effort: log the failure (with traceback) and leave the cwd as it is.
    logging.exception(f"An error occurred while changing directory: {e}")

Changed directory to: c:\Users\pablosal\Desktop\gbb-ai-audio-agent


## Azure AI Speech (VAD)

In [2]:
from src.speech.text_to_speech import SpeechSynthesizer
from src.speech.speech_recognizer import StreamingSpeechRecognizerFromBytes
from openai import AzureOpenAI

# Each client is built only when its name is not yet defined, so re-running
# this cell keeps the objects created earlier in the same kernel session.

# Streaming speech-to-text client.
if "az_speech_recognizer_stream_client" not in globals():
    az_speech_recognizer_stream_client = StreamingSpeechRecognizerFromBytes(
        candidate_languages=["en-US", "fr-FR", "de-DE", "es-ES", "it-IT"],
        audio_format="pcm",
        vad_silence_timeout_ms=800,
        use_semantic_segmentation=False,
    )

# Text-to-speech client.
if "az_speach_synthesizer_client" not in globals():
    az_speach_synthesizer_client = SpeechSynthesizer(
        voice="en-US-Ava:DragonHDLatestNeural",
        playback="always",
    )

# Azure OpenAI chat client; endpoint and key are read from the environment.
if "client" not in globals():
    client = AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_KEY"),
        api_version="2025-02-01-preview",
    )

2025-07-20 21:01:11,259 - micro - MainProcess - INFO     Speech synthesizer initialized successfully (text_to_speech.py:__init__:122)
INFO:micro:Speech synthesizer initialized successfully


In [3]:
# Characters treated as the end of a sentence when streaming text to TTS.
TTS_ENDS = set(".!?;\n")

# Module-level conversation state shared with the callbacks defined later.
is_synthesizing = False            # True while the assistant is flagged as speaking
user_buffer = assistant_buffer = ""  # pending user text / assistant text
tts_thread = None

In [4]:
import os, time, threading

def on_final(text, lang):
    """Record a finalized transcript and echo it to the notebook output.

    The stripped text plus a trailing newline is appended to the module-level
    ``user_buffer``; the unmodified text and its language tag are printed.
    """
    global user_buffer
    transcript_line = f"{text.strip()}\n"
    user_buffer = user_buffer + transcript_line
    print(f"🧾 User (final) in {lang}: {text}")


def assistant_speak(text):
    """Flag synthesis as active and hand ``text`` to the speech synthesizer.

    Sets the module-level ``is_synthesizing`` flag to ``True``, prints two
    trace lines, and calls ``start_speaking_text`` on the synthesizer client.
    Returns ``None``.
    """
    global is_synthesizing
    print("Hi there, I am a assistant_speak callback!")
    is_synthesizing = True
    print("Syntetixing:", is_synthesizing)
    start_speaking = az_speach_synthesizer_client.start_speaking_text
    start_speaking(text)


def on_partial(text, lang):
    """Handle an interim transcript: clear the speaking flag and echo the text.

    When the module-level ``is_synthesizing`` flag is set it is reset to
    ``False``; the partial text and its language tag are then printed.
    """
    global is_synthesizing
    assistant_flagged = is_synthesizing
    if assistant_flagged:
        # Interrupting playback via az_speach_synthesizer_client.stop_speaking()
        # is currently disabled; only the flag is cleared.
        is_synthesizing = False
    print(f"🗣️ User (partial) in {lang}: {text}")


# Route recognizer events to the notebook callbacks, then begin recognition.
az_speech_recognizer_stream_client.set_partial_result_callback(on_partial)
az_speech_recognizer_stream_client.set_final_result_callback(on_final)
az_speech_recognizer_stream_client.start()
print("🎙️ Speak now...")

# Open a PyAudio input stream that the mic thread reads from.
import pyaudio

RATE = 16000
CHANNELS = 1
CHUNK = 1024  # frames requested per read
audio = pyaudio.PyAudio()
stream = audio.open(
    input=True,
    rate=RATE,
    channels=CHANNELS,
    format=pyaudio.paInt16,
    frames_per_buffer=CHUNK,
)


# Set during shutdown to make the microphone thread leave its loop.
mic_stop_event = threading.Event()


def mic_loop():
    """Forward microphone audio to the speech recognizer until asked to stop.

    Reads ``CHUNK`` frames at a time from the module-level ``stream`` and
    passes the raw bytes to ``az_speech_recognizer_stream_client.write_bytes``.
    The loop ends once ``mic_stop_event`` is set, which lets the shutdown code
    close the audio stream without a reader still using it.
    """
    while not mic_stop_event.is_set():
        data = stream.read(CHUNK, exception_on_overflow=False)
        az_speech_recognizer_stream_client.write_bytes(data)


mic_thread = threading.Thread(target=mic_loop, daemon=True)
mic_thread.start()

# Chat history sent to the model on every turn (system + user + assistant).
messages = [{"role": "system", "content": "You are a helpful assistant."}]

user_buffer = ""  # Filled in by the on_final STT callback

# str.endswith() needs a tuple of candidate suffixes.
sentence_end_suffixes = tuple(TTS_ENDS)

try:
    while True:
        if user_buffer:
            # Take the pending text and clear the buffer in one statement to
            # keep the window in which a concurrent on_final() append could be
            # lost as small as possible.
            user_turn, user_buffer = user_buffer, ""
            messages.append({"role": "user", "content": user_turn})

            completion = client.chat.completions.create(
                stream=True,
                messages=messages,
                max_tokens=4096,
                temperature=1.0,
                top_p=1.0,
                model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_ID"),
            )

            pending_sentence = []  # chunks not yet handed to TTS
            reply_chunks = []      # every chunk of this assistant reply

            for chunk in completion:
                if not chunk.choices:
                    continue
                chunk_text = getattr(chunk.choices[0].delta, "content", None)
                if not chunk_text:
                    continue
                pending_sentence.append(chunk_text)
                reply_chunks.append(chunk_text)
                print(chunk_text, end="", flush=True)
                # A chunk that ends with a sentence marker closes the current
                # sentence, whether or not the marker arrived as its own chunk.
                if chunk_text.endswith(sentence_end_suffixes):
                    sentence = "".join(pending_sentence).strip()
                    if sentence:  # never send empty text to TTS
                        assistant_speak(sentence)
                    pending_sentence.clear()

            # Speak whatever is left when the reply does not end on a marker.
            trailing_text = "".join(pending_sentence).strip()
            if trailing_text:
                assistant_speak(trailing_text)
            print()  # finish line after streaming LLM response

            # Keep the assistant turn in the history so later requests carry
            # the full conversation, not only the user side.
            assistant_reply = "".join(reply_chunks)
            if assistant_reply:
                messages.append({"role": "assistant", "content": assistant_reply})

        time.sleep(0.1)
except KeyboardInterrupt:
    print("Exiting...")

finally:
    # Stop the microphone reader first so nothing reads from the stream while
    # it is being closed.
    mic_stop_event.set()
    mic_thread.join(timeout=1.0)
    az_speech_recognizer_stream_client.stop()
    az_speach_synthesizer_client.stop_speaking()
    # Release the audio device so re-running the cell starts from a clean state.
    stream.stop_stream()
    stream.close()
    audio.terminate()
    print("Stream stopped and audio terminated.")


2025-07-20 21:01:12,659 - micro - MainProcess - INFO     Starting recognition from byte stream... (speech_recognizer.py:start:91)
INFO:micro:Starting recognition from byte stream...
2025-07-20 21:01:12,693 - micro - MainProcess - INFO     Recognition started. (speech_recognizer.py:start:153)
INFO:micro:Recognition started.


🎙️ Speak now...
🗣️ User (partial) in en-US: umm look
🗣️ User (partial) in en-US: umm look i'm trying
🗣️ User (partial) in en-US: umm look i'm trying to
🗣️ User (partial) in en-US: umm look i'm trying to explain
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i don't
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i don't know
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i don't know if
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i don't know if you
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i don't know if you are able
🗣️ User (partial) in en-US: umm look i'm trying to explain madrid uh but i mean i do

2025-07-20 21:01:39,399 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: Of course, I can help you with that!... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: Of course, I can help you with that!...


Hi there, I am a assistant_speak callback!
Syntetixing: True
 It sounds like you're trying to describe Madrid.

2025-07-20 21:01:39,421 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: It sounds like you're trying to describe Madrid.... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: It sounds like you're trying to describe Madrid....


Hi there, I am a assistant_speak callback!
Syntetixing: True
 What specific aspects of the city do you want to explain?

2025-07-20 21:01:39,451 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: What specific aspects of the city do you want to e... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: What specific aspects of the city do you want to e...


Hi there, I am a assistant_speak callback!
Syntetixing: True
 For example, are you looking to talk about its culture, history, attractions, or maybe its cuisine?

2025-07-20 21:01:39,808 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: For example, are you looking to talk about its cul... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: For example, are you looking to talk about its cul...


Hi there, I am a assistant_speak callback!
Syntetixing: True
 Let me know how I can assist you!

2025-07-20 21:01:39,846 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: Let me know how I can assist you!... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: Let me know how I can assist you!...


Hi there, I am a assistant_speak callback!
Syntetixing: True

🗣️ User (partial) in es-ES: no yo
🗣️ User (partial) in es-ES: no yo no
🗣️ User (partial) in es-ES: no yo no creo
🗣️ User (partial) in es-ES: no yo no creo que
🗣️ User (partial) in es-ES: no yo no creo que te inter
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que te estoy
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que te estoy hablando
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que te estoy hablando porque
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que te estoy hablando porque no no
🗣️ User (partial) in es-ES: no yo no creo que te enteres de nada de lo que te estoy hablando porque no no escuchas
🗣️ User (partial) 

2025-07-20 21:02:07,184 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: Entiendo que puede ser frustrante.... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: Entiendo que puede ser frustrante....


Hi there, I am a assistant_speak callback!
Syntetixing: True
 Estoy aquí para ayudarte a explicar lo que quieras sobre Madrid.

2025-07-20 21:02:07,206 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: Estoy aquí para ayudarte a explicar lo que quieras... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: Estoy aquí para ayudarte a explicar lo que quieras...


Hi there, I am a assistant_speak callback!
Syntetixing: True
 Si me das un poco más de contexto o detalles sobre lo que te gustaría explicar, podría ser más útil.

2025-07-20 21:02:07,730 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: Si me das un poco más de contexto o detalles sobre... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: Si me das un poco más de contexto o detalles sobre...


Hi there, I am a assistant_speak callback!
Syntetixing: True
 ¿Te gustaría hablar sobre la cultura, la historia, los lugares turísticos, o algo más específico?

2025-07-20 21:02:07,767 - micro - MainProcess - INFO     [🔊] Starting streaming speech synthesis for text: ¿Te gustaría hablar sobre la cultura, la historia,... (text_to_speech.py:start_speaking_text:270)
INFO:micro:[🔊] Starting streaming speech synthesis for text: ¿Te gustaría hablar sobre la cultura, la historia,...


Hi there, I am a assistant_speak callback!
Syntetixing: True



2025-07-20 21:02:18,998 - micro - MainProcess - INFO     Session stopped. (speech_recognizer.py:_on_session_stopped:282)
INFO:micro:Session stopped.
2025-07-20 21:02:19,004 - micro - MainProcess - INFO     Recognition stopped. (speech_recognizer.py:stop:228)
INFO:micro:Recognition stopped.
2025-07-20 21:02:19,004 - micro - MainProcess - INFO     [🛑] Stopping speech synthesis... (text_to_speech.py:stop_speaking:291)
INFO:micro:[🛑] Stopping speech synthesis...


Exiting...
Stream stopped and audio terminated.


In [5]:
import os
import azure.cognitiveservices.speech as speechsdk

# Speech resource credentials are read from the environment.
speech_key = os.getenv("AZURE_SPEECH_KEY")
speech_region = os.getenv("AZURE_SPEECH_REGION")
speech_config = speechsdk.SpeechConfig(region=speech_region, subscription=speech_key)

# --- 1) segmentation knobs -------------------------------------------------
_property_ids = speechsdk.PropertyId
# content-aware breaks
speech_config.set_property(_property_ids.Speech_SegmentationStrategy, "Semantic")
# max 60 s per phrase fallback
speech_config.set_property(_property_ids.Speech_SegmentationMaximumTimeMs, "60000")
# 300 ms silence guard (original note: ignored for the "Semantic" strategy)
speech_config.set_property(_property_ids.Speech_SegmentationSilenceTimeoutMs, "300")
# optional: stable partials every word
speech_config.set_property(
    _property_ids.SpeechServiceResponse_StablePartialResultThreshold, "1"
)

# --- 2) PushAudioInputStream from the mic (16-kHz PCM) ---------------------
import pyaudio, threading, time

RATE = 16_000
CHUNK = 1024
pa = pyaudio.PyAudio()
mic = pa.open(
    input=True,
    rate=RATE,
    channels=1,
    format=pyaudio.paInt16,
    frames_per_buffer=CHUNK,
)

# Describe the pushed audio with the same rate, sample width and channel
# count that the microphone stream was opened with.
stream_format = speechsdk.audio.AudioStreamFormat(
    samples_per_second=RATE,
    bits_per_sample=16,
    channels=1,
)
push_stream = speechsdk.audio.PushAudioInputStream(stream_format)
audio_config = speechsdk.audio.AudioConfig(stream=push_stream)

recognizer = speechsdk.SpeechRecognizer(
    audio_config=audio_config,
    speech_config=speech_config,
)

In [6]:
# Display the recognizer object to confirm it was constructed.
recognizer

<azure.cognitiveservices.speech.SpeechRecognizer at 0x1f0c16292d0>

In [10]:
import os
import azure.cognitiveservices.speech as speechsdk

# Key and region for the Speech resource come from the environment.
speech_key = os.getenv("AZURE_SPEECH_KEY")
speech_region = os.getenv("AZURE_SPEECH_REGION")

speech_config = speechsdk.SpeechConfig(
    subscription=speech_key,
    region=speech_region,
)

# --- 1) segmentation knobs -------------------------------------------------
_PROPS = speechsdk.PropertyId
speech_config.set_property(_PROPS.Speech_SegmentationStrategy, "Semantic")  # content-aware breaks
speech_config.set_property(_PROPS.Speech_SegmentationMaximumTimeMs, "60000")  # max 60 s per phrase fallback
# Speech_SegmentationSilenceTimeoutMs ("300") is deliberately left unset in
# this cell (original note: ignored for the "Semantic" strategy).

# optional: stable partials every word
speech_config.set_property(_PROPS.SpeechServiceResponse_StablePartialResultThreshold, "1")

# --- 2) PushAudioInputStream from the mic (16-kHz PCM) ---------------------
import pyaudio, threading, time

RATE, CHUNK = 16_000, 1024
pa = pyaudio.PyAudio()
_mic_settings = dict(
    format=pyaudio.paInt16,
    channels=1,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)
mic = pa.open(**_mic_settings)

# The push stream is declared with the same rate / width / channels as the mic.
stream_format = speechsdk.audio.AudioStreamFormat(
    samples_per_second=RATE, bits_per_sample=16, channels=1
)
push_stream = speechsdk.audio.PushAudioInputStream(stream_format)
audio_config = speechsdk.audio.AudioConfig(stream=push_stream)

recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config, audio_config=audio_config
)

def mic_loop():
    """Copy microphone audio into the push stream while ``running`` is truthy.

    Each pass reads ``CHUNK`` frames from the module-level ``mic`` stream and
    writes the bytes to ``push_stream``. The ``running`` flag is checked before
    every read.
    """
    while True:
        if not running:
            break
        push_stream.write(mic.read(CHUNK, exception_on_overflow=False))

# Raise the flag that mic_loop polls, then run the loop on a daemon thread.
running = True
mic_thread = threading.Thread(target=mic_loop, daemon=True)
mic_thread.start()

# --- 3) handlers ------------------------------------------------------------
def on_partial(evt):
    """Print the text of an interim recognition event."""
    interim_text = evt.result.text
    print(f"[partial] {interim_text}")

def on_final(evt):
    """Print the text of a finalized recognition event."""
    final_text = evt.result.text
    print(f"[final]   {final_text}")

# Set when the session ends or is canceled, so the wait loop below can finish
# on its own instead of spinning until Ctrl-C.
session_done = threading.Event()


def _on_session_stopped(evt):
    """Report the end of the recognition session and release the wait loop."""
    print("Session stopped")
    session_done.set()


def _on_canceled(evt):
    """Print why recognition was canceled and release the wait loop."""
    # Without this handler a cancellation shows up only as an unexplained
    # "Session stopped".
    print(f"Canceled: {evt.cancellation_details}")
    session_done.set()


recognizer.recognizing.connect(on_partial)
recognizer.recognized.connect(on_final)
recognizer.session_stopped.connect(_on_session_stopped)
recognizer.canceled.connect(_on_canceled)

recognizer.start_continuous_recognition_async().get()
print("🎤 Speak; Ctrl-C to exit")
try:
    # Poll with a timeout so a keyboard interrupt is noticed promptly.
    while not session_done.wait(timeout=0.5):
        pass
except KeyboardInterrupt:
    pass
finally:
    # Always release the microphone, however the loop ended. Lowering the flag
    # first lets mic_loop exit while recognition is being stopped.
    running = False
    recognizer.stop_continuous_recognition_async().get()
    mic.close()
    pa.terminate()

🎤 Speak; Ctrl-C to exit
Session stopped


In [5]:
import azure.cognitiveservices.speech as speechsdk

# Report the Speech SDK version currently loaded in the kernel.
sdk_version = speechsdk.__version__
print(sdk_version)

1.44.0


In [6]:
!pip install azure-cognitiveservices-speech==1.45.0

Collecting azure-cognitiveservices-speech==1.45.0
  Downloading azure_cognitiveservices_speech-1.45.0-py3-none-win_amd64.whl.metadata (1.6 kB)
Downloading azure_cognitiveservices_speech-1.45.0-py3-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   -------------------------------------- - 2.4/2.4 MB 14.9 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 12.6 MB/s eta 0:00:00
Installing collected packages: azure-cognitiveservices-speech
  Attempting uninstall: azure-cognitiveservices-speech
    Found existing installation: azure-cognitiveservices-speech 1.44.0
    Uninstalling azure-cognitiveservices-speech-1.44.0:
      Successfully uninstalled azure-cognitiveservices-speech-1.44.0
Successfully installed azure-cognitiveservices-speech-1.45.0


  You can safely remove it manually.


In [1]:
import azure.cognitiveservices.speech as s

# Confirm which Speech SDK version the kernel imports.
print(s.__version__)

1.45.0


In [None]:
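# Optional dependencies for the Silero VAD cell. Quote the version specifier:
# an unquoted ">" is treated by the shell as an output redirect.
# %pip install "torch>=2.6.0"
# %pip install "torchaudio>=2.6.0"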

In [None]:
import torch, torchaudio, IPython
from src.vad.vad_iterator import VADIterator, int2float

# ----------------- 2. Load model -----------------
vad_model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
vad_iter = VADIterator(
    vad_model,
    threshold=0.5,
    sampling_rate=16000,
    min_silence_duration_ms=150,
    speech_pad_ms=60,
)

# ----------------- 3. Read a WAV -----------------
# NOTE(review): the error recorded under this cell refers to
# ./samples/labs/test_audio.wav -- confirm which path is intended.
wav, sr = torchaudio.load('./samples/labs/test')      # mono, 16-kHz PCM
assert sr == 16000, "Resample or pick a 16-kHz file"

# ----------------- 4. Stream through VAD -----------------
# NOTE: this rebinds the notebook-wide CHUNK that the mic_loop functions in
# earlier cells read at call time.
CHUNK = int(0.03 * sr)   # 30 ms
segments = []
for offset in range(0, wav.shape[1], CHUNK):
    frame = wav[:, offset:offset + CHUNK]
    finished = vad_iter(frame)
    if finished is None:
        continue
    # A non-None result is treated as a finished utterance: stitch its frames
    # together along the sample axis.
    utterance = torch.cat(finished, dim=1)
    segments.append(utterance)
    print(f"Segment {len(segments)} | {utterance.shape[1]/sr:.2f} s")

# ----------------- 5. Listen back -----------------
for number, segment in enumerate(segments, start=1):
    print(f"▶️  Segment {number}")
    player = IPython.display.Audio(segment.squeeze(), rate=sr)
    IPython.display.display(player)



Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\pablosal/.cache\torch\hub\master.zip


RuntimeError: Couldn't find appropriate backend to handle uri ./samples/labs/test_audio.wav and format None.