In [1]:
import logging
import os

# set the directory to the location of the script
try:
    os.chdir("../..")
    target_directory = os.getenv(
        "TARGET_DIRECTORY", os.getcwd()
    )  # Use environment variable if available
    if os.path.exists(target_directory):
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info(f"Successfully changed directory to: {os.getcwd()}")
    else:
        logging.error(f"Directory does not exist: {target_directory}")
except Exception as e:
    logging.exception(f"An error occurred while changing directory: {e}")

Changed directory to: c:\Users\pablosal\Desktop\gbb-ai-audio-agent


In [2]:
from src.speech.text_to_speech import SpeechSynthesizer
from src.speech.speech_recognizer import StreamingSpeechRecognizerFromBytes
from openai import AzureOpenAI

if "az_speech_recognizer_stream_client" not in locals():
    az_speech_recognizer_stream_client = StreamingSpeechRecognizerFromBytes(
        vad_silence_timeout_ms=800,
        use_semantic_segmentation=False,
        audio_format="pcm",
        candidate_languages=["en-US", "fr-FR", "de-DE", "es-ES", "it-IT"],
        enable_diarisation=True,
        speaker_count_hint=2,
        enable_neural_fe=False,
    )

if "az_speach_synthesizer_client" not in locals():
    az_speach_synthesizer_client = SpeechSynthesizer(voice="en-US-Ava:DragonHDLatestNeural")

# Ensure Azure OpenAI client is initialized only if not already defined
if "client" not in locals():
    client = AzureOpenAI(
        api_version="2025-02-01-preview",
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_KEY"),
    )

[2025-08-06 11:24:14,203] INFO - micro: Azure Monitor tracing initialized for speech recognizer
INFO:micro:Azure Monitor tracing initialized for speech recognizer
[2025-08-06 11:24:14,208] INFO - micro: Creating SpeechConfig with API key authentication
INFO:micro:Creating SpeechConfig with API key authentication
[2025-08-06 11:24:14,213] INFO - micro: Azure Monitor tracing initialized for speech synthesizer
INFO:micro:Azure Monitor tracing initialized for speech synthesizer
[2025-08-06 11:24:14,220] INFO - micro: Creating SpeechConfig with API key authentication
INFO:micro:Creating SpeechConfig with API key authentication
[2025-08-06 11:24:14,225] INFO - micro: Speech synthesizer initialized successfully
INFO:micro:Speech synthesizer initialized successfully


In [3]:
# Define end-of-sentence markers for TTS
TTS_ENDS = {".", "!", "?", ";", "\n"}

# Flags and buffers
is_synthesizing = False
user_buffer = ""
assistant_buffer = ""
tts_thread = None

In [None]:
import os, time, json, threading, logging
import azure.cognitiveservices.speech as speechsdk
import pyaudio                                                         # pip install pyaudio

def _extract_speaker(evt: speechsdk.SpeechRecognitionEventArgs) -> str | None:
    """Pull SpeakerId from the hidden JsonResult blob (if diarisation is enabled)."""
    blob = evt.result.properties.get(
        speechsdk.PropertyId.SpeechServiceResponse_JsonResult, "")
    if blob:
        try:
            return str(json.loads(blob).get("SpeakerId"))
        except Exception:
            pass
    return None


##############################################################################
# 3.  STT callbacks (with speaker tags)
##############################################################################
def _partial_evt(evt):
    global is_synthesizing
    if is_synthesizing:
        is_synthesizing = False
    spk = _extract_speaker(evt)
    tag = f"[S{spk}]" if spk is not None else ""
    print(f"🗣️ {tag} partial: {evt.result.text}")

def _final_evt(evt):
    global user_buffer
    spk = _extract_speaker(evt)
    tag = f"[S{spk}]" if spk is not None else ""
    print(f"🧾 {tag} final  : {evt.result.text}")
    user_buffer += evt.result.text.strip() + "\n"

# Hook directly into the underlying SpeechRecognizer events
# (this guarantees we receive the full event object)
rec = az_speech_recognizer_stream_client.speech_recognizer
if rec is None:                 # first run: build recogniser
    az_speech_recognizer_stream_client.prepare_start()
    rec = az_speech_recognizer_stream_client.speech_recognizer

rec.recognizing.connect(_partial_evt)
rec.recognized .connect(_final_evt)

##############################################################################
# 4.  Start recognition & mic streaming
##############################################################################
az_speech_recognizer_stream_client.start()
print("🎙️ Speak now…")

RATE, CHANNELS, CHUNK = 16000, 1, 1024
pa = pyaudio.PyAudio()
mic= pa.open(format=pyaudio.paInt16, channels=CHANNELS, rate=RATE,
             input=True, frames_per_buffer=CHUNK)

def mic_loop():
    while True:
        az_speech_recognizer_stream_client.write_bytes(
            mic.read(CHUNK, exception_on_overflow=False))
threading.Thread(target=mic_loop, daemon=True).start()

##############################################################################
# 5.  OpenAI chat ➜ Azure TTS
##############################################################################
def assistant_speak(text:str):
    global is_synthesizing
    is_synthesizing = True
    az_speach_synthesizer_client.start_speaking_text(text)

messages=[{"role":"system","content":"You are a helpful assistant."}]
while True:
    if user_buffer:
        messages.append({"role":"user","content":user_buffer})
        user_buffer = ""
        stream = client.chat.completions.create(
            stream=True, messages=messages,
            model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_ID"),
            max_tokens=4096, temperature=1.0, top_p=1.0)

        buf=[]
        for chunk in stream:
            part = getattr(chunk.choices[0].delta, "content", "")
            if part:
                buf.append(part)
                print(part, end="", flush=True)
                if part in TTS_ENDS:
                    assistant_speak("".join(buf).strip())
                    buf=[]
        print()
    time.sleep(0.1)

[2025-08-06 11:24:14,688] INFO - micro: Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
INFO:micro:Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
[2025-08-06 11:24:14,699] INFO - micro: Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
INFO:micro:Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
[2025-08-06 11:24:14,711] INFO - micro: Starting recognition from byte stream…


INFO:micro:Starting recognition from byte stream…
[2025-08-06 11:24:14,720] INFO - micro: Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
INFO:micro:Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
[2025-08-06 11:24:14,733] INFO - micro: Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
INFO:micro:Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
[2025-08-06 11:24:14,753] INFO - micro: Recognition started.
INFO:micro:Recognition started.


🎙️ Speak now…


KeyboardInterrupt: 

Exception in thread Thread-3 (mic_loop):
Traceback (most recent call last):
  File "c:\Users\pablosal\AppData\Local\anaconda3\envs\audioagent\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\pablosal\AppData\Local\anaconda3\envs\audioagent\Lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\pablosal\AppData\Local\anaconda3\envs\audioagent\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\pablosal\AppData\Local\Temp\ipykernel_39032\3547244998.py", line 59, in mic_loop
  File "c:\Users\pablosal\AppData\Local\anaconda3\envs\audioagent\Lib\site-packages\pyaudio\__init__.py", line 570, in read
    return pa.read_stream(self._stream, num_frames,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: [Errno -9999] Unanticipated host error


In [None]:
az_speech_recognizer_stream_client.close_stream()
az_speach_synthesizer_client.stop_speaking()

[2025-08-06 10:45:58,878] INFO - micro: [🛑] Stopping speech synthesis...
INFO:micro:[🛑] Stopping speech synthesis...


[2025-08-06 10:45:59,080] INFO - micro: Session stopped.
INFO:micro:Session stopped.
