In [1]:
import logging
import os

# Move the kernel's working directory to the parent of the directory the
# notebook was started in (presumably the repository root, so that the
# `src.*` imports in later cells resolve), unless the TARGET_DIRECTORY
# environment variable names another location.
#
# The starting directory is recorded the first time this cell runs so that
# re-running the cell always resolves the same parent instead of climbing one
# level higher on every execution.
try:
    if "notebook_start_dir" not in globals():
        notebook_start_dir = os.getcwd()

    os.chdir(os.path.join(notebook_start_dir, os.pardir))
    target_directory = os.getenv(
        "TARGET_DIRECTORY", os.getcwd()
    )  # Use environment variable if available
    if os.path.exists(target_directory):
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info(f"Successfully changed directory to: {os.getcwd()}")
    else:
        logging.error(f"Directory does not exist: {target_directory}")
except Exception as e:
    # Best effort: record the failure and leave the working directory as it is.
    logging.exception(f"An error occurred while changing directory: {e}")

Changed directory to: c:\Users\pablosal\Desktop\gbb-ai-audio-agent


In [2]:
from src.speech.speech_recognizer import SpeechRecognizer, StreamingSpeechRecognizer
from src.speech.text_to_speech import SpeechSynthesizer
from src.speech.speech_recognizer import StreamingSpeechRecognizerFromBytes
from openai import AzureOpenAI

# Build each service client at most once per kernel session: when a name is
# already present in the namespace, re-running this cell leaves the existing
# object untouched.
if "az_speech_recognizer_client" not in locals():
    az_speech_recognizer_client = SpeechRecognizer()

# Recognizer fed with raw audio bytes (see `write_bytes` in the mic loop).
# NOTE(review): vad_silence_timeout_ms presumably is the trailing-silence
# window before an utterance is finalized -- confirm in speech_recognizer.py.
if "az_speech_recognizer_stream_client" not in locals():
    az_speech_recognizer_stream_client = StreamingSpeechRecognizerFromBytes(
        candidate_languages=["en-US", "fr-FR", "de-DE", "es-ES", "it-IT"],
        audio_format="pcm",
        vad_silence_timeout_ms=800,
    )

# The variable name keeps its existing spelling because other cells refer to it.
if "az_speach_synthesizer_client" not in locals():
    az_speach_synthesizer_client = SpeechSynthesizer()

# Azure OpenAI chat client; endpoint and key are read from the environment.
if "client" not in locals():
    client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_version="2025-02-01-preview",
    )


In [3]:
# Characters that mark the end of a sentence; the conversation loop uses them
# to decide when buffered LLM text is handed to text-to-speech.
TTS_ENDS = set(".!?;\n")

# Shared conversation state, reset to a clean slate.
tts_thread = None  # no TTS thread yet
is_synthesizing = False  # set by assistant_speak, cleared by on_partial
user_buffer = assistant_buffer = ""  # pending user text / assistant text

In [None]:
# Manual control cell: ask the synthesizer client to stop speaking.
# NOTE(review): presumably halts any playback in progress -- confirm in
# src/speech/text_to_speech.py.
az_speach_synthesizer_client.stop_speaking()

Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): yaksa punbo single gaminenye sir machinimche of course here's the description in the four requested languages french asterisk ville anime rich and histoire aren't vibrant cuisine delicious english
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): yaksa punbo single gaminenye sir machinimche of course here's the description in the four requested languages french asterisk ville anime rich and histoire aren't vibrant cuisine delicious english and
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): yaksa punbo single gaminenye sir machinimche of course here's the description in the four requested languages french asterisk ville anime rich and histoire aren't vibrant cuisine delicious english word what can i do
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): yaksa punbo single gaminenye sir machinimche of course here's the 

In [6]:
# Manual control cell: close the streaming recognizer's input stream. In the
# recorded run this was followed by a "Session stopped." log line and one last
# final transcript.
az_speech_recognizer_stream_client.close_stream()

Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): this is pretty cool umm so pretty much what we're gonna be doing is like on the device audio controls like start speaking text for example and stop speaking we need to just map that


2025-06-17 12:17:22,078 - micro - MainProcess - INFO     Session stopped. (speech_recognizer.py:_on_session_stopped:356)
INFO:micro:Session stopped.


🧾 User (final): This is pretty cool. Umm, so pretty much what we're going to be doing is like on the device audio controls, like start speaking text for example, and stop speaking. We need to just map that.


In [None]:
import os, time, threading

def on_final(text, lang):
    """Handle a final recognition result by queuing it for the conversation loop.

    The stripped text plus a trailing newline is appended to the module-level
    ``user_buffer``, which the main loop polls. Results that are ``None``,
    empty, or whitespace-only are not buffered, so they cannot trigger an LLM
    request with no content. The raw result is always printed.

    Args:
        text: Recognized text for the finished utterance.
        lang: Second callback argument (presumably the detected language);
            not used here.
    """
    global user_buffer
    cleaned = (text or "").strip()
    if cleaned:
        user_buffer += cleaned + "\n"
    print(f"🧾 User (final): {text}")

def assistant_speak(text):
    """Flag synthesis as active and pass ``text`` to the speech synthesizer.

    Prints two debug lines, sets the module-level ``is_synthesizing`` flag to
    True, then calls ``start_speaking_text`` on the synthesizer client.
    Returns None.

    Args:
        text: Text to be spoken.
    """
    global is_synthesizing

    synthesizer = az_speach_synthesizer_client
    print("Hi there, I am a assistant_speak callback!")
    is_synthesizing = True
    print("Syntetixing:", is_synthesizing)
    synthesizer.start_speaking_text(text)

def on_partial(text, lang):
    """Handle a partial (in-progress) recognition result.

    Prints debug lines and the partial transcript. If the module-level
    ``is_synthesizing`` flag is set, it is cleared; the call that would stop
    the synthesizer at that point is currently commented out.

    Args:
        text: Partial transcript recognized so far.
        lang: Second callback argument (presumably the detected language);
            not used here.
    """
    global is_synthesizing

    print("Hi there, I am a partial result callback!")
    print("Syntetixing:", is_synthesizing)
    if is_synthesizing:
        # The user is talking while the assistant is flagged as speaking.
        #az_speach_synthesizer_client.stop_speaking()
        is_synthesizing = False
    print(f"🗣️ User (partial): {text}")

# Register the transcript handlers before recognition begins.
az_speech_recognizer_stream_client.set_partial_result_callback(on_partial)
az_speech_recognizer_stream_client.set_final_result_callback(on_final)

# Start recognition
az_speech_recognizer_stream_client.start()
print("🎙️ Speak now...")

# Open a PyAudio input stream: 16 kHz, mono, 16-bit samples, delivered in
# 1024-frame buffers.
import pyaudio

RATE = 16000
CHANNELS = 1
CHUNK = 1024

audio = pyaudio.PyAudio()
stream = audio.open(
    format=pyaudio.paInt16,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)

def mic_loop(stop_event=None):
    """Pump microphone audio into the streaming recognizer.

    Repeatedly reads ``CHUNK`` frames from the module-level PyAudio ``stream``
    and passes the raw bytes to
    ``az_speech_recognizer_stream_client.write_bytes``.

    Args:
        stop_event: Optional ``threading.Event``-like object exposing
            ``is_set()``. When given, the loop returns once the event is set,
            which lets the capture thread be shut down cleanly. When omitted
            (the default) the loop never returns, so it should only be run on
            a daemon thread.
    """
    while stop_event is None or not stop_event.is_set():
        # exception_on_overflow=False: do not raise if the input buffer
        # overflowed between reads.
        data = stream.read(CHUNK, exception_on_overflow=False)
        az_speech_recognizer_stream_client.write_bytes(data)

# Feed microphone audio to the recognizer from a background daemon thread so
# this cell's main thread stays free to talk to the LLM.
threading.Thread(target=mic_loop, daemon=True).start()

# Running chat history. Both user and assistant turns are appended so the
# model keeps the context of the whole conversation.
messages = [{"role": "system", "content": "You are a helpful assistant."}]

user_buffer = ""  # Filled by the on_final STT callback

# str.endswith accepts a tuple of suffixes; build it once outside the loop.
sentence_ends = tuple(TTS_ENDS)

# Conversation loop: poll for recognized speech, stream the LLM reply, and send
# it to text-to-speech one sentence at a time. Runs until the cell is interrupted.
while True:
    user_text = ""
    if user_buffer:
        # Read and clear the buffer in a single statement to keep the window in
        # which a new final result could be lost as small as possible.
        pending, user_buffer = user_buffer, ""
        user_text = pending.strip()

    # Whitespace-only input is ignored instead of being sent to the model.
    if user_text:
        messages.append({"role": "user", "content": user_text})

        completion = client.chat.completions.create(
            stream=True,
            messages=messages,
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0,
            model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_ID"),
        )

        collected_messages = []  # chunks of the sentence currently being built
        reply_parts = []  # every chunk of the reply, kept for the chat history

        for chunk in completion:
            if not chunk.choices:
                continue
            chunk_text = getattr(chunk.choices[0].delta, "content", None)
            if not chunk_text:
                continue

            collected_messages.append(chunk_text)
            reply_parts.append(chunk_text)
            print(chunk_text, end="", flush=True)

            # A chunk may carry more than the marker itself (e.g. " world."),
            # so test how the chunk ends rather than whether it equals a marker.
            if chunk_text.rstrip(" \t").endswith(sentence_ends):
                sentence = "".join(collected_messages).strip()
                collected_messages.clear()
                if sentence:
                    assistant_speak(sentence)

        # Speak whatever is left when the reply did not end with a marker.
        remainder = "".join(collected_messages).strip()
        collected_messages.clear()
        if remainder:
            assistant_speak(remainder)

        print()  # finish line after streaming LLM response

        # Record the assistant turn so follow-up questions have context.
        reply_text = "".join(reply_parts)
        if reply_text:
            messages.append({"role": "assistant", "content": reply_text})

    time.sleep(0.1)

2025-06-17 12:14:09,024 - micro - MainProcess - INFO     Starting recognition from byte stream... (speech_recognizer.py:start:262)
INFO:micro:Starting recognition from byte stream...
2025-06-17 12:14:09,034 - micro - MainProcess - INFO     Recognition started. (speech_recognizer.py:start:320)
INFO:micro:Recognition started.


🎙️ Speak now...
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid uh in 10
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid in 10
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid in 10 wars
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid in 10 wars and
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): hi there can you describe madrid in 10 wars and please i
Hi there, I am a partial result callba

KeyboardInterrupt: 

Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): vibrant art
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): vibrant art delicious
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): vibrant art delicious cuisine
🧾 User (final): Vibrant art, Delicious cuisine.
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanish
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanish asterisco asterisco
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanish asterisco asterisco ciudad
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanish asterisco asterisco ciudad animada
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanish asterisco asterisco ciudad animada rica
Hi there, I am a partial result callback!
Syntetixing: False
🗣️ User (partial): spanis