In [1]:
import logging
import os

# set the directory to the location of the script
try:
    os.chdir("../..")
    target_directory = os.getenv(
        "TARGET_DIRECTORY", os.getcwd()
    )  # Use environment variable if available
    if os.path.exists(target_directory):
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info(f"Successfully changed directory to: {os.getcwd()}")
    else:
        logging.error(f"Directory does not exist: {target_directory}")
except Exception as e:
    logging.exception(f"An error occurred while changing directory: {e}")

Changed directory to: c:\Users\pablosal\Desktop\gbb-ai-audio-agent


## Azure AI Speech (VAD)

In [None]:
import time, threading, logging, os
from typing import Optional, Iterator
import numpy as np, pyaudio, torch

from src.speech.text_to_speech import SpeechSynthesizer
from src.speech.speech_recognizer import StreamingSpeechRecognizerFromBytes
from openai import AzureOpenAI

logger = logging.getLogger("nb.voice")
if not logger.handlers:
    h = logging.StreamHandler()
    h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    logger.addHandler(h)
logger.setLevel(logging.INFO)

# ---------------- Silero VAD setup ----------------
try:
    from silero_vad import load_silero_vad, VADIterator
except Exception as e:
    raise RuntimeError("pip install silero-vad torch") from e

def int2float(x: np.ndarray) -> np.ndarray:
    y = x.astype("float32")
    y *= 1.0 / 32768.0
    return y

class PCMFramer:
    def __init__(self, frame_bytes: int = 640) -> None:  # 20ms @ 16k mono PCM16
        self.buf = bytearray()
        self.n = frame_bytes
    def feed(self, chunk: bytes) -> Iterator[bytes]:
        if not chunk:
            return iter(())
        self.buf.extend(chunk)
        out = []
        while len(self.buf) >= self.n:
            out.append(bytes(self.buf[: self.n]))
            del self.buf[: self.n]
        return iter(out)

# ---------------- Globals for re-entrant notebook runs ----------------
STATE = {
    "stop": threading.Event(),
    "mic_thread": None,
    "audio": None,
    "stream": None,
    "recognizer": None,
    "tts": None,
    "client": None,
    "vad": None,
    "framer": PCMFramer(1024),  # keep CHUNK=1024 for notebook compatibility
    "user_buf": "",
    "is_synth": False,
    "vad_started": False,
    "vad_trigs": 0,
    "barge_latch": False,
    "last_barge_ms": 0,
    "pending_tts": [],
}

# ---- tuning toggles ----
VAD_START_N = 2                 # need 2×20ms frames to confirm start
VAD_THRESH = 1                 # raise toward 0.7 if noisy; lower toward 0.58 if too slow
VAD_END_SIL_MS = 350           # min silence for end-of-speech
PARTIAL_FALLBACK = False        # set True after you confirm VAD is solid
BARGE_DEBOUNCE_MS = 250         # don’t spam stop_speaking()
COOLDOWN_AFTER_BARGE_MS = 350   # don’t feed STT for this long after barge-in (absorbs TTS tail)
TTS_ENDS = {".", "!", "?", ";", "\n"}

# ---------------- helpers ----------------
def _safe_stop_tts():
    now = int(time.time() * 1000)
    if now - STATE["last_barge_ms"] < BARGE_DEBOUNCE_MS:
        return
    STATE["last_barge_ms"] = now
    if STATE["is_synth"]:
        try:
            STATE["tts"].stop_speaking()
            logger.info("TTS stopped (barge-in)")
        except Exception as e:
            logger.warning("stop_speaking failed: %s", e)
        STATE["is_synth"] = False
    STATE["barge_latch"] = True
    # start a short cooldown window where we mute mic->STT
    STATE["cooldown_until_ms"] = now + COOLDOWN_AFTER_BARGE_MS

def _on_partial(text: str, lang: str):
    if PARTIAL_FALLBACK and not STATE["barge_latch"] and STATE["is_synth"]:
        _safe_stop_tts()
        logger.info("Barge-in via STT partial (fallback)")
    # optional: print partials
    # logger.debug("partial[%s]: %s", lang, text)

def _on_final(text: str, lang: str):
    STATE["user_buf"] += text.strip() + "\n"
    logger.info("final[%s]: %s", lang, text)

def _assistant_speak(text: str):
    if not text:
        return
    # half-duplex: if user talking or we just barged-in, queue this
    if STATE["vad_started"] or STATE["barge_latch"]:
        STATE["pending_tts"].append(text)
        logger.info("queued TTS (%d chars)", len(text))
        return
    STATE["is_synth"] = True
    STATE["tts"].start_speaking_text(text)
    logger.info("speaking now: %.60r", text)

def _flush_tts_queue_if_idle():
    if STATE["vad_started"] or STATE["barge_latch"] or STATE["is_synth"]:
        return
    while STATE["pending_tts"]:
        text = STATE["pending_tts"].pop(0)
        STATE["is_synth"] = True
        STATE["tts"].start_speaking_text(text)
        logger.info("flushing queued: %.60r", text)

# ---------------- mic thread ----------------
def _mic_loop():
    RATE, CHANNELS, CHUNK = 16000, 1, 1024  # keep your CHUNK
    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=CHANNELS, rate=RATE,
                        input=True, frames_per_buffer=CHUNK)
    STATE["audio"], STATE["stream"] = audio, stream

    framer = STATE["framer"]
    vad = STATE["vad"]
    cooldown_until = 0

    while not STATE["stop"].is_set():
        try:
            data = stream.read(CHUNK, exception_on_overflow=False)

            now = int(time.time() * 1000)
            push_ok = True

            # Half-duplex: if TTS is active and we haven't barged-in, mute STT feed
            if STATE["is_synth"] and not STATE["barge_latch"] and not STATE["vad_started"]:
                push_ok = False

            # Short cooldown after barge-in to absorb TTS tail
            if "cooldown_until_ms" in STATE and now < STATE["cooldown_until_ms"]:
                push_ok = False

            if push_ok and STATE["recognizer"] is not None:
                STATE["recognizer"].write_bytes(data)

            # VAD on 20ms frames
            for f in framer.feed(data):
                x = torch.from_numpy(int2float(np.frombuffer(f, dtype=np.int16)))
                seg = vad(x)  # updates .triggered internally

                if getattr(vad, "triggered", False) and not STATE["vad_started"]:
                    STATE["vad_trigs"] += 1
                    if STATE["vad_trigs"] >= VAD_START_N and not STATE["barge_latch"]:
                        STATE["vad_started"] = True
                        STATE["vad_trigs"] = 0
                        _safe_stop_tts()
                        logger.info("VAD start")
                elif not getattr(vad, "triggered", False):
                    STATE["vad_trigs"] = 0

                # END: when iterator returns a chunk, that marks end-of-speech
                if seg is not None and STATE["vad_started"]:
                    STATE["vad_started"] = False
                    STATE["barge_latch"] = False   # allow TTS again
                    logger.info("VAD end")
                    _flush_tts_queue_if_idle()

        except Exception as e:
            logger.warning("mic loop err: %s", e)
            time.sleep(0.01)

    # cleanup
    try:
        stream.stop_stream(); stream.close(); audio.terminate()
    except Exception:
        pass
    STATE["audio"], STATE["stream"] = None, None

# ---------------- public API for the notebook ----------------
def start_pipeline():
    """Idempotent start; safe to re-run in Jupyter."""
    # stop prior run if alive
    if STATE["mic_thread"] and STATE["mic_thread"].is_alive():
        stop_pipeline()

    # Azure clients
    recognizer = StreamingSpeechRecognizerFromBytes(
        use_semantic_segmentation=False,
        vad_silence_timeout_ms=800,
        audio_format="pcm",
        candidate_languages=["en-US", "fr-FR", "de-DE", "es-ES", "it-IT"],
    )
    recognizer.set_partial_result_callback(_on_partial)
    recognizer.set_final_result_callback(_on_final)
    recognizer.start()
    STATE["recognizer"] = recognizer

    tts = SpeechSynthesizer(voice="en-US-Ava:DragonHDLatestNeural", playback="always")
    STATE["tts"] = tts

    # Silero VAD
    model = load_silero_vad()
    STATE["vad"] = VADIterator(
        model, threshold=VAD_THRESH, sampling_rate=16000,
        min_silence_duration_ms=VAD_END_SIL_MS, speech_pad_ms=120
    )

    # reset state latches
    STATE["stop"].clear()
    STATE["user_buf"] = ""
    STATE["is_synth"] = False
    STATE["vad_started"] = False
    STATE["vad_trigs"] = 0
    STATE["barge_latch"] = False
    STATE["last_barge_ms"] = 0
    STATE["pending_tts"].clear()
    STATE["cooldown_until_ms"] = 0

    # start mic thread
    th = threading.Thread(target=_mic_loop, daemon=True)
    STATE["mic_thread"] = th
    th.start()
    logger.info("🎙️ Pipeline started. Speak now...")

def stop_pipeline():
    """Stop mic thread, STT, and TTS cleanly."""
    try:
        STATE["stop"].set()
        if STATE["mic_thread"]:
            STATE["mic_thread"].join(timeout=1.5)
    except Exception:
        pass
    try:
        if STATE["recognizer"]:
            STATE["recognizer"].stop()
    except Exception:
        pass
    try:
        if STATE["tts"]:
            STATE["tts"].stop_speaking()
    except Exception:
        pass
    STATE["mic_thread"] = None
    logger.info("Pipeline stopped.")

# ---------------- LLM loop example (kept simple) ----------------
def llm_step_and_tts(client: AzureOpenAI, messages: list[dict]):
    """Call AOAI streaming, speak sentence-by-sentence with half-duplex rules."""
    collected = []
    for chunk in client.chat.completions.create(
        stream=True,
        messages=messages,
        max_tokens=4096,
        temperature=1.0,
        top_p=1.0,
        model=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_ID"),
    ):
        if chunk.choices and hasattr(chunk.choices[0].delta, "content"):
            piece = chunk.choices[0].delta.content
            if not piece:
                continue
            collected.append(piece)
            if piece in TTS_ENDS:
                _assistant_speak("".join(collected).strip())
                collected.clear()
    if collected:
        _assistant_speak("".join(collected).strip())


: 

In [6]:
client = AzureOpenAI(
    api_version="2025-02-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
)
messages = [{"role": "system", "content": "You are a helpful assistant."}]

start_pipeline()
try:
    while True:
        if STATE["user_buf"]:
            messages.append({"role": "user", "content": STATE["user_buf"]})
            STATE["user_buf"] = ""
            llm_step_and_tts(client, messages)
        time.sleep(0.05)
except KeyboardInterrupt:
    pass
finally:
    stop_pipeline()


[2025-08-11 16:57:05,033] INFO - micro: Azure Monitor tracing initialized for speech recognizer
INFO:micro:Azure Monitor tracing initialized for speech recognizer


[2025-08-11 16:57:05,049] INFO - micro: Creating SpeechConfig with API key authentication
INFO:micro:Creating SpeechConfig with API key authentication
[2025-08-11 16:57:05,064] INFO - micro: Starting recognition from byte stream…
INFO:micro:Starting recognition from byte stream…
[2025-08-11 16:57:05,083] INFO - micro: Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
INFO:micro:Speech-SDK prepare_start – format=pcm  neuralFE=False  diar=True
[2025-08-11 16:57:05,103] INFO - micro: Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
INFO:micro:Speech-SDK ready (neuralFE=False, diarisation=True, speakers=2)
[2025-08-11 16:57:05,124] INFO - micro: Recognition started.
INFO:micro:Recognition started.
[2025-08-11 16:57:05,146] INFO - micro: Azure Monitor tracing initialized for speech synthesizer
INFO:micro:Azure Monitor tracing initialized for speech synthesizer
[2025-08-11 16:57:05,164] INFO - micro: Creating SpeechConfig with API key authentication
INFO:micro:C