From 57e83249f85e153ad1bf726d37156c2dd47400ea Mon Sep 17 00:00:00 2001
From: Dario Pellegrino
Date: Wed, 8 Oct 2025 11:58:07 +0200
Subject: [PATCH] add openai speech events

Signed-off-by: Dario Pellegrino
---
 mod_openai_audio_stream.c      |  6 +++-
 mod_openai_audio_stream.h      | 12 ++++---
 openai_audio_streamer_glue.cpp | 57 ++++++++++++++++++++++++++++++++--
 3 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/mod_openai_audio_stream.c b/mod_openai_audio_stream.c
index fd6a246..da6dd02 100644
--- a/mod_openai_audio_stream.c
+++ b/mod_openai_audio_stream.c
@@ -259,7 +259,9 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
     if (switch_event_reserve_subclass(EVENT_JSON) != SWITCH_STATUS_SUCCESS ||
         switch_event_reserve_subclass(EVENT_CONNECT) != SWITCH_STATUS_SUCCESS ||
         switch_event_reserve_subclass(EVENT_ERROR) != SWITCH_STATUS_SUCCESS ||
-        switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
+        switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS ||
+        switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STARTED) != SWITCH_STATUS_SUCCESS ||
+        switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STOPPED) != SWITCH_STATUS_SUCCESS) {
         switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register an event subclass for mod_openai_audio_stream API.\n");
         return SWITCH_STATUS_TERM;
     }
@@ -285,6 +287,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_audio_stream_shutdown)
     switch_event_free_subclass(EVENT_CONNECT);
     switch_event_free_subclass(EVENT_DISCONNECT);
     switch_event_free_subclass(EVENT_ERROR);
+    switch_event_free_subclass(EVENT_OPENAI_SPEECH_STARTED);
+    switch_event_free_subclass(EVENT_OPENAI_SPEECH_STOPPED);
 
     return SWITCH_STATUS_SUCCESS;
 }
diff --git a/mod_openai_audio_stream.h b/mod_openai_audio_stream.h
index e3b32ce..333e5af 100644
--- a/mod_openai_audio_stream.h
+++ b/mod_openai_audio_stream.h
@@ -10,11 +10,13 @@
 #define MAX_SESSION_ID (256)
 #define MAX_WS_URI (4096)
 
-#define EVENT_CONNECT           "mod_openai_audio_stream::connect"
-#define EVENT_DISCONNECT        "mod_openai_audio_stream::disconnect"
-#define EVENT_ERROR             "mod_openai_audio_stream::error"
-#define EVENT_JSON              "mod_openai_audio_stream::json"
-#define EVENT_PLAY              "mod_openai_audio_stream::play"
+#define EVENT_CONNECT               "mod_openai_audio_stream::connect"
+#define EVENT_DISCONNECT            "mod_openai_audio_stream::disconnect"
+#define EVENT_ERROR                 "mod_openai_audio_stream::error"
+#define EVENT_JSON                  "mod_openai_audio_stream::json"
+#define EVENT_PLAY                  "mod_openai_audio_stream::play"
+#define EVENT_OPENAI_SPEECH_STARTED "mod_openai_audio_stream::openai_speech_start"
+#define EVENT_OPENAI_SPEECH_STOPPED "mod_openai_audio_stream::openai_speech_stop"
 
 typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json);
 
diff --git a/openai_audio_streamer_glue.cpp b/openai_audio_streamer_glue.cpp
index 490f11c..8465b66 100644
--- a/openai_audio_streamer_glue.cpp
+++ b/openai_audio_streamer_glue.cpp
@@ -298,6 +298,7 @@ class AudioStreamer {
             } else if(jsType && strcmp(jsType, "response.audio.delta") == 0) {
                 const char* jsonAudio = cJSON_GetObjectCstr(json, "delta");
                 playback_clear_requested = false;
+                m_response_audio_done = false;
 
                 if(jsonAudio && strlen(jsonAudio) > 0) {
                     std::string rawAudio;
@@ -336,7 +337,10 @@ class AudioStreamer {
                 } else {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%s) processMessage - response.audio.delta no audio data\n", m_sessionId.c_str());
                 }
-            }
+            } else if(jsType && strcmp(jsType, "response.audio.done") == 0) {
+                switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) processMessage - audio done\n", m_sessionId.c_str());
+                m_response_audio_done = true;
+            }
             cJSON_Delete(json);
             return status;
         }
@@ -351,6 +355,7 @@ class AudioStreamer {
     void push_audio_queue(const std::vector<int16_t>& audio_data) {
         std::lock_guard<std::mutex> lock(m_audio_queue_mutex);
         m_audio_queue.push(audio_data);
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) audio queue size: %zu\n", m_sessionId.c_str(), m_audio_queue.size());
     }
 
     std::vector<int16_t> pop_audio_queue() {
@@ -429,6 +434,44 @@ class AudioStreamer {
         return playback_clear_requested;
     }
 
+    // True between openai_speech_started() and openai_speech_stopped().
+    bool is_openai_speaking() {
+        return m_openai_speaking;
+    }
+
+    // True once "response.audio.done" arrived; reset by the next "response.audio.delta".
+    bool is_response_audio_done() {
+        return m_response_audio_done;
+    }
+
+    // Flags OpenAI as speaking and emits EVENT_OPENAI_SPEECH_STARTED.
+    void openai_speech_started() {
+        m_openai_speaking = true;
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai started speaking\n", m_sessionId.c_str());
+        notify_speech_event(EVENT_OPENAI_SPEECH_STARTED, "{\"status\":\"started\"}");
+    }
+
+    // Flags OpenAI as silent and emits EVENT_OPENAI_SPEECH_STOPPED.
+    void openai_speech_stopped() {
+        m_openai_speaking = false;
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai stopped speaking\n", m_sessionId.c_str());
+        notify_speech_event(EVENT_OPENAI_SPEECH_STOPPED, "{\"status\":\"stopped\"}");
+    }
+
 private:
+    // Sends one speech event to the owning session via m_notify.
+    // switch_core_session_locate() yields NULL when the session is gone and
+    // read-locks only a session it found, so notify and rwunlock are skipped
+    // in that case instead of being called on a NULL session.
+    void notify_speech_event(const char* eventName, const char* payload) {
+        switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
+        if (!psession) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "(%s) session not found, dropping %s\n", m_sessionId.c_str(), eventName);
+            return;
+        }
+        m_notify(psession, eventName, payload);
+        switch_core_session_rwunlock(psession);
+    }
+
     std::string m_sessionId;
 
@@ -446,6 +489,8 @@ class AudioStreamer {
     std::mutex m_audio_queue_mutex;
     bool playback_clear_requested = false;
     bool m_disable_audiofiles = false; // disable saving audio files if true
+    bool m_openai_speaking = false;
+    bool m_response_audio_done = false;
 };
 
 
@@ -983,11 +1028,16 @@ extern "C" {
 
         if (as->clear_requested()) {
             switch_buffer_zero(tech_pvt->playback_buffer);
+            inuse = 0;
         }
         if (inuse < bytes_needed * 2 && !as->is_audio_queue_empty()) {
             auto chunk = as->pop_audio_queue();
             switch_buffer_write(tech_pvt->playback_buffer, chunk.data(), chunk.size() * sizeof(int16_t));
         } else if (inuse == 0) {
+            // Openai just finished speaking for interruption or end of response
+            if(as->is_openai_speaking() && as->is_response_audio_done()) {
+                as->openai_speech_stopped();
+            }
             return SWITCH_TRUE;
         }
 
@@ -999,6 +1049,10 @@ extern "C" {
             switch_buffer_read(tech_pvt->playback_buffer, data, inuse);
         }
 
+        if (!as->is_openai_speaking()) {
+            as->openai_speech_started();
+        }
+
         frame->datalen = inuse > bytes_needed ? bytes_needed : inuse;
         frame->samples = frame->datalen / bytes_per_sample;
 
@@ -1043,4 +1097,3 @@ extern "C" {
         return SWITCH_STATUS_FALSE;
     }
 }
-