6 changes: 5 additions & 1 deletion mod_openai_audio_stream.c
@@ -259,7 +259,9 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
if (switch_event_reserve_subclass(EVENT_JSON) != SWITCH_STATUS_SUCCESS ||
switch_event_reserve_subclass(EVENT_CONNECT) != SWITCH_STATUS_SUCCESS ||
switch_event_reserve_subclass(EVENT_ERROR) != SWITCH_STATUS_SUCCESS ||
switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS ||
switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STARTED) != SWITCH_STATUS_SUCCESS ||
switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STOPPED) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register an event subclass for mod_openai_audio_stream API.\n");
return SWITCH_STATUS_TERM;
}
@@ -285,6 +287,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_audio_stream_shutdown)
switch_event_free_subclass(EVENT_CONNECT);
switch_event_free_subclass(EVENT_DISCONNECT);
switch_event_free_subclass(EVENT_ERROR);
switch_event_free_subclass(EVENT_OPENAI_SPEECH_STARTED);
switch_event_free_subclass(EVENT_OPENAI_SPEECH_STOPPED);

return SWITCH_STATUS_SUCCESS;
}
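
The load/shutdown hunks above only register and free the new event subclasses; the responseHandler that actually emits them is not part of this diff. As a rough sketch (not the module's actual handler), a FreeSWITCH CUSTOM event with one of these subclasses would typically be fired like this; fire_speech_event and its arguments are illustrative names, and the snippet assumes the usual #include <switch.h> plus the module header:

static void fire_speech_event(switch_core_session_t *session, const char *subclass, const char *json)
{
    switch_event_t *event = NULL;

    /* subclass is one of the names registered above, e.g. EVENT_OPENAI_SPEECH_STARTED */
    if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, subclass) != SWITCH_STATUS_SUCCESS) {
        return;
    }

    /* copy channel data (Unique-ID etc.) so consumers can correlate the event with the call */
    switch_channel_event_set_data(switch_core_session_get_channel(session), event);
    switch_event_add_body(event, "%s", json);
    switch_event_fire(&event); /* fire consumes the event and NULLs the pointer */
}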
12 changes: 7 additions & 5 deletions mod_openai_audio_stream.h
@@ -10,11 +10,13 @@
#define MAX_SESSION_ID (256)
#define MAX_WS_URI (4096)

#define EVENT_CONNECT "mod_openai_audio_stream::connect"
#define EVENT_DISCONNECT "mod_openai_audio_stream::disconnect"
#define EVENT_ERROR "mod_openai_audio_stream::error"
#define EVENT_JSON "mod_openai_audio_stream::json"
#define EVENT_PLAY "mod_openai_audio_stream::play"
#define EVENT_CONNECT "mod_openai_audio_stream::connect"
#define EVENT_DISCONNECT "mod_openai_audio_stream::disconnect"
#define EVENT_ERROR "mod_openai_audio_stream::error"
#define EVENT_JSON "mod_openai_audio_stream::json"
#define EVENT_PLAY "mod_openai_audio_stream::play"
#define EVENT_OPENAI_SPEECH_STARTED "mod_openai_audio_stream::openai_speech_start"
#define EVENT_OPENAI_SPEECH_STOPPED "mod_openai_audio_stream::openai_speech_stop"

typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json);

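For context, other code receives these as CUSTOM events filtered by subclass name. A minimal consumer sketch using the standard switch_event_bind() API; the callback and binding-id names are illustrative and this consumer is not part of the PR:

static void on_openai_speech(switch_event_t *event)
{
    const char *uuid = switch_event_get_header(event, "Unique-ID");
    const char *body = switch_event_get_body(event);

    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "%s on %s: %s\n",
                      event->subclass_name, uuid ? uuid : "unknown", body ? body : "");
}

static switch_status_t bind_openai_speech_events(void)
{
    /* bind once per subclass; the same callback handles both start and stop */
    switch_event_bind("openai_speech_consumer", SWITCH_EVENT_CUSTOM,
                      EVENT_OPENAI_SPEECH_STARTED, on_openai_speech, NULL);
    switch_event_bind("openai_speech_consumer", SWITCH_EVENT_CUSTOM,
                      EVENT_OPENAI_SPEECH_STOPPED, on_openai_speech, NULL);
    return SWITCH_STATUS_SUCCESS;
}

The same events should also be visible from fs_cli with something like /event plain CUSTOM mod_openai_audio_stream::openai_speech_start, assuming the usual custom-event subscription syntax.
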
46 changes: 44 additions & 2 deletions openai_audio_streamer_glue.cpp
@@ -298,6 +298,7 @@ class AudioStreamer {
} else if(jsType && strcmp(jsType, "response.audio.delta") == 0) {
const char* jsonAudio = cJSON_GetObjectCstr(json, "delta");
playback_clear_requested = false;
m_response_audio_done = false;

if(jsonAudio && strlen(jsonAudio) > 0) {
std::string rawAudio;
@@ -336,7 +337,10 @@ class AudioStreamer {
} else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%s) processMessage - response.audio.delta no audio data\n", m_sessionId.c_str());
}
}
} else if(jsType && strcmp(jsType, "response.audio.done") == 0) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) processMessage - audio done\n", m_sessionId.c_str());
m_response_audio_done = true;
}
cJSON_Delete(json);
return status;
}
@@ -351,6 +355,7 @@
void push_audio_queue(const std::vector<int16_t>& audio_data) {
std::lock_guard<std::mutex> lock(m_audio_queue_mutex);
m_audio_queue.push(audio_data);
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) audio queue size: %zu\n", m_sessionId.c_str(), m_audio_queue.size());
}

std::vector<int16_t> pop_audio_queue() {
@@ -429,6 +434,33 @@ class AudioStreamer {
return playback_clear_requested;
}

bool is_openai_speaking() {
return m_openai_speaking;
}

bool is_response_audio_done() {
return m_response_audio_done;
}

void openai_speech_started() {
m_openai_speaking = true;
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai started speaking\n", m_sessionId.c_str());
const char *payload = "{\"status\":\"started\"}";
m_notify(psession, EVENT_OPENAI_SPEECH_STARTED, payload);
switch_core_session_rwunlock(psession);
}

void openai_speech_stopped() {
m_openai_speaking = false;
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai stopped speaking\n", m_sessionId.c_str());

const char *payload = "{\"status\":\"stopped\"}";
m_notify(psession, EVENT_OPENAI_SPEECH_STOPPED, payload);
switch_core_session_rwunlock(psession);
}


private:
std::string m_sessionId;
@@ -446,6 +478,8 @@ class AudioStreamer {
std::mutex m_audio_queue_mutex;
bool playback_clear_requested = false;
bool m_disable_audiofiles = false; // disable saving audio files if true
bool m_openai_speaking = false;
bool m_response_audio_done = false;
};
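
One thing openai_speech_started() and openai_speech_stopped() above assume is that switch_core_session_locate() always succeeds; if the session has already been destroyed it returns NULL, and the unconditional m_notify()/rwunlock calls would then operate on a null session. A defensive variant might look like the sketch below (illustrative only, not part of this change):

    void openai_speech_started() {
        m_openai_speaking = true;
        switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
        if (!psession) {
            // session already gone; nothing to notify and nothing to unlock
            return;
        }
        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(psession), SWITCH_LOG_DEBUG,
                          "(%s) OpenAI started speaking\n", m_sessionId.c_str());
        m_notify(psession, EVENT_OPENAI_SPEECH_STARTED, "{\"status\":\"started\"}");
        switch_core_session_rwunlock(psession);
    }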


@@ -983,11 +1017,16 @@ extern "C" {

if (as->clear_requested()) {
switch_buffer_zero(tech_pvt->playback_buffer);
inuse = 0;
Copilot AI, Oct 8, 2025:

Setting inuse = 0 after clearing the buffer could cause incorrect behavior. The inuse variable is meant to mirror the actual number of bytes in the playback buffer, so after switch_buffer_zero() it should be re-read from the buffer rather than hard-coded; otherwise this assignment can interfere with the subsequent logic that depends on the real buffer size.

Suggested change:
- inuse = 0;
+ inuse = switch_buffer_inuse(tech_pvt->playback_buffer);
}
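Applied in context, the reviewer's suggestion would make the clear path re-read the live byte count instead of pinning it to zero (a sketch of the suggested state, not the committed code):

    if (as->clear_requested()) {
        switch_buffer_zero(tech_pvt->playback_buffer);
        /* re-read the real number of bytes left in the buffer rather than hard-coding it */
        inuse = switch_buffer_inuse(tech_pvt->playback_buffer);
    }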
if (inuse < bytes_needed * 2 && !as->is_audio_queue_empty()) {
auto chunk = as->pop_audio_queue();
switch_buffer_write(tech_pvt->playback_buffer, chunk.data(), chunk.size() * sizeof(int16_t));
} else if (inuse == 0) {
// OpenAI has just finished speaking, due to either an interruption or the end of a response
if(as->is_openai_speaking() && as->is_response_audio_done()) {
Copilot AI, Oct 8, 2025:

[nitpick] The condition combines two state checks that could be confusing. Consider adding a comment explaining when both conditions are true simultaneously, or create a helper method like should_stop_speech() to encapsulate this logic.
as->openai_speech_stopped();
}
return SWITCH_TRUE;
}
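
Picking up the reviewer's second suggestion, a should_stop_speech() helper inside AudioStreamer could encapsulate the two flags; a minimal sketch (illustrative, not part of this diff):

    // True once OpenAI was speaking and the model has signalled response.audio.done,
    // i.e. the playback buffer drained because the response ended rather than
    // because more audio deltas are still on the way.
    bool should_stop_speech() {
        return m_openai_speaking && m_response_audio_done;
    }

The call site above would then read: if (as->should_stop_speech()) { as->openai_speech_stopped(); }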

@@ -999,6 +1038,10 @@ extern "C" {
switch_buffer_read(tech_pvt->playback_buffer, data, inuse);
}

if (!as->is_openai_speaking()) {
as->openai_speech_started();
}

frame->datalen = inuse > bytes_needed ? bytes_needed : inuse;
frame->samples = frame->datalen / bytes_per_sample;

@@ -1043,4 +1086,3 @@ extern "C" {
return SWITCH_STATUS_FALSE;
}
}