Skip to content
24 changes: 19 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,19 +131,20 @@ Defaults to `false`, which enforces hostname match with the peer certificate.
The freeswitch module exposes the following API commands:

```
uuid_openai_audio_stream <uuid> start <wss-url> <mix-type> <sampling-rate>
uuid_openai_audio_stream <uuid> start <wss-url> <mix-type> [<sampling-rate>] [mute_user]
```
Attaches a media bug and starts streaming audio (in L16 format) to the websocket server. FS default is 8k. If sampling-rate is other than 8k it will be resampled.
Attaches a media bug and starts streaming audio (in L16 format) to the websocket server. FS default is 8k. If sampling-rate is other than 8k it will be resampled. Passing `mute_user` delays forwarding caller audio to the Realtime API until you explicitly unmute.
- `uuid` - Freeswitch channel unique id
- `wss-url` - websocket url `ws://` or `wss://`
- `mix-type` - choice of
- "mono" - single channel containing caller's audio
- "mixed" - single channel containing both caller and callee audio
- "stereo" - two channels with caller audio in one and callee audio in the other.
- `sampling-rate` - choice of
- `sampling-rate` - optional, choice of
- "8k" = 8000 Hz sample rate will be generated
- "16k" = 16000 Hz sample rate will be generated
- "24k" = 24000 Hz sample rate will be generated
- `mute_user` - optional flag. When present, the module initialises muted and ignores caller audio until an explicit `unmute`.
- **IMPORTANT NOTE**: The OpenAI Realtime API, when using PCM audio format, expects the audio to be at a 24 kHz sample rate. Use the sampling-rate parameter as `24k` (or `24000`) and mono to ensure that the audio is sent in the correct format. From the OpenAI Realtime API documentation: *input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order.* Support for exchanging audio with OpenAI in other formats may be developed in the future, which would make the `<sampling-rate>` and `<mix-type>` parameters useful for controlling the output format dynamically.

```
Expand All @@ -158,12 +159,25 @@ uuid_openai_audio_stream <uuid> stop
```
uuid_openai_audio_stream <uuid> pause
```
Pauses audio stream
Pauses audio streaming in both directions. Caller audio stops flowing to OpenAI and any OpenAI playback currently buffering into the channel is halted until `resume`.

```
uuid_openai_audio_stream <uuid> resume
```
Resumes audio stream
Resumes audio streaming in both directions after a `pause`.

```
uuid_openai_audio_stream <uuid> mute [user | openai | all]
```
Keeps the media bug alive while silencing the selected leg. Defaults to `user` when omitted.
- `user`: block caller audio being sent to OpenAI.
- `openai`: block OpenAI playback from reaching the channel.
- `all`: apply both mute operations at once.

```
uuid_openai_audio_stream <uuid> unmute [user | openai | all]
```
Re-enables the selected audio leg after a corresponding `mute`. Defaults to `user` when omitted.

## Events
Module will generate the following event types:
Expand Down
203 changes: 160 additions & 43 deletions mod_openai_audio_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data,
static switch_status_t start_capture(switch_core_session_t *session,
switch_media_bug_flag_t flags,
char* wsUri,
int sampling)
int sampling,
switch_bool_t start_muted)
{
switch_channel_t *channel = switch_core_session_get_channel(session);
switch_media_bug_t *bug;
Expand All @@ -87,7 +88,7 @@ static switch_status_t start_capture(switch_core_session_t *session,

switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "calling stream_session_init.\n");
if (SWITCH_STATUS_FALSE == stream_session_init(session, responseHandler, read_codec->implementation->actual_samples_per_second,
wsUri, sampling, channels, &pUserData)) {
wsUri, sampling, channels, start_muted, &pUserData)) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mod_openai_audio_stream session.\n");
return SWITCH_STATUS_FALSE;
}
Expand Down Expand Up @@ -127,6 +128,31 @@ static switch_status_t do_pauseresume(switch_core_session_t *session, int pause)
return status;
}

/*
 * Mute or unmute one or both audio legs of the stream attached to `session`.
 *
 * session - session the request applies to; also used as the log target.
 * target  - "user", "openai", "all" or "both" (case-insensitive). A NULL or
 *           empty string is treated as "user".
 * mute    - non-zero to mute, zero to unmute.
 *
 * Returns the status reported by the stream_session_set_*_mute() helper for a
 * single leg. For "all"/"both" it returns SWITCH_STATUS_SUCCESS only when both
 * helpers succeed. An unrecognised target is logged and yields
 * SWITCH_STATUS_FALSE.
 */
static switch_status_t do_audio_mute(switch_core_session_t *session, const char *target, int mute)
{
    const char *leg = (target != NULL && *target != '\0') ? target : "user";
    const char *action = mute ? "mute" : "unmute";

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO,
                      "mod_openai_audio_stream: %s %s audio\n", action, leg);

    if (strcasecmp(leg, "user") == 0) {
        return stream_session_set_user_mute(session, mute);
    }

    if (strcasecmp(leg, "openai") == 0) {
        return stream_session_set_openai_mute(session, mute);
    }

    if (strcasecmp(leg, "all") == 0 || strcasecmp(leg, "both") == 0) {
        /* Always attempt both legs, user first, even if the first call fails. */
        switch_status_t user_rc = stream_session_set_user_mute(session, mute);
        switch_status_t openai_rc = stream_session_set_openai_mute(session, mute);

        if (user_rc != SWITCH_STATUS_SUCCESS || openai_rc != SWITCH_STATUS_SUCCESS) {
            return SWITCH_STATUS_FALSE;
        }
        return SWITCH_STATUS_SUCCESS;
    }

    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                      "mod_openai_audio_stream: invalid mute target '%s', expected user|openai|all\n", leg);
    return SWITCH_STATUS_FALSE;
}

static switch_status_t send_json(switch_core_session_t *session, char* json) {
switch_status_t status = SWITCH_STATUS_FALSE;
switch_channel_t *channel = switch_core_session_get_channel(session);
Expand All @@ -141,10 +167,60 @@ static switch_status_t send_json(switch_core_session_t *session, char* json) {
return status;
}

#define STREAM_API_SYNTAX "<uuid> [start | stop | send_json | pause | resume | graceful-shutdown ] [wss-url | path] [mono | mixed | stereo] [8000 | 16000 | 24000]"
/*
 * Multi-line usage text for the uuid_openai_audio_stream API command.
 * stream_function() writes it to the API output stream when the command
 * line is rejected (too few arguments overall, or too few for `start`).
 */
#define STREAM_API_SYNTAX \
"USAGE:\n" \
"--------------------------------------------------------------------------------\n" \
"uuid_openai_audio_stream <uuid> [start | stop | send_json | pause | resume |\n" \
" mute | unmute]\n" \
" [wss-url | path | user | openai | all | base64json]\n" \
" [mono | mixed | stereo]\n" \
" [8000 | 16000 | 24000]\n" \
" [mute_user]\n" \
"--------------------------------------------------------------------------------\n"

typedef enum {
STREAM_CMD_UNKNOWN,
STREAM_CMD_START,
STREAM_CMD_STOP,
STREAM_CMD_SEND_JSON,
STREAM_CMD_PAUSE,
STREAM_CMD_RESUME,
STREAM_CMD_MUTE,
STREAM_CMD_UNMUTE
} stream_command_t;

/*
 * Translate a command word into its stream_command_t value.
 *
 * name - command word to look up; matching is case-insensitive.
 *
 * Returns the matching STREAM_CMD_* constant, or STREAM_CMD_UNKNOWN when
 * `name` fails the zstr() check or matches none of the known keywords.
 */
static stream_command_t stream_command_from_string(const char *name)
{
    static const struct {
        const char *keyword;
        stream_command_t command;
    } known_commands[] = {
        { "start",     STREAM_CMD_START },
        { "stop",      STREAM_CMD_STOP },
        { "send_json", STREAM_CMD_SEND_JSON },
        { "pause",     STREAM_CMD_PAUSE },
        { "resume",    STREAM_CMD_RESUME },
        { "mute",      STREAM_CMD_MUTE },
        { "unmute",    STREAM_CMD_UNMUTE },
    };
    size_t idx;

    if (zstr(name)) {
        return STREAM_CMD_UNKNOWN;
    }

    for (idx = 0; idx < sizeof(known_commands) / sizeof(known_commands[0]); idx++) {
        if (strcasecmp(name, known_commands[idx].keyword) == 0) {
            return known_commands[idx].command;
        }
    }

    return STREAM_CMD_UNKNOWN;
}

SWITCH_STANDARD_API(stream_function)
{
char *mycmd = NULL, *argv[6] = { 0 };
char *mycmd = NULL, *argv[8] = { 0 };
int argc = 0;

switch_status_t status = SWITCH_STATUS_FALSE;
Expand All @@ -154,40 +230,55 @@ SWITCH_STANDARD_API(stream_function)
}
assert(cmd);

if (zstr(cmd) || argc < 2 || (0 == strcmp(argv[1], "start") && argc < 4)) {
if (zstr(cmd) || argc < 2) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd);
stream->write_function(stream, "-USAGE: %s\n", STREAM_API_SYNTAX);
stream->write_function(stream, "%s\n", STREAM_API_SYNTAX);
goto done;
} else {
if (strcasecmp(argv[1], "send_json")) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "mod_openai_audio_stream cmd: %s\n", cmd ? cmd : "");
}
switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
if (!strcasecmp(argv[1], "stop")) {
if(argc > 2 && (is_valid_utf8(argv[2]) != SWITCH_STATUS_SUCCESS)) {
}

stream_command_t command = stream_command_from_string(argv[1]);

if (command != STREAM_CMD_SEND_JSON) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "mod_openai_audio_stream cmd: %s\n", cmd ? cmd : "");
}

switch_core_session_t *lsession = NULL;
if ((lsession = switch_core_session_locate(argv[0]))) {
switch (command) {
case STREAM_CMD_STOP:
if (argc > 2 && (is_valid_utf8(argv[2]) != SWITCH_STATUS_SUCCESS)) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"%s contains invalid utf8 characters\n", argv[2]);
switch_core_session_rwunlock(lsession);
goto done;
goto release_session;
}
status = do_stop(lsession, argc > 2 ? argv[2] : NULL);
} else if (!strcasecmp(argv[1], "pause")) {
break;
case STREAM_CMD_PAUSE:
status = do_pauseresume(lsession, 1);
} else if (!strcasecmp(argv[1], "resume")) {
break;
case STREAM_CMD_RESUME:
status = do_pauseresume(lsession, 0);
} else if (!strcasecmp(argv[1], "send_json")) {
break;
case STREAM_CMD_SEND_JSON:
if (argc < 3) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"send_json requires an argument specifying json to send\n");
switch_core_session_rwunlock(lsession);
goto done;
goto release_session;
}
status = send_json(lsession, argv[2]);
} else if (!strcasecmp(argv[1], "start")) {
//switch_channel_t *channel = switch_core_session_get_channel(lsession);
break;
case STREAM_CMD_START:
{
if (argc < 4) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"Error with command %s.\n", cmd);
stream->write_function(stream, "%s\n", STREAM_API_SYNTAX);
goto release_session;
}
char wsUri[MAX_WS_URI];
int sampling = 8000;
const char *sampling_str = NULL;
switch_bool_t start_muted = SWITCH_FALSE;
switch_media_bug_flag_t flags = SMBF_READ_STREAM;
flags |= SMBF_WRITE_REPLACE;
if (0 == strcmp(argv[3], "mixed")) {
Expand All @@ -198,41 +289,65 @@ SWITCH_STANDARD_API(stream_function)
} else if (0 != strcmp(argv[3], "mono")) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"invalid mix type: %s, must be mono, mixed, or stereo\n", argv[3]);
switch_core_session_rwunlock(lsession);
goto done;
goto release_session;
}
if (argc > 4) {
if (0 == strcmp(argv[4], "16k")) {
sampling = 16000;
} else if (0 == strcmp(argv[4], "8k")) {
sampling = 8000;
} else if (0 == strcmp(argv[4], "24k")) {
sampling = 24000;
int next_index = 4;
if (!strcasecmp(argv[next_index], "mute_user")) {
start_muted = SWITCH_TRUE;
} else {
sampling = atoi(argv[4]);
sampling_str = argv[next_index];
if (0 == strcmp(sampling_str, "16k")) {
sampling = 16000;
} else if (0 == strcmp(sampling_str, "8k")) {
sampling = 8000;
} else if (0 == strcmp(sampling_str, "24k")) {
sampling = 24000;
} else {
sampling = atoi(sampling_str);
}
next_index++;
if (argc > next_index && !strcasecmp(argv[next_index], "mute_user")) {
start_muted = SWITCH_TRUE;
}
}
}

if (!validate_ws_uri(argv[2], &wsUri[0])) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"invalid websocket uri: %s\n", argv[2]);
} else if (sampling % 8000 != 0) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"invalid sample rate: %s\n", argv[4]);
if (sampling_str) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"invalid sample rate: %s\n", sampling_str);
} else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"invalid sample rate: %d\n", sampling);
}
} else {
status = start_capture(lsession, flags, wsUri, sampling);
status = start_capture(lsession, flags, wsUri, sampling, start_muted);
}
} else {
break;
}
case STREAM_CMD_MUTE:
case STREAM_CMD_UNMUTE:
{
const char *target = (argc > 2) ? argv[2] : "user";
status = do_audio_mute(lsession, target, command == STREAM_CMD_MUTE ? 1 : 0);
break;
}
case STREAM_CMD_UNKNOWN:
default:
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"unsupported mod_openai_audio_stream cmd: %s\n", argv[1]);
}


switch_core_session_rwunlock(lsession);
} else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error locating session %s\n",
argv[0]);
break;
}

release_session:
switch_core_session_rwunlock(lsession);
} else {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error locating session %s\n",
argv[0]);
}

if (status == SWITCH_STATUS_SUCCESS) {
Expand Down Expand Up @@ -270,6 +385,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid stop");
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid pause");
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid resume");
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid mute");
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid unmute");
switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid send_json");

switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_openai_audio_stream API successfully loaded\n");
Expand Down
3 changes: 2 additions & 1 deletion mod_openai_audio_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,14 @@ struct private_data {
int sampling;
int channels;
int audio_paused:1;
int user_audio_muted:1;
int openai_audio_muted:1;
int close_requested:1;
RingBuffer *buffer;
switch_buffer_t *sbuffer;
uint8_t *data;
int rtp_packets;
switch_buffer_t *playback_buffer;
switch_mutex_t *playback_mutex;
};

typedef struct private_data private_t;
Expand Down
Loading