VoiSmart · dariopellegrino00 · Nov 3, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/README.md b/README.md
@@ -131,19 +131,20 @@ Defaults to `false`, which enforces hostname match with the peer certificate.
 The freeswitch module exposes the following API commands:
 
 ```
-uuid_openai_audio_stream <uuid> start <wss-url> <mix-type> <sampling-rate> 
+uuid_openai_audio_stream <uuid> start <wss-url> <mix-type> [<sampling-rate>] [mute_user]
 ```
-Attaches a media bug and starts streaming audio (in L16 format) to the websocket server. FS default is 8k. If sampling-rate is other than 8k it will be resampled.
+Attaches a media bug and starts streaming audio (in L16 format) to the websocket server. FS default is 8k. If sampling-rate is other than 8k it will be resampled. Passing `mute_user` delays forwarding caller audio to the Realtime API until you explicitly unmute.
 - `uuid` - Freeswitch channel unique id
 - `wss-url` - websocket url `ws://` or `wss://`
 - `mix-type` - choice of 
   - "mono" - single channel containing caller's audio
   - "mixed" - single channel containing both caller and callee audio
   - "stereo" - two channels with caller audio in one and callee audio in the other.
-- `sampling-rate` - choice of
+- `sampling-rate` - optional, choice of
   - "8k" = 8000 Hz sample rate will be generated
   - "16k" = 16000 Hz sample rate will be generated
   - "24k" = 24000 Hz sample rate will be generated
+- `mute_user` - optional flag. When present, the module initialises muted and ignores caller audio until an explicit `unmute`.
 - **IMPORTANT NOTE**: The OpenAI Realtime API, when using PCM audio format, expects the audio to be in 24 kHz sample rate. Use the sampling-rate parameter as `24k` (or `24000`) and mono to ensure that the audio is sent in the correct format. From the OpenAI Realtime API documentation: *input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order.* Support for exchanging audio with OpenAI in other formats may be developed in the future, which would make the `<sampling-rate>` and `<mix-type>` parameters useful for controlling the output format dynamically.
 
 ```
@@ -158,12 +159,25 @@ uuid_openai_audio_stream <uuid> stop
 ```
 uuid_openai_audio_stream <uuid> pause
 ```
-Pauses audio stream
+Pauses audio streaming in both directions. Caller audio stops flowing to OpenAI and any OpenAI playback currently buffering into the channel is halted until `resume`.
 
 ```
 uuid_openai_audio_stream <uuid> resume
 ```
-Resumes audio stream
+Resumes audio streaming in both directions after a `pause`.
+
+```
+uuid_openai_audio_stream <uuid> mute [user | openai | all]
+```
+Keeps the media bug alive while silencing the selected leg. Defaults to `user` when omitted.
+- `user`: block caller audio being sent to OpenAI.
+- `openai`: block OpenAI playback from reaching the channel.
+- `all`: apply both mute operations at once.
+
+```
+uuid_openai_audio_stream <uuid> unmute [user | openai | all]
+```
+Re-enables the selected audio leg after a corresponding `mute`. Defaults to `user` when omitted.
 
 ## Events
 Module will generate the following event types:

diff --git a/mod_openai_audio_stream.c b/mod_openai_audio_stream.c
@@ -63,7 +63,8 @@ static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data,
 static switch_status_t start_capture(switch_core_session_t *session,
                                      switch_media_bug_flag_t flags,
                                      char* wsUri,
-                                     int sampling)
+                                     int sampling,
+                                     switch_bool_t start_muted)
 {
     switch_channel_t *channel = switch_core_session_get_channel(session);
     switch_media_bug_t *bug;
@@ -87,7 +88,7 @@ static switch_status_t start_capture(switch_core_session_t *session,
 
     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "calling stream_session_init.\n");
     if (SWITCH_STATUS_FALSE == stream_session_init(session, responseHandler, read_codec->implementation->actual_samples_per_second,
-                                                 wsUri, sampling, channels, &pUserData)) {
+                                                 wsUri, sampling, channels, start_muted, &pUserData)) {
         switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error initializing mod_openai_audio_stream session.\n");
         return SWITCH_STATUS_FALSE;
     }
@@ -127,6 +128,31 @@ static switch_status_t do_pauseresume(switch_core_session_t *session, int pause)
     return status;
 }
 
+static switch_status_t do_audio_mute(switch_core_session_t *session, const char *target, int mute)
+{
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    const char *which = target && *target ? target : "user";
+
+    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO,
+                      "mod_openai_audio_stream: %s %s audio\n", mute ? "mute" : "unmute", which);
+
+    if (!strcasecmp(which, "user")) {
+        status = stream_session_set_user_mute(session, mute);
+    } else if (!strcasecmp(which, "openai")) {
+        status = stream_session_set_openai_mute(session, mute);
+    } else if (!strcasecmp(which, "all") || !strcasecmp(which, "both")) {
+        switch_status_t user_status = stream_session_set_user_mute(session, mute);
+        switch_status_t openai_status = stream_session_set_openai_mute(session, mute);
+        status = (user_status == SWITCH_STATUS_SUCCESS && openai_status == SWITCH_STATUS_SUCCESS) ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE;
+    } else {
+        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
+                          "mod_openai_audio_stream: invalid mute target '%s', expected user|openai|all\n", which);
+        status = SWITCH_STATUS_FALSE;
+    }
+
+    return status;
+}
+
 static switch_status_t send_json(switch_core_session_t *session, char* json) {
     switch_status_t status = SWITCH_STATUS_FALSE;
     switch_channel_t *channel = switch_core_session_get_channel(session);
@@ -141,10 +167,60 @@ static switch_status_t send_json(switch_core_session_t *session, char* json) {
     return status;
 }
 
-#define STREAM_API_SYNTAX "<uuid> [start | stop | send_json | pause | resume | graceful-shutdown ] [wss-url | path] [mono | mixed | stereo] [8000 | 16000 | 24000]"
+#define STREAM_API_SYNTAX \
+"USAGE:\n" \
+"--------------------------------------------------------------------------------\n" \
+"uuid_openai_audio_stream <uuid> [start | stop | send_json | pause | resume |\n" \
+"                                 mute | unmute]\n" \
+"                                [wss-url | path | user | openai | all | base64json]\n" \
+"                                [mono | mixed | stereo]\n" \
+"                                [8000 | 16000 | 24000]\n" \
+"                                [mute_user]\n" \
+"--------------------------------------------------------------------------------\n"
+
+typedef enum {
+    STREAM_CMD_UNKNOWN,
+    STREAM_CMD_START,
+    STREAM_CMD_STOP,
+    STREAM_CMD_SEND_JSON,
+    STREAM_CMD_PAUSE,
+    STREAM_CMD_RESUME,
+    STREAM_CMD_MUTE,
+    STREAM_CMD_UNMUTE
+} stream_command_t;
+
+static stream_command_t stream_command_from_string(const char *name)
+{
+    if (zstr(name)) {
+        return STREAM_CMD_UNKNOWN;
+    }
+    if (!strcasecmp(name, "start")) {
+        return STREAM_CMD_START;
+    }
+    if (!strcasecmp(name, "stop")) {
+        return STREAM_CMD_STOP;
+    }
+    if (!strcasecmp(name, "send_json")) {
+        return STREAM_CMD_SEND_JSON;
+    }
+    if (!strcasecmp(name, "pause")) {
+        return STREAM_CMD_PAUSE;
+    }
+    if (!strcasecmp(name, "resume")) {
+        return STREAM_CMD_RESUME;
+    }
+    if (!strcasecmp(name, "mute")) {
+        return STREAM_CMD_MUTE;
+    }
+    if (!strcasecmp(name, "unmute")) {
+        return STREAM_CMD_UNMUTE;
+    }
+    return STREAM_CMD_UNKNOWN;
+}
+
 SWITCH_STANDARD_API(stream_function)
 {
-    char *mycmd = NULL, *argv[6] = { 0 };
+    char *mycmd = NULL, *argv[8] = { 0 };
     int argc = 0;
 
     switch_status_t status = SWITCH_STATUS_FALSE;
@@ -154,40 +230,55 @@ SWITCH_STANDARD_API(stream_function)
     }
     assert(cmd);
 
-    if (zstr(cmd) || argc < 2 || (0 == strcmp(argv[1], "start") && argc < 4)) {
+    if (zstr(cmd) || argc < 2) {
         switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd);
-        stream->write_function(stream, "-USAGE: %s\n", STREAM_API_SYNTAX);
+        stream->write_function(stream, "%s\n", STREAM_API_SYNTAX);
         goto done;
-    } else {
-        if (strcasecmp(argv[1], "send_json")) { 
-            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "mod_openai_audio_stream cmd: %s\n", cmd ? cmd : "");
-        }
-        switch_core_session_t *lsession = NULL;
-        if ((lsession = switch_core_session_locate(argv[0]))) {
-            if (!strcasecmp(argv[1], "stop")) {
-                if(argc > 2 && (is_valid_utf8(argv[2]) != SWITCH_STATUS_SUCCESS)) {
+    }
+
+    stream_command_t command = stream_command_from_string(argv[1]);
+
+    if (command != STREAM_CMD_SEND_JSON) {
+        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "mod_openai_audio_stream cmd: %s\n", cmd ? cmd : "");
+    }
+
+    switch_core_session_t *lsession = NULL;
+    if ((lsession = switch_core_session_locate(argv[0]))) {
+        switch (command) {
+            case STREAM_CMD_STOP:
+                if (argc > 2 && (is_valid_utf8(argv[2]) != SWITCH_STATUS_SUCCESS)) {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                                       "%s contains invalid utf8 characters\n", argv[2]);
-                    switch_core_session_rwunlock(lsession);
-                    goto done;
+                    goto release_session;
                 }
                 status = do_stop(lsession, argc > 2 ? argv[2] : NULL);
-            } else if (!strcasecmp(argv[1], "pause")) {
+                break;
+            case STREAM_CMD_PAUSE:
                 status = do_pauseresume(lsession, 1);
-            } else if (!strcasecmp(argv[1], "resume")) {
+                break;
+            case STREAM_CMD_RESUME:
                 status = do_pauseresume(lsession, 0);
-            } else if (!strcasecmp(argv[1], "send_json")) {
+                break;
+            case STREAM_CMD_SEND_JSON:
                 if (argc < 3) {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                                       "send_json requires an argument specifying json to send\n");
-                    switch_core_session_rwunlock(lsession);
-                    goto done;
+                    goto release_session;
                 }
                 status = send_json(lsession, argv[2]);
-            } else if (!strcasecmp(argv[1], "start")) {
-                //switch_channel_t *channel = switch_core_session_get_channel(lsession);
+                break;
+            case STREAM_CMD_START:
+            {
+                if (argc < 4) {
+                    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
+                                      "Error with command %s.\n", cmd);
+                    stream->write_function(stream, "%s\n", STREAM_API_SYNTAX);
+                    goto release_session;
+                }
                 char wsUri[MAX_WS_URI];
                 int sampling = 8000;
+                const char *sampling_str = NULL;
+                switch_bool_t start_muted = SWITCH_FALSE;
                 switch_media_bug_flag_t flags = SMBF_READ_STREAM;
                 flags |= SMBF_WRITE_REPLACE;
                 if (0 == strcmp(argv[3], "mixed")) {
@@ -198,41 +289,65 @@ SWITCH_STANDARD_API(stream_function)
                 } else if (0 != strcmp(argv[3], "mono")) {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                                       "invalid mix type: %s, must be mono, mixed, or stereo\n", argv[3]);
-                    switch_core_session_rwunlock(lsession);
-                    goto done;
+                    goto release_session;
                 }
                 if (argc > 4) {
-                    if (0 == strcmp(argv[4], "16k")) {
-                        sampling = 16000;
-                    } else if (0 == strcmp(argv[4], "8k")) {
-                        sampling = 8000;
-                    } else if (0 == strcmp(argv[4], "24k")) {
-                        sampling = 24000;
+                    int next_index = 4;
+                    if (!strcasecmp(argv[next_index], "mute_user")) {
+                        start_muted = SWITCH_TRUE;
                     } else {
-                        sampling = atoi(argv[4]);
+                        sampling_str = argv[next_index];
+                        if (0 == strcmp(sampling_str, "16k")) {
+                            sampling = 16000;
+                        } else if (0 == strcmp(sampling_str, "8k")) {
+                            sampling = 8000;
+                        } else if (0 == strcmp(sampling_str, "24k")) {
+                            sampling = 24000;
+                        } else {
+                            sampling = atoi(sampling_str);
+                        }
+                        next_index++;
+                        if (argc > next_index && !strcasecmp(argv[next_index], "mute_user")) {
+                            start_muted = SWITCH_TRUE;
+                        }
                     }
                 }
 
                 if (!validate_ws_uri(argv[2], &wsUri[0])) {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                                       "invalid websocket uri: %s\n", argv[2]);
                 } else if (sampling % 8000 != 0) {
-                    switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
-                                      "invalid sample rate: %s\n", argv[4]);
+                    if (sampling_str) {
+                        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
+                                          "invalid sample rate: %s\n", sampling_str);
+                    } else {
+                        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
+                                          "invalid sample rate: %d\n", sampling);
+                    }
                 } else {
-                    status = start_capture(lsession, flags, wsUri, sampling);
+                    status = start_capture(lsession, flags, wsUri, sampling, start_muted);
                 }
-            } else {
+                break;
+            }
+            case STREAM_CMD_MUTE:
+            case STREAM_CMD_UNMUTE:
+            {
+                const char *target = (argc > 2) ? argv[2] : "user";
+                status = do_audio_mute(lsession, target, command == STREAM_CMD_MUTE ? 1 : 0);
+                break;
+            }
+            case STREAM_CMD_UNKNOWN:
+            default:
                 switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                                   "unsupported mod_openai_audio_stream cmd: %s\n", argv[1]);
-            }
-
-
-            switch_core_session_rwunlock(lsession);
-        } else {
-            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error locating session %s\n",
-                              argv[0]);
+                break;
         }
+
+release_session:
+        switch_core_session_rwunlock(lsession);
+    } else {
+        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error locating session %s\n",
+                          argv[0]);
     }
 
     if (status == SWITCH_STATUS_SUCCESS) {
@@ -270,6 +385,8 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
     switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid stop");
     switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid pause");
     switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid resume");
+    switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid mute");
+    switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid unmute");
     switch_console_set_complete("add uuid_openai_audio_stream ::console::list_uuid send_json");
 
     switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "mod_openai_audio_stream API successfully loaded\n");

diff --git a/mod_openai_audio_stream.h b/mod_openai_audio_stream.h
@@ -30,13 +30,14 @@ struct private_data {
     int sampling;
     int channels;
     int audio_paused:1;
+    int user_audio_muted:1;
+    int openai_audio_muted:1;
     int close_requested:1;
     RingBuffer *buffer;
     switch_buffer_t *sbuffer;
     uint8_t *data;
     int rtp_packets;
     switch_buffer_t *playback_buffer;
-    switch_mutex_t *playback_mutex;
 };
 
 typedef struct private_data private_t;