VoiSmart · dariopellegrino00 · Aug 20, 2025 · Jul 30, 2025 · Jul 30, 2025 · Copilot
diff --git a/README.md b/README.md
@@ -72,6 +72,8 @@ The following channel variables can be used to fine tune websocket connection an
 - `Buffer Size` actually represents a duration of audio chunk sent to websocket. If you want to send e.g. 100ms audio packets to your ws endpoint
 you would set this variable to 100. If ommited, default packet size of 20ms will be sent as grabbed from the audio channel (which is default FreeSWITCH frame size)
 - Set `STREAM_OPENAI_API_KEY` to have a valid OpenAI API key to authenticate with OpenAI's Realtime API. This is required for the module to function properly. If not set the module will use the `STREAM_EXTRA_HEADERS` to pass the OpenAI API key as a header assuming you prepared the headers in the channel variable. **NOTE**: An OpenAI API key is required for the module to function properly. If not set, the module will not be able to connect to the API.
+- You can specify the OpenAI Realtime model in the URI, e.g. `uuid_openai_audio_stream ${uuid} start wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview-2024-12-17 mono 24k`
+
 - Extra headers should be a JSON object with key-value pairs representing additional HTTP headers. Each key should be a header name, and its corresponding value should be a string.
   ```json
   {
@@ -107,6 +109,7 @@ Attaches a media bug and starts streaming audio (in L16 format) to the websocket
   - "8k" = 8000 Hz sample rate will be generated
   - "16k" = 16000 Hz sample rate will be generated
   - "24k" = 24000 Hz sample rate will be generated
+- **IMPORTANT NOTE**: The OpenAI Realtime API, when using the PCM audio format, expects audio at a 24 kHz sample rate. Set the sampling-rate parameter to `24k` (or `24000`) and use mono to ensure that the audio is sent in the correct format. From the OpenAI Realtime API documentation: *input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order.* Support for exchanging audio with OpenAI in other formats may be developed in the future, which would make the `<sampling-rate>` and `<mono>` parameters useful for controlling the output format dynamically.
 
 ```
 uuid_openai_audio_stream <uuid> send_json

diff --git a/mod_openai_audio_stream.c b/mod_openai_audio_stream.c
@@ -27,7 +27,6 @@ static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data,
 
     switch (type) {
         case SWITCH_ABC_TYPE_INIT:
-            const char *uuid = switch_core_session_get_uuid(session);
             break;
 
         case SWITCH_ABC_TYPE_CLOSE:
@@ -156,7 +155,7 @@ SWITCH_STANDARD_API(stream_function)
     assert(cmd);
 
     if (zstr(cmd) || argc < 2 || (0 == strcmp(argv[1], "start") && argc < 4)) {
-        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
+        switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd);
         stream->write_function(stream, "-USAGE: %s\n", STREAM_API_SYNTAX);
         goto done;
     } else {

diff --git a/openai_audio_streamer_glue.cpp b/openai_audio_streamer_glue.cpp
@@ -194,27 +194,39 @@ class AudioStreamer {
         }
     }
 
+
     std::vector<int16_t> resampleRawAudio(const std::string& input_raw) {
-
         size_t in_samples = input_raw.size() / 2;
-        size_t out_samples = static_cast<size_t>(in_samples * out_sample_rate / static_cast<float>(in_sample_rate)) + 1;
+
+        double scaled = static_cast<double>(in_samples) * out_sample_rate / in_sample_rate;
-        double scaled = static_cast<double>(in_samples) * out_sample_rate / in_sample_rate;
+        float scaled = static_cast<float>(in_samples) * out_sample_rate / in_sample_rate;
-        double scaled = static_cast<double>(in_samples) * out_sample_rate / in_sample_rate;
+        float scaled = static_cast<float>(in_samples) * out_sample_rate / in_sample_rate;
+        size_t out_samples = static_cast<size_t>(scaled) + 1;
 
         std::vector<int16_t> in_buffer(in_samples);
         std::vector<int16_t> out_buffer(out_samples);
 
         std::memcpy(in_buffer.data(), input_raw.data(), input_raw.size());
 
-        spx_uint32_t in_len = in_samples;
-        spx_uint32_t out_len = out_samples;
+        if (in_samples > UINT32_MAX || out_samples > UINT32_MAX) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR,
+                              "Too many samples to resample: in=%zu, out=%zu\n",
+                              in_samples, out_samples);
+            return {};
+        }
+
+        spx_uint32_t in_len = static_cast<spx_uint32_t>(in_samples);
+        spx_uint32_t out_len = static_cast<spx_uint32_t>(out_samples);
 
-        int err = speex_resampler_process_int(m_resampler, 0, in_buffer.data(), &in_len, out_buffer.data(), &out_len);
+        int err = speex_resampler_process_int(m_resampler, 0,
+                                              in_buffer.data(), &in_len,
+                                              out_buffer.data(), &out_len);
 
         if (err != RESAMPLER_ERR_SUCCESS) {
-            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Resampling failed with error code: %d\n", err);
-            return std::vector<int16_t>(); // return empty vector on error
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR,
+                              "Resampling failed with error code: %d\n", err);
+            return {}; // empty on error
         }
 
-        out_buffer.resize(out_len);  // resize to actual size used
+        out_buffer.resize(out_len);  // resize to actual resampled size
         return out_buffer;
     }
 
@@ -223,7 +235,7 @@ class AudioStreamer {
         const int bitsPerSample = 16; // pcm16
         int byteRate = in_sample_rate * numChannels * bitsPerSample / 8;
         int blockAlign = numChannels * bitsPerSample / 8;
-        uint32_t dataSize = rawAudio.size();
+        uint32_t dataSize = static_cast<uint32_t>(rawAudio.size());
         uint32_t chunkSize = 36 + dataSize;
 
         std::ostringstream wavStream; // write in string like stream
@@ -449,8 +461,10 @@ namespace {
 
             memset(tech_pvt, 0, sizeof(private_t));
 
-            strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
-            strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI);
+            strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID - 1);
+            tech_pvt->sessionId[MAX_SESSION_ID - 1] = '\0';
+            strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI - 1);
+            tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0';
-            tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0';
+            snprintf(tech_pvt->sessionId, MAX_SESSION_ID, "%s", switch_core_session_get_uuid(session));
+            snprintf(tech_pvt->ws_uri, MAX_WS_URI, "%s", wsUri);
-            tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0';
+            snprintf(tech_pvt->sessionId, MAX_SESSION_ID, "%s", switch_core_session_get_uuid(session));
+            snprintf(tech_pvt->ws_uri, MAX_WS_URI, "%s", wsUri);
             tech_pvt->sampling = desiredSampling;
             tech_pvt->responseHandler = responseHandler;
             tech_pvt->rtp_packets = rtp_packets;
@@ -480,7 +494,7 @@ namespace {
 
             switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, pool);
 
-            if (desiredSampling != sampling) {
+            if (static_cast<uint32_t>(desiredSampling) != sampling) {
                 if (switch_buffer_create(pool, &tech_pvt->sbuffer, buflen) != SWITCH_STATUS_SUCCESS) {
                     switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
                         "%s: Error creating switch buffer.\n", tech_pvt->sessionId);
@@ -510,7 +524,7 @@ namespace {
                 ringBufferInit(tech_pvt->buffer, tech_pvt->data, adjSize);
             }
 
-            if (desiredSampling != sampling) {
+            if (static_cast<uint32_t>(desiredSampling) != sampling) {
                 switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) resampling from %u to %u\n", tech_pvt->sessionId, sampling, desiredSampling);
                 tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
                 if (0 != err) {
@@ -559,17 +573,14 @@ namespace {
 
 extern "C" {
     int validate_ws_uri(const char* url, char* wsUri) {
-        const char* scheme = nullptr;
         const char* hostStart = nullptr;
         const char* hostEnd = nullptr;
         const char* portStart = nullptr;
 
         // Check scheme
         if (strncmp(url, "ws://", 5) == 0) {
-            scheme = "ws";
             hostStart = url + 5;
         } else if (strncmp(url, "wss://", 6) == 0) {
-            scheme = "wss";
             hostStart = url + 6;
         } else {
             return 0;
@@ -728,7 +739,7 @@ extern "C" {
                                         int channels,
                                         void **ppUserData)
     {
-        int deflate, heart_beat;
+        int deflate = 0, heart_beat = 0;
         bool suppressLog = false;
         const char* buffer_size;
         const char* extra_headers;
@@ -957,8 +968,6 @@ extern "C" {
             return SWITCH_TRUE;
         }
 
-        uint32_t available = switch_buffer_inuse(tech_pvt->playback_buffer);
-
         uint32_t bytes_needed = frame->datalen;
         uint32_t bytes_per_sample = frame->datalen / frame->samples;