Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ The following channel variables can be used to fine tune websocket connection an
- `Buffer Size` actually represents a duration of audio chunk sent to websocket. If you want to send e.g. 100ms audio packets to your ws endpoint
you would set this variable to 100. If omitted, the default packet size of 20ms will be sent as grabbed from the audio channel (which is the default FreeSWITCH frame size)
- Set `STREAM_OPENAI_API_KEY` to a valid OpenAI API key to authenticate with OpenAI's Realtime API. If it is not set, the module falls back to `STREAM_EXTRA_HEADERS` to pass the OpenAI API key as a header, assuming you have prepared the headers in that channel variable. **NOTE**: An OpenAI API key is required for the module to function properly. Without one, the module will not be able to connect to the API.
- You can specify the OpenAI Realtime model in the URI, e.g. `uuid_openai_audio_stream ${uuid} start wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview-2024-12-17 mono 24k`

- Extra headers should be a JSON object with key-value pairs representing additional HTTP headers. Each key should be a header name, and its corresponding value should be a string.
```json
{
Expand Down Expand Up @@ -107,6 +109,7 @@ Attaches a media bug and starts streaming audio (in L16 format) to the websocket
- "8k" = 8000 Hz sample rate will be generated
- "16k" = 16000 Hz sample rate will be generated
- "24k" = 24000 Hz sample rate will be generated
- **IMPORTANT NOTE**: The OpenAI Realtime API, when using PCM audio format, expects the audio to be at a 24 kHz sample rate. Set the sampling-rate parameter to `24k` (or `24000`) and use mono to ensure that the audio is sent in the correct format. From the OpenAI Realtime API documentation: *input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order.* Support for exchanging audio with OpenAI in other formats may be developed in the future, which would make the `<sampling-rate>` and `<mono>` parameters useful for controlling the output format dynamically.

```
uuid_openai_audio_stream <uuid> send_json
Expand Down
3 changes: 1 addition & 2 deletions mod_openai_audio_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data,

switch (type) {
case SWITCH_ABC_TYPE_INIT:
const char *uuid = switch_core_session_get_uuid(session);
break;

case SWITCH_ABC_TYPE_CLOSE:
Expand Down Expand Up @@ -156,7 +155,7 @@ SWITCH_STANDARD_API(stream_function)
assert(cmd);

if (zstr(cmd) || argc < 2 || (0 == strcmp(argv[1], "start") && argc < 4)) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]);
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd);
stream->write_function(stream, "-USAGE: %s\n", STREAM_API_SYNTAX);
goto done;
} else {
Expand Down
47 changes: 28 additions & 19 deletions openai_audio_streamer_glue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,27 +194,39 @@ class AudioStreamer {
}
}


std::vector<int16_t> resampleRawAudio(const std::string& input_raw) {

size_t in_samples = input_raw.size() / 2;
size_t out_samples = static_cast<size_t>(in_samples * out_sample_rate / static_cast<float>(in_sample_rate)) + 1;

double scaled = static_cast<double>(in_samples) * out_sample_rate / in_sample_rate;
Copy link

Copilot AI Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The calculation uses double precision floating point which may be overkill for sample rate conversion. Consider using float precision to improve performance, especially since audio sample rates are typically integers and don't require double precision accuracy.

Suggested change
double scaled = static_cast<double>(in_samples) * out_sample_rate / in_sample_rate;
float scaled = static_cast<float>(in_samples) * out_sample_rate / in_sample_rate;

Copilot uses AI. Check for mistakes.
size_t out_samples = static_cast<size_t>(scaled) + 1;

std::vector<int16_t> in_buffer(in_samples);
std::vector<int16_t> out_buffer(out_samples);

std::memcpy(in_buffer.data(), input_raw.data(), input_raw.size());

spx_uint32_t in_len = in_samples;
spx_uint32_t out_len = out_samples;
if (in_samples > UINT32_MAX || out_samples > UINT32_MAX) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR,
"Too many samples to resample: in=%zu, out=%zu\n",
in_samples, out_samples);
return {};
}

spx_uint32_t in_len = static_cast<spx_uint32_t>(in_samples);
spx_uint32_t out_len = static_cast<spx_uint32_t>(out_samples);

int err = speex_resampler_process_int(m_resampler, 0, in_buffer.data(), &in_len, out_buffer.data(), &out_len);
int err = speex_resampler_process_int(m_resampler, 0,
in_buffer.data(), &in_len,
out_buffer.data(), &out_len);

if (err != RESAMPLER_ERR_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Resampling failed with error code: %d\n", err);
return std::vector<int16_t>(); // return empty vector on error
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR,
"Resampling failed with error code: %d\n", err);
return {}; // empty on error
}

out_buffer.resize(out_len); // resize to actual size used
out_buffer.resize(out_len); // resize to actual resampled size
return out_buffer;
}

Expand All @@ -223,7 +235,7 @@ class AudioStreamer {
const int bitsPerSample = 16; // pcm16
int byteRate = in_sample_rate * numChannels * bitsPerSample / 8;
int blockAlign = numChannels * bitsPerSample / 8;
uint32_t dataSize = rawAudio.size();
uint32_t dataSize = static_cast<uint32_t>(rawAudio.size());
uint32_t chunkSize = 36 + dataSize;

std::ostringstream wavStream; // write in string like stream
Expand Down Expand Up @@ -449,8 +461,10 @@ namespace {

memset(tech_pvt, 0, sizeof(private_t));

strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID);
strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI);
strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID - 1);
tech_pvt->sessionId[MAX_SESSION_ID - 1] = '\0';
strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI - 1);
tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0';
Copy link

Copilot AI Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While the null termination fix is good, consider using safer string copy functions like strlcpy or snprintf instead of strncpy to avoid potential buffer overflow issues and ensure proper null termination in all cases.

Suggested change
tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0';
snprintf(tech_pvt->sessionId, MAX_SESSION_ID, "%s", switch_core_session_get_uuid(session));
snprintf(tech_pvt->ws_uri, MAX_WS_URI, "%s", wsUri);

Copilot uses AI. Check for mistakes.
tech_pvt->sampling = desiredSampling;
tech_pvt->responseHandler = responseHandler;
tech_pvt->rtp_packets = rtp_packets;
Expand Down Expand Up @@ -480,7 +494,7 @@ namespace {

switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, pool);

if (desiredSampling != sampling) {
if (static_cast<uint32_t>(desiredSampling) != sampling) {
if (switch_buffer_create(pool, &tech_pvt->sbuffer, buflen) != SWITCH_STATUS_SUCCESS) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR,
"%s: Error creating switch buffer.\n", tech_pvt->sessionId);
Expand Down Expand Up @@ -510,7 +524,7 @@ namespace {
ringBufferInit(tech_pvt->buffer, tech_pvt->data, adjSize);
}

if (desiredSampling != sampling) {
if (static_cast<uint32_t>(desiredSampling) != sampling) {
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) resampling from %u to %u\n", tech_pvt->sessionId, sampling, desiredSampling);
tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err);
if (0 != err) {
Expand Down Expand Up @@ -559,17 +573,14 @@ namespace {

extern "C" {
int validate_ws_uri(const char* url, char* wsUri) {
const char* scheme = nullptr;
const char* hostStart = nullptr;
const char* hostEnd = nullptr;
const char* portStart = nullptr;

// Check scheme
if (strncmp(url, "ws://", 5) == 0) {
scheme = "ws";
hostStart = url + 5;
} else if (strncmp(url, "wss://", 6) == 0) {
scheme = "wss";
hostStart = url + 6;
} else {
return 0;
Expand Down Expand Up @@ -728,7 +739,7 @@ extern "C" {
int channels,
void **ppUserData)
{
int deflate, heart_beat;
int deflate = 0, heart_beat = 0;
bool suppressLog = false;
const char* buffer_size;
const char* extra_headers;
Expand Down Expand Up @@ -957,8 +968,6 @@ extern "C" {
return SWITCH_TRUE;
}

uint32_t available = switch_buffer_inuse(tech_pvt->playback_buffer);

uint32_t bytes_needed = frame->datalen;
uint32_t bytes_per_sample = frame->datalen / frame->samples;

Expand Down