diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
new file mode 100644
index 0000000..15416bc
--- /dev/null
+++ b/.github/workflows/checks.yml
@@ -0,0 +1,93 @@
+name: Build & Static Checks
+
+on:
+  push:
+    branches: [ main, develop ]
+  pull_request:
+    branches: [ main, develop ]
+  workflow_dispatch:
+
+jobs:
+  analyze:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
+
+      - name: Set up Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build SDK image (cached)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: Dockerfile.ci
+          tags: freeswitch-sdk:ci
+          load: true
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Run analysis inside container
+        uses: addnab/docker-run-action@v3
+        with:
+          image: freeswitch-sdk:ci
+          options: -v ${{ github.workspace }}:/work
+          run: |
+            set -eux
+            cd /work
+            git config --global --add safe.directory /work
+
+            # Regular build tree: compiles the module and exports
+            # compile_commands.json for clang-tidy and cppcheck.
+            cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+              -DCMAKE_C_COMPILER=clang \
+              -DCMAKE_CXX_COMPILER=clang++ \
+              -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+              -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+            cmake --build build -j"$(nproc)"
+
+            # scan-build only intercepts compilers it injects through CC/CXX,
+            # so the analysis tree must be configured under scan-build too;
+            # wrapping just the build of a clang-pinned tree analyzes nothing.
+            scan-build --use-cc=clang --use-c++=clang++ \
+              cmake -S . -B build-scan -DCMAKE_BUILD_TYPE=Release
+            scan-build --use-cc=clang --use-c++=clang++ --status-bugs \
+              --exclude /work/buffer --exclude /work/libs \
+              cmake --build build-scan -j"$(nproc)"
+
+            # grep exits 1 when it filters out every line; without `|| true`
+            # that status would abort the script under `set -e`.
+            FILES="$(git ls-files '*.c' '*.cc' '*.cpp' '*.cxx' | grep -v '^buffer/' | grep -v '^libs/' || true)"
+            if [ -n "$FILES" ]; then
+              # $FILES is intentionally unquoted: one argument per file.
+              clang-tidy -p build $FILES \
+                --warnings-as-errors='clang-analyzer-*,bugprone-*,performance-*'
+            else
+              echo "No source files found for clang-tidy analysis."
+            fi
+
+            # Blocking pass: cppcheck exits 0 unless --error-exitcode is set,
+            # and its findings go to stderr, so show the log before failing.
+            cppcheck --enable=warning,performance,portability --std=c++17 --force \
+              --project=build/compile_commands.json \
+              --suppress=missingIncludeSystem \
+              --error-exitcode=1 \
+              -i build -i buffer -i libs 2> cppcheck-warn.log \
+              || { cat cppcheck-warn.log; exit 1; }
+
+            # Advisory pass: style findings are reported but never fail the job.
+            cppcheck --enable=style --std=c++17 --force \
+              --project=build/compile_commands.json \
+              --suppress=missingIncludeSystem \
+              -i build -i buffer -i libs 2> cppcheck-style.log || true
+
+            if [ -s cppcheck-style.log ]; then
+              echo "Style issues found by cppcheck:"
+              cat cppcheck-style.log
+            else
+              echo "No style issues found by cppcheck."
+            fi
diff --git a/Dockerfile.ci b/Dockerfile.ci
new file mode 100644
index 0000000..58c18f4
--- /dev/null
+++ b/Dockerfile.ci
@@ -0,0 +1,83 @@
+# syntax=docker/dockerfile:1.7
+
+############################
+# Stage 1: Build dependencies + FreeSWITCH
+############################
+FROM debian:12 AS builder
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates git curl wget \
+    build-essential cmake automake autoconf libtool libtool-bin libltdl-dev pkg-config \
+    libssl-dev zlib1g-dev libdb-dev unixodbc-dev libncurses5-dev libexpat1-dev \
+    libgdbm-dev bison erlang-dev libtpl-dev libtiff5-dev uuid-dev \
+    libpcre3-dev libpcre2-dev libedit-dev libsqlite3-dev libcurl4-openssl-dev nasm \
+    libogg-dev libspeex-dev libspeexdsp-dev libldns-dev python3-dev \
+    libavformat-dev libswscale-dev libswresample-dev \
+    liblua5.2-dev libopus-dev libpq-dev \
+    libsndfile1-dev libflac-dev libvorbis-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+
+WORKDIR /src
+
+# Shallow clones: only the current tree is built, history is never used.
+RUN git clone --depth 1 https://github.com/signalwire/libks && \
+    git clone --depth 1 https://github.com/freeswitch/sofia-sip && \
+    git clone --depth 1 https://github.com/freeswitch/spandsp && \
+    git clone --depth 1 https://github.com/signalwire/signalwire-c && \
+    git clone --depth 1 https://github.com/signalwire/freeswitch
+
+# libks
+WORKDIR /src/libks
+RUN cmake . -DCMAKE_INSTALL_PREFIX=/usr -DWITH_LIBBACKTRACE=1 && \
+    make -j"$(nproc)" && make install
+
+# sofia-sip
+WORKDIR /src/sofia-sip
+RUN ./bootstrap.sh && \
+    ./configure --with-pic --with-glib=no --without-doxygen --disable-stun --prefix=/usr && \
+    make -j"$(nproc)" && make install
+
+# spandsp
+WORKDIR /src/spandsp
+RUN ./bootstrap.sh && \
+    ./configure --with-pic --prefix=/usr && \
+    make -j"$(nproc)" && make install
+
+# signalwire-c
+WORKDIR /src/signalwire-c
+RUN PKG_CONFIG_PATH=/usr/lib/pkgconfig cmake . -DCMAKE_INSTALL_PREFIX=/usr && \
+    make -j"$(nproc)" && make install
+
+# FreeSWITCH SDK
+WORKDIR /src/freeswitch
+RUN ./bootstrap.sh -j && \
+    ./configure --prefix=/usr && \
+    make -j"$(nproc)" && make install
+
+############################
+# Stage 2: Slim SDK image (no FreeSWITCH runtime)
+############################
+FROM debian:12
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# ca-certificates is listed explicitly: with --no-install-recommends it is not pulled in, and git/curl/wget need it for HTTPS.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    clang clang-tidy clang-tools \
+    cppcheck cmake pkg-config ccache \
+    libssl-dev zlib1g-dev \
+    libspeexdsp-dev libspandsp-dev \
+    ca-certificates git curl wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy only SDK bits
+COPY --from=builder /usr/include/freeswitch/ /usr/include/freeswitch/
+COPY --from=builder /usr/lib/pkgconfig/freeswitch.pc /usr/lib/pkgconfig/
+COPY --from=builder /usr/lib/libfreeswitch.so* /usr/lib/
+
+WORKDIR /work
diff --git a/README.md b/README.md index 183191f..9ee293c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ # mod_openai_audio_stream -A fork of [mod_audio_stream](https://github.com/amigniter/mod_audio_stream) specifically designed for streaming audio to OpenAI's realtime API and playing the responses back to the user via FreeSWITCH and WebSocket. -**mod_openai_audio_stream** is a FreeSWITCH module that streams L16 audio from a channel to an OpenAI realtime websocket endpoint. 
The stream is adherent to OpenAI's Realtime API specification and allows for real-time audio playback directly in the channel. +![Build & Static Code Checks](https://github.com/VoiSmart/mod_openai_audio_stream/actions/workflows/checks.yml/badge.svg) + +**mod_openai_audio_stream** is a FreeSWITCH module that streams L16 audio from a channel to an OpenAI Realtime WebSocket endpoint. The stream follows OpenAI's Realtime API specification and enables real-time audio playback directly in the channel. + +It is a fork of [mod_audio_stream](https://github.com/amigniter/mod_audio_stream), specifically adapted for streaming audio to OpenAI's Realtime API and playing the responses back to the user via FreeSWITCH and WebSocket. + +The goal of **mod_openai_audio_stream** is to provide a simple, lightweight, yet effective module for streaming audio and receiving responses directly from OpenAI’s Realtime WebSocket into the call through FreeSWITCH. It uses [ixwebsocket](https://machinezone.github.io/IXWebSocket/), a C++ WebSocket library compiled as a static library. -The purpose of **mod_openai_audio_stream** was to make a simple, less dependent but yet effective module to stream audio and receive responses directly from OpenAI realtime websocket into the call via switch. It uses [ixwebsocket](https://machinezone.github.io/IXWebSocket/), c++ library for websocket protocol which is compiled as a static library. ## Notes @@ -72,6 +76,8 @@ The following channel variables can be used to fine tune websocket connection an - `Buffer Size` actually represents a duration of audio chunk sent to websocket. If you want to send e.g. 100ms audio packets to your ws endpoint you would set this variable to 100. If ommited, default packet size of 20ms will be sent as grabbed from the audio channel (which is default FreeSWITCH frame size) - Set `STREAM_OPENAI_API_KEY` to have a valid OpenAI API key to authenticate with OpenAI's Realtime API. This is required for the module to function properly. 
If not set the module will use the `STREAM_EXTRA_HEADERS` to pass the OpenAI API key as a header assuming you prepared the headers in the channel variable. **NOTE**: An OpenAI API key is required for the module to function properly. If not set, the module will not be able to connect to the API. +- You can specify the OpenAI Realtime model in the URI, e.g. `uuid_openai_audio_stream ${uuid} start wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview-2024-12-17 mono 24k` + - Extra headers should be a JSON object with key-value pairs representing additional HTTP headers. Each key should be a header name, and its corresponding value should be a string. ```json { @@ -107,6 +113,7 @@ Attaches a media bug and starts streaming audio (in L16 format) to the websocket - "8k" = 8000 Hz sample rate will be generated - "16k" = 16000 Hz sample rate will be generated - "24k" = 24000 Hz sample rate will be generated +- **IMPORTANT NOTE**: When using the PCM audio format, the OpenAI Realtime API expects audio at a 24 kHz sample rate. Set the sampling-rate parameter to `24k` (or `24000`) and select `mono` so that the audio is sent in the correct format. 
From the OpenAI Realtime API documentation: *input audio must be 16-bit PCM at a 24kHz sample rate, single channel (mono), and little-endian byte order.* Support for exchanging audio with OpenAI in other formats may be developed in the future, which would make the `` and ` send_json diff --git a/mod_openai_audio_stream.c b/mod_openai_audio_stream.c index b427649..fd6a246 100644 --- a/mod_openai_audio_stream.c +++ b/mod_openai_audio_stream.c @@ -27,7 +27,6 @@ static switch_bool_t capture_callback(switch_media_bug_t *bug, void *user_data, switch (type) { case SWITCH_ABC_TYPE_INIT: - const char *uuid = switch_core_session_get_uuid(session); break; case SWITCH_ABC_TYPE_CLOSE: @@ -156,7 +155,7 @@ SWITCH_STANDARD_API(stream_function) assert(cmd); if (zstr(cmd) || argc < 2 || (0 == strcmp(argv[1], "start") && argc < 4)) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s %s %s.\n", cmd, argv[0], argv[1]); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Error with command %s.\n", cmd); stream->write_function(stream, "-USAGE: %s\n", STREAM_API_SYNTAX); goto done; } else { diff --git a/openai_audio_streamer_glue.cpp b/openai_audio_streamer_glue.cpp index a9340d1..a8babf4 100644 --- a/openai_audio_streamer_glue.cpp +++ b/openai_audio_streamer_glue.cpp @@ -194,36 +194,51 @@ class AudioStreamer { } } + std::vector resampleRawAudio(const std::string& input_raw) { - size_t in_samples = input_raw.size() / 2; - size_t out_samples = static_cast(in_samples * out_sample_rate / static_cast(in_sample_rate)) + 1; + + double scaled = static_cast(in_samples) * out_sample_rate / in_sample_rate; + size_t out_samples = static_cast(scaled) + 1; std::vector in_buffer(in_samples); std::vector out_buffer(out_samples); std::memcpy(in_buffer.data(), input_raw.data(), input_raw.size()); - spx_uint32_t in_len = in_samples; - spx_uint32_t out_len = out_samples; + if (in_samples > UINT32_MAX || out_samples > UINT32_MAX) { + 
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, + "Too many samples to resample: in=%zu, out=%zu\n", + in_samples, out_samples); + return {}; + } + + spx_uint32_t in_len = static_cast(in_samples); + spx_uint32_t out_len = static_cast(out_samples); - int err = speex_resampler_process_int(m_resampler, 0, in_buffer.data(), &in_len, out_buffer.data(), &out_len); + int err = speex_resampler_process_int(m_resampler, 0, + in_buffer.data(), &in_len, + out_buffer.data(), &out_len); if (err != RESAMPLER_ERR_SUCCESS) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Resampling failed with error code: %d\n", err); - return std::vector(); // return empty vector on error + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, + "Resampling failed with error code: %d\n", err); + return {}; // empty on error } - out_buffer.resize(out_len); // resize to actual size used + out_buffer.resize(out_len); // resize to actual resampled size return out_buffer; } - std::string createWavFromRaw(std::string rawAudio) { + // create wav file from raw audio + // rawAudio passed as constant reference because it is never edited + std::string createWavFromRaw(const std::string& rawAudio) { + const int numChannels = 1; // mono const int bitsPerSample = 16; // pcm16 int byteRate = in_sample_rate * numChannels * bitsPerSample / 8; int blockAlign = numChannels * bitsPerSample / 8; - uint32_t dataSize = rawAudio.size(); + uint32_t dataSize = static_cast(rawAudio.size()); uint32_t chunkSize = 36 + dataSize; std::ostringstream wavStream; // write in string like stream @@ -449,8 +464,10 @@ namespace { memset(tech_pvt, 0, sizeof(private_t)); - strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID); - strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI); + strncpy(tech_pvt->sessionId, switch_core_session_get_uuid(session), MAX_SESSION_ID - 1); + tech_pvt->sessionId[MAX_SESSION_ID - 1] = '\0'; + strncpy(tech_pvt->ws_uri, wsUri, MAX_WS_URI - 1); + 
tech_pvt->ws_uri[MAX_WS_URI - 1] = '\0'; tech_pvt->sampling = desiredSampling; tech_pvt->responseHandler = responseHandler; tech_pvt->rtp_packets = rtp_packets; @@ -480,7 +497,7 @@ namespace { switch_mutex_init(&tech_pvt->mutex, SWITCH_MUTEX_NESTED, pool); - if (desiredSampling != sampling) { + if (static_cast(desiredSampling) != sampling) { if (switch_buffer_create(pool, &tech_pvt->sbuffer, buflen) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "%s: Error creating switch buffer.\n", tech_pvt->sessionId); @@ -510,7 +527,7 @@ namespace { ringBufferInit(tech_pvt->buffer, tech_pvt->data, adjSize); } - if (desiredSampling != sampling) { + if (static_cast(desiredSampling) != sampling) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) resampling from %u to %u\n", tech_pvt->sessionId, sampling, desiredSampling); tech_pvt->resampler = speex_resampler_init(channels, sampling, desiredSampling, SWITCH_RESAMPLE_QUALITY, &err); if (0 != err) { @@ -559,17 +576,14 @@ namespace { extern "C" { int validate_ws_uri(const char* url, char* wsUri) { - const char* scheme = nullptr; const char* hostStart = nullptr; const char* hostEnd = nullptr; const char* portStart = nullptr; // Check scheme if (strncmp(url, "ws://", 5) == 0) { - scheme = "ws"; hostStart = url + 5; } else if (strncmp(url, "wss://", 6) == 0) { - scheme = "wss"; hostStart = url + 6; } else { return 0; @@ -728,7 +742,7 @@ extern "C" { int channels, void **ppUserData) { - int deflate, heart_beat; + int deflate = 0, heart_beat = 0; bool suppressLog = false; const char* buffer_size; const char* extra_headers; @@ -957,8 +971,6 @@ extern "C" { return SWITCH_TRUE; } - uint32_t available = switch_buffer_inuse(tech_pvt->playback_buffer); - uint32_t bytes_needed = frame->datalen; uint32_t bytes_per_sample = frame->datalen / frame->samples; @@ -1003,7 +1015,9 @@ extern "C" { { auto* tech_pvt = (private_t*) switch_core_media_bug_get_user_data(bug); 
char sessionId[MAX_SESSION_ID]; - strcpy(sessionId, tech_pvt->sessionId); + + strncpy(sessionId, tech_pvt->sessionId, MAX_SESSION_ID - 1); + sessionId[MAX_SESSION_ID - 1] = '\0'; switch_mutex_lock(tech_pvt->mutex); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) stream_session_cleanup\n", sessionId);