In [1]:
import pyaudio
import wave
import tensorflow as tf
import numpy as np
import os
from collections import deque
import datetime
from playsound3 import playsound
import threading
import simpleaudio as sa


In [24]:
# Microphone capture settings passed to PyAudio when the input stream is opened
CHUNK = 160  # Frames requested per stream.read() call (10 ms of audio at RATE = 16000)
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1  # Mono audio
RATE = 16000  # Sampling rate in Hz

def play_sound(path):
    """
    Start playback of the WAV file at `path` without waiting for it to finish.

    Playback is best-effort: any exception raised while loading or starting
    the sound is reported on stdout and swallowed, so a missing or unreadable
    file never interrupts the caller. Always returns None.
    """
    try:
        # play() returns immediately; the audio keeps playing in the background
        sa.WaveObject.from_wave_file(path).play()
    except Exception as err:
        print("[ERROR] Could not play sound:", err)


In [4]:
def load_wav_16k_mono_from_buffer(audio_bytes, dtype=np.int16):
    """
    Convert raw PCM bytes (e.g. one PyAudio stream read) to a 1-D float32 tensor.

    The samples are rescaled according to `dtype` so the result always lies in
    [-1.0, 1.0], the same value range tf.audio.decode_wav produces:

    * signed integers   -> divided by the type's full scale (32768 for int16)
    * unsigned integers -> centred on the midpoint, then divided by it
    * floating point    -> assumed to be normalized already; only cast to float32

    Args:
        audio_bytes: bytes-like object holding interleaved-free (mono) samples.
        dtype: NumPy dtype of one sample in `audio_bytes`. Defaults to int16,
            which matches a stream opened with pyaudio.paInt16.

    Returns:
        tf.Tensor of shape [samples] and dtype float32.

    Raises:
        TypeError: if `dtype` is neither an integer nor a floating-point type.
        ValueError: (from NumPy) if the byte length is not a multiple of the
            sample size.
    """
    sample_dtype = np.dtype(dtype)
    audio_np = np.frombuffer(audio_bytes, dtype=sample_dtype)

    if np.issubdtype(sample_dtype, np.signedinteger):
        # Full scale is |min|, e.g. 32768 for int16, so min maps to exactly -1.0
        full_scale = float(-np.iinfo(sample_dtype).min)
        audio_float32 = audio_np.astype(np.float32) / full_scale
    elif np.issubdtype(sample_dtype, np.unsignedinteger):
        # Unsigned PCM stores silence at the midpoint (128 for uint8)
        midpoint = (np.iinfo(sample_dtype).max + 1) / 2.0
        audio_float32 = (audio_np.astype(np.float32) - midpoint) / midpoint
    elif np.issubdtype(sample_dtype, np.floating):
        audio_float32 = audio_np.astype(np.float32)
    else:
        raise TypeError(f"Unsupported PCM sample dtype: {sample_dtype}")

    return tf.convert_to_tensor(audio_float32, dtype=tf.float32)

In [23]:


# Feature-extraction parameters read by preprocess_from_buffer and the listening loops
SAMPLE_RATE = 16000  # Hz; passed to the mel filterbank as the audio sample rate
DURATION_SECONDS = 1.00        # Length of the analysis window in seconds
NUM_SAMPLES = int(SAMPLE_RATE * DURATION_SECONDS)  # Samples per analysis window (16000)
NUM_MELS = 40  # Number of mel bands in the spectrogram
FRAME_LENGTH = 400  # STFT window length in samples (25 ms at 16 kHz)
FRAME_STEP = 160                    # 10 ms step
FFT_LENGTH = 512  # FFT size used by the STFT
FMIN = 80.0  # Lower edge of the mel filterbank in Hz
FMAX = 7600.0  # Upper edge of the mel filterbank in Hz

# NOTE(review): the constants below are not referenced by any inference code
# visible in this notebook; presumably carried over from the training notebook.
BATCH_SIZE = 32
N_EPOCHS = 30
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [6]:
def preprocess_from_buffer(audio_tensor):
    """
    Convert a 1-D audio buffer into a normalized log-mel spectrogram.

    Steps: cast to float32, trim / zero-pad to NUM_SAMPLES, pre-emphasis,
    STFT magnitude, mel filterbank, log, then global mean/std normalization.

    Args:
        audio_tensor: 1-D array-like of audio samples (NumPy array, list or
            tf.Tensor) of any numeric dtype. The amplitude scale is largely
            irrelevant: a constant gain only adds a constant to the log-mel
            values, which the mean subtraction removes (up to the 1e-6 floor).

    Returns:
        float32 tf.Tensor of shape [time_frames, NUM_MELS, 1]. With the
        constants in this notebook (16000 samples, 400-sample frames, 160-sample
        step) that is [98, 40, 1].
    """
    # Accept int16 / float64 / list input; tf.cast is a no-op for float32 tensors.
    # Without this, integer input fails at the 0.97 multiply and float64 input
    # fails at the matmul with the float32 mel matrix.
    audio_tensor = tf.cast(audio_tensor, tf.float32)

    # Keep at most NUM_SAMPLES samples, then right-pad with zeros if shorter
    audio_tensor = audio_tensor[:NUM_SAMPLES]
    pad_len = NUM_SAMPLES - tf.shape(audio_tensor)[0]
    audio_tensor = tf.pad(audio_tensor, [[0, pad_len]])

    # Pre-emphasis filter y[n] = x[n] - 0.97 * x[n-1] to boost high frequencies;
    # the first sample is passed through unchanged
    emphasized = tf.concat(
        [audio_tensor[:1], audio_tensor[1:] - 0.97 * audio_tensor[:-1]],
        axis=0,
    )

    # Short-time Fourier transform -> magnitude spectrogram [frames, fft_bins]
    stft = tf.signal.stft(
        emphasized,
        frame_length=FRAME_LENGTH,
        frame_step=FRAME_STEP,
        fft_length=FFT_LENGTH,
        window_fn=tf.signal.hann_window
    )
    mag = tf.abs(stft)

    # Project the linear-frequency bins onto NUM_MELS mel bands between FMIN and FMAX
    mel_weight = tf.signal.linear_to_mel_weight_matrix(
        NUM_MELS,
        mag.shape[-1],
        SAMPLE_RATE,
        FMIN,
        FMAX
    )
    mel = tf.matmul(mag, mel_weight)

    # Log compression (1e-6 avoids log(0)), then normalize over the whole clip
    log_mel = tf.math.log(mel + 1e-6)
    mean = tf.reduce_mean(log_mel)
    std = tf.math.reduce_std(log_mel) + 1e-6  # epsilon guards against division by zero
    log_mel = (log_mel - mean) / std

    # Add a trailing channel dimension
    log_mel = tf.expand_dims(log_mel, axis=-1)

    return log_mel  # shape: [time_frames, NUM_MELS, 1]


In [43]:
# Location of the float32 TFLite export of the wake-word model
model_path = "/Users/sethwright/Documents/audio-model/output/sheila_float32.tflite"

# Build the interpreter and allocate its tensors before any inference call
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# Tensor metadata: the 'index' entries are what set_tensor / get_tensor expect
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_shape = input_details[0]["shape"]

# Show the model signature so it can be checked against the preprocessing output
print(f"Input shape: {input_shape}")
print("Input details:", input_details)
print("Output details:", output_details)


Input shape: [ 1 98 40  1]
Input details: [{'name': 'serving_default_input_3:0', 'index': 0, 'shape': array([ 1, 98, 40,  1], dtype=int32), 'shape_signature': array([-1, 98, 40,  1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Output details: [{'name': 'StatefulPartitionedCall:0', 'index': 33, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([-1,  1], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [28]:

# Run real-time wake-word detection with the TFLite interpreter
def test_audio_tflite():
    """
    Listen on the default input device until the wake word is detected once.

    Audio is read in CHUNK-frame pieces into a rolling buffer of NUM_SAMPLES
    samples. Once the buffer is full, every new chunk triggers one inference
    with the module-level TFLite `interpreter`. When WINDOW_SIZE consecutive
    scores all exceed THRESHOLD, the function plays a notification sound,
    writes the buffered audio to OUT_DIR as a 16-bit WAV file and returns.

    The input stream and the PyAudio instance are released on every exit path,
    including a kernel interrupt (KeyboardInterrupt) or an error during
    inference or file writing.

    Returns:
        None
    """
    OUT_DIR = "/Users/sethwright/Documents/audio-model/data"
    FALSE_POS_DIR = os.path.join(OUT_DIR, "false_positives")
    THRESHOLD = 0.5  # Raise to make detection stricter
    WINDOW_SIZE = 3  # Consecutive scores that must all exceed THRESHOLD

    # Creating the sub-directory also guarantees that OUT_DIR itself exists
    os.makedirs(FALSE_POS_DIR, exist_ok=True)

    # The buffer length must equal NUM_SAMPLES, otherwise the "buffer full"
    # check below would never (or only once) be true
    audio_buffer = deque(maxlen=NUM_SAMPLES)
    # Sliding window for smoothing predictions
    window = deque(maxlen=WINDOW_SIZE)

    audio = pyaudio.PyAudio()
    stream = None
    try:
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
        print("listening started")

        while True:
            # Overflows are ignored so a slow inference drops audio instead of raising
            data = stream.read(CHUNK, exception_on_overflow=False)
            samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
            audio_buffer.extend(samples)

            if len(audio_buffer) != NUM_SAMPLES:
                continue  # still filling the first analysis window

            wav_tensor = np.array(audio_buffer, dtype=np.float32)
            spectrogram = preprocess_from_buffer(wav_tensor)
            spectrogram = tf.expand_dims(spectrogram, axis=0)  # add batch dimension

            interpreter.set_tensor(input_details[0]['index'], spectrogram)
            interpreter.invoke()
            pred = interpreter.get_tensor(output_details[0]['index'])[0][0]

            window.append(pred)

            # Trigger only if all predictions in the window exceed the threshold
            if len(window) == WINDOW_SIZE and all(p > THRESHOLD for p in window):
                print("Wake word detected!")
                print(f"Model raw output: {pred:.4f}")
                play_sound("/Users/sethwright/Downloads/gong.wav")

                # Save the audio that triggered the detection
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = os.path.join(OUT_DIR, f"Positive_{timestamp}_{pred:.4f}.wav")
                with wave.open(filename, 'wb') as wf:
                    wf.setnchannels(CHANNELS)
                    wf.setsampwidth(audio.get_sample_size(FORMAT))
                    wf.setframerate(RATE)
                    # Buffer holds int16-range values stored as float32
                    wf.writeframes(np.array(audio_buffer, dtype=np.int16).tobytes())
                print(f"Audio saved: {filename}")
                break
    finally:
        # Always release the audio device, even on interrupt or error
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()


In [19]:
# Load the trained Keras model for inference only (no optimizer / loss state)
MODEL_PATH = "/Users/sethwright/Documents/audio-model/output/saved_model.keras"

# NOTE(review): safe_mode=False permits deserialization of arbitrary Python
# lambdas stored in the file; only use it with model files you trust.
model = tf.keras.models.load_model(
    MODEL_PATH,
    compile=False,
    safe_mode=False,
)
print("✅ Model loaded:", MODEL_PATH)

✅ Model loaded: /Users/sethwright/Documents/audio-model/output/saved_model.keras


In [39]:
# Run real-time wake-word detection with the Keras model
def test_audio():
    """
    Listen on the default input device and report every wake-word detection.

    Audio is read in CHUNK-frame pieces into a rolling buffer of NUM_SAMPLES
    samples. Once the buffer is full, every new chunk triggers one prediction
    with the module-level Keras `model`. When WINDOW_SIZE consecutive scores
    all exceed THRESHOLD, the function plays a notification sound, writes the
    buffered audio to OUT_DIR as a 16-bit WAV file, clears its buffers and
    keeps listening.

    The loop has no exit condition of its own; stop it by interrupting the
    kernel. The input stream and the PyAudio instance are released on every
    exit path, including KeyboardInterrupt.

    Returns:
        None (only reached through an exception such as KeyboardInterrupt).
    """
    OUT_DIR = "/Users/sethwright/Documents/audio-model/data"
    FALSE_POS_DIR = os.path.join(OUT_DIR, "false_positives")
    THRESHOLD = 0.8  # Raise to make detection stricter
    WINDOW_SIZE = 3  # Consecutive scores that must all exceed THRESHOLD

    # Creating the sub-directory also guarantees that OUT_DIR itself exists.
    # NOTE: automatic saving of near-miss clips (score > 0.5 without a full
    # window above THRESHOLD) into FALSE_POS_DIR used to live here as disabled
    # code and is currently not implemented.
    os.makedirs(FALSE_POS_DIR, exist_ok=True)

    # The buffer must hold exactly NUM_SAMPLES samples. With a larger maxlen the
    # "buffer full" check below is true for a single chunk only, so the model
    # would run once and the smoothing window could never fill up.
    audio_buffer = deque(maxlen=NUM_SAMPLES)
    # Sliding window for smoothing predictions
    window = deque(maxlen=WINDOW_SIZE)

    audio = pyaudio.PyAudio()
    stream = None
    try:
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
        print("listening started")

        while True:
            # Overflows are ignored so a slow prediction drops audio instead of raising
            data = stream.read(CHUNK, exception_on_overflow=False)
            samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
            audio_buffer.extend(samples)

            if len(audio_buffer) != NUM_SAMPLES:
                continue  # still (re)filling the analysis window

            wav_tensor = np.array(audio_buffer, dtype=np.float32)
            spectrogram = preprocess_from_buffer(wav_tensor)
            spectrogram = tf.expand_dims(spectrogram, axis=0)  # add batch dimension

            # NOTE(review): predict() carries per-call overhead; if the loop cannot
            # keep up with 10 ms chunks, calling the model directly may help — verify.
            pred = model.predict(spectrogram, verbose=0)[0][0]

            window.append(pred)

            # Trigger only if all predictions in the window exceed the threshold
            if len(window) == WINDOW_SIZE and all(p > THRESHOLD for p in window):
                print("Wake word detected!")
                print(f"Model raw output: {pred:.4f}")
                play_sound("/Users/sethwright/Downloads/gong.wav")

                # Save the audio that triggered the detection
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = os.path.join(OUT_DIR, f"Positive_{timestamp}_{pred:.4f}.wav")
                with wave.open(filename, 'wb') as wf:
                    wf.setnchannels(CHANNELS)
                    wf.setsampwidth(audio.get_sample_size(FORMAT))
                    wf.setframerate(RATE)
                    # Buffer holds int16-range values stored as float32
                    wf.writeframes(np.array(audio_buffer, dtype=np.int16).tobytes())
                print(f"Audio saved: {filename}")

                # Start from an empty buffer so the same utterance is not reported twice
                audio_buffer.clear()
                window.clear()
    finally:
        # Always release the audio device, even on interrupt or error
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()

In [40]:
# Start live detection with the TFLite model; the call returns after the first detection
test_audio_tflite()

listening started
Wake word detected!
Model raw output: 0.9758
Audio saved: /Users/sethwright/Documents/audio-model/data/Positive_20251123_180344_0.9758.wav


In [19]:
import os
import wave
import contextlib

# Directory containing the wake-word recordings to inspect
audio_dir = "/Users/sethwright/Documents/audio-model/data/sheila"

# Clips shorter than this many seconds are deleted after the summary is printed
MIN_DURATION_SECONDS = 0.6


def wav_duration_seconds(file_path):
    """Return the playing time of a WAV file in seconds (frame count / frame rate)."""
    # The file is closed when the with-block ends, before the caller touches it again
    with wave.open(file_path, "rb") as wav_file:
        return wav_file.getnframes() / float(wav_file.getframerate())


# Measure every WAV file once; the same measurements drive both the summary
# and the clean-up below
clip_durations = {}
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):  # Change this if your files have a different extension
        clip_durations[filename] = wav_duration_seconds(os.path.join(audio_dir, filename))

durations = list(clip_durations.values())

if durations:
    smallest = min(durations)
    largest = max(durations)
    average = sum(durations) / len(durations)
    print(f"Smallest duration: {smallest:.2f} seconds")
    print(f"Largest duration: {largest:.2f} seconds")
    print(f"Average duration: {average:.2f} seconds")
else:
    print("No audio files found in the directory.")

# Remove clips that are too short. Each file was already closed after being
# measured, so the delete also works on platforms that refuse to remove open
# files. Note: the summary above still includes the clips removed here.
for filename, duration in clip_durations.items():
    if duration < MIN_DURATION_SECONDS:
        print(f"Deleting {filename} (duration: {duration:.2f}s)")
        os.remove(os.path.join(audio_dir, filename))

Smallest duration: 0.64 seconds
Largest duration: 1.00 seconds
Average duration: 0.99 seconds
