In [22]:
import pyaudio
import tensorflow as tf
import numpy as np
from collections import deque
from playsound3 import playsound




In [23]:
# Microphone capture configuration used by the streaming cells further down.
FORMAT = pyaudio.paInt16  # PyAudio sample format: 16-bit integers
CHANNELS = 1              # one input channel (mono)
RATE = 16_000             # samples per second requested from the device
CHUNK = 400               # frames per stream.read() call (400 / 16000 = 25 ms)


In [24]:
def load_wav_16k_mono_from_buffer(audio_bytes, dtype=np.int16):
    """
    Decode a raw PCM byte string (such as one PyAudio read) into a 1-D
    float32 TensorFlow tensor.

    Args:
        audio_bytes: bytes-like object holding the raw samples.
        dtype: NumPy dtype used to reinterpret the bytes (default: int16).

    Returns:
        tf.Tensor of dtype float32 with one element per sample. Each sample
        is divided by 32768.0, which maps int16 input into [-1.0, 1.0) and is
        intended to mirror tf.audio.decode_wav(..., desired_channels=1).
    """
    # Reinterpret the byte buffer as integer samples (no copy is made here).
    pcm = np.frombuffer(audio_bytes, dtype=dtype)

    # Scale by the int16 full-scale magnitude.
    scaled = pcm.astype(np.float32) / 32768.0

    return tf.convert_to_tensor(scaled, dtype=tf.float32)

In [25]:
# --- TFLite model file ------------------------------------------------------
model_path = "/Users/sethwright/Documents/audio-model/output/heychef_float32.tflite"
print(model_path)

# --- Analysis window --------------------------------------------------------
SAMPLE_RATE = 16000
DURATION_SECONDS = 1.25
NUM_SAMPLES = int(SAMPLE_RATE * DURATION_SECONDS)  # 16000 * 1.25 = 20000 samples

# --- Log-mel spectrogram parameters (read by preprocess_from_buffer) --------
FRAME_LENGTH = 512
FRAME_STEP = 160   # 160 / 16000 = 10 ms hop between frames
FFT_LENGTH = 512
NUM_MELS = 40
FMIN = 80.0
FMAX = 7600.0

# --- Training-style settings (not referenced by the cells visible here) -----
BATCH_SIZE = 32
N_EPOCHS = 30
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

/Users/sethwright/Documents/audio-model/output/heychef_float32.tflite


In [26]:
def preprocess_from_buffer(audio_tensor):
    """
    Build a standardized log-mel spectrogram from a 1-D audio buffer.

    Steps: trim/zero-pad to NUM_SAMPLES, pre-emphasis, Hann-windowed STFT
    magnitude, mel projection, log, then global mean/std standardization.
    The docstring this replaces described the result as compatible with the
    training pipeline; that pipeline is not part of this notebook.

    Args:
        audio_tensor: 1-D float audio samples (NumPy array or tf.Tensor).

    Returns:
        tf.Tensor shaped [time_frames, NUM_MELS, 1].
    """
    # Fix the window length: drop anything past NUM_SAMPLES, then right-pad with zeros.
    window = audio_tensor[:NUM_SAMPLES]
    shortfall = NUM_SAMPLES - tf.shape(window)[0]
    window = tf.pad(window, [[0, shortfall]])

    # First-order pre-emphasis y[n] = x[n] - 0.97 * x[n-1]; the first sample is kept as-is.
    head = window[:1]
    differenced = window[1:] - 0.97 * window[:-1]
    emphasized = tf.concat([head, differenced], axis=0)

    # Magnitude of the short-time Fourier transform.
    magnitude = tf.abs(
        tf.signal.stft(
            emphasized,
            frame_length=FRAME_LENGTH,
            frame_step=FRAME_STEP,
            fft_length=FFT_LENGTH,
            window_fn=tf.signal.hann_window,
        )
    )

    # Project the linear-frequency bins onto NUM_MELS mel bands.
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=NUM_MELS,
        num_spectrogram_bins=magnitude.shape[-1],
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=FMIN,
        upper_edge_hertz=FMAX,
    )
    mel_energies = tf.matmul(magnitude, mel_matrix)

    # Log compression (1e-6 guards log(0)), then standardize over the whole spectrogram.
    log_mel = tf.math.log(mel_energies + 1e-6)
    centre = tf.reduce_mean(log_mel)
    spread = tf.math.reduce_std(log_mel) + 1e-6
    standardized = (log_mel - centre) / spread

    # Trailing channel axis.
    return tf.expand_dims(standardized, axis=-1)


In [None]:
def preprocess_from_buffer_old(audio_tensor):
    """
    Earlier preprocessing variant: plain STFT magnitude, no mel/log step.

    The input is trimmed or zero-padded to 20000 samples, transformed with a
    512-sample frame and 160-sample step, and given a trailing channel axis.

    Returns:
        tf.Tensor shaped [time_frames, frequency_bins, 1].
    """
    target_len = 20000
    clipped = audio_tensor[:target_len]

    # Number of zeros needed to reach target_len (a one-element shape tensor).
    missing = [target_len] - tf.shape(clipped)
    padded = tf.concat([clipped, tf.zeros(missing, dtype=tf.float32)], axis=0)

    magnitude = tf.abs(tf.signal.stft(padded, frame_length=512, frame_step=160))
    return tf.expand_dims(magnitude, axis=2)


In [None]:
# Build a TFLite interpreter for the file at `model_path` and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# Tensor metadata; the streaming cells read the 'index' entries from these lists.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Show what the model expects and produces.
print("Input details:", input_details)
print("Output details:", output_details)


In [None]:
# Live wake-word detection with the TFLite interpreter.
# A rolling window of the most recent NUM_SAMPLES microphone samples is kept in
# a deque; once it is full, inference is re-run after every CHUNK read.
# Interrupt the kernel to stop: the stream and PyAudio are released in `finally`.
audio = pyaudio.PyAudio()
audio_buffer = deque(maxlen=NUM_SAMPLES)  # same length preprocess_from_buffer pads/trims to

# List input devices
print("Available input devices:")
for i in range(audio.get_device_count()):
    info = audio.get_device_info_by_index(i)
    if info["maxInputChannels"] > 0:
        print(f"  [{i}] {info['name']}")

stream = None
try:
    # Open stream
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        # NOTE(review): samples stay in int16 range here, whereas
        # load_wav_16k_mono_from_buffer scales to [-1, 1] - confirm which
        # scaling the model was trained with.
        samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
        audio_buffer.extend(samples)

        # Wait until a full window has been captured.
        if len(audio_buffer) < NUM_SAMPLES:
            continue

        wav_tensor = np.array(audio_buffer, dtype=np.float32)
        spectrogram = preprocess_from_buffer(wav_tensor)
        spectrogram = tf.expand_dims(spectrogram, axis=0)  # add batch axis

        # set_tensor is documented to take a numpy array, so convert explicitly.
        interpreter.set_tensor(input_details[0]['index'], spectrogram.numpy())
        interpreter.invoke()
        output = interpreter.get_tensor(output_details[0]['index'])[0][0]

        if output > 0.5:
            print("Wake word detected!")
except KeyboardInterrupt:
    print("Stopped by user.")
finally:
    # Release the audio device whether the loop was interrupted or failed.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()
        

Available input devices:
  [0] MacBook Pro Microphone


In [None]:
# Per-chunk TFLite inference (debug variant that prints every result).
# NOTE(review): each inference here sees only one CHUNK-sample read, which
# preprocess_from_buffer zero-pads to NUM_SAMPLES - confirm this is intended
# rather than a rolling window of recent audio.
audio = pyaudio.PyAudio()

# List input devices
print("Available input devices:")
for i in range(audio.get_device_count()):
    info = audio.get_device_info_by_index(i)
    if info["maxInputChannels"] > 0:
        print(f"  [{i}] {info['name']}")

stream = None
try:
    # Open stream
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

    print("\nRecording...")
    while True:
        chunk_bytes = stream.read(CHUNK, exception_on_overflow=False)

        # Bytes -> float32 tensor -> log-mel spectrogram with a batch axis.
        wav_tensor = load_wav_16k_mono_from_buffer(chunk_bytes)
        spectrogram = preprocess_from_buffer(wav_tensor)
        spectrogram = tf.expand_dims(spectrogram, axis=0)

        print("Spectrogram shape:", spectrogram.shape)
        # set_tensor is documented to take a numpy array, so convert explicitly.
        interpreter.set_tensor(input_details[0]['index'], spectrogram.numpy())
        # Run inference
        interpreter.invoke()
        # Get the output
        output_data = interpreter.get_tensor(output_details[0]['index'])
        # Threshold the scalar score rather than the whole array.
        output_data_number = [1 if output_data[0][0] > .5 else 0]
        print("Output:", output_data, output_data_number)
except KeyboardInterrupt:
    print("Stopped by user.")
finally:
    # Always release the device; the original called terminate() on an
    # undefined name `p`, which raised NameError during cleanup.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()

In [27]:
# Restore the trained Keras model from disk.
# compile=False: the model is only used for prediction in this notebook.
# NOTE(review): safe_mode=False relaxes Keras' deserialization safety checks -
# only load model files from a trusted source.
MODEL_PATH = "/Users/sethwright/Documents/audio-model/output/saved_model.keras"
model = tf.keras.models.load_model(
    MODEL_PATH,
    compile=False,
    safe_mode=False,
)
print("✅ Model loaded:", MODEL_PATH)

✅ Model loaded: /Users/sethwright/Documents/audio-model/output/saved_model.keras


In [28]:
# Run real-time predictions with the Keras model.
def test_audio(threshold=0.9, sound_path="/Users/sethwright/Downloads/gong.mp3"):
    """
    Listen on the default microphone until the wake word is detected.

    A rolling window of the latest NUM_SAMPLES samples is fed through
    preprocess_from_buffer and the global Keras `model` after every CHUNK
    read. On the first score above `threshold`, the score is printed,
    `sound_path` is played, and the function returns.

    Args:
        threshold: score the model output must exceed to count as a detection.
        sound_path: audio file handed to playsound() on detection.

    Returns:
        None. The stream and the PyAudio instance are released on every exit
        path, including a kernel interrupt or an error inside the loop.
    """
    audio = pyaudio.PyAudio()
    audio_buffer = deque(maxlen=NUM_SAMPLES)  # same length preprocess_from_buffer pads/trims to
    stream = None
    try:
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
        print("listening started")
        while True:
            data = stream.read(CHUNK, exception_on_overflow=False)
            # NOTE(review): samples stay in int16 range here, whereas
            # load_wav_16k_mono_from_buffer scales to [-1, 1] - confirm which
            # scaling the model was trained with.
            samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
            audio_buffer.extend(samples)

            # Wait until a full window has been captured.
            if len(audio_buffer) < NUM_SAMPLES:
                continue

            wav_tensor = np.array(audio_buffer, dtype=np.float32)
            spectrogram = preprocess_from_buffer(wav_tensor)
            spectrogram = tf.expand_dims(spectrogram, axis=0)  # add batch axis

            pred = model.predict(spectrogram, verbose=0)[0][0]

            if pred > threshold:
                print("Wake word detected!")
                print(f"\nModel raw output: {pred:.4f}")
                playsound(sound_path)
                break
    finally:
        # Release the audio device however the loop ended.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
    

In [35]:
# Start listening on the microphone; returns once the model score exceeds the detection threshold.
test_audio()

listening started
Wake word detected!

Model raw output: 0.9104
