In [1]:
import os
import random
import glob
import numpy as np
import tensorflow as tf
import soundfile as sf
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from pathlib import Path

In [19]:
"""
train_wakeword.py

Usage:
    python train_wakeword.py


Notes:
- Assumes data/positive/*.wav and data/negative/*.wav exist (16 kHz mono but code will resample/pad if needed).
- Adjust hyperparameters like N_EPOCHS, BATCH_SIZE, NUM_MELS to taste.
"""

# -----------------------
# CONFIG
#Very important to reuse these exact settings for all predictions
# -----------------------
SAMPLE_RATE = 16000
DURATION_SECONDS = 1.00        
NUM_SAMPLES = int(SAMPLE_RATE * DURATION_SECONDS)
NUM_MELS = 40
FRAME_LENGTH = 400
FRAME_STEP = 160                    # 10 ms step
FFT_LENGTH = 512
FMIN = 80.0
FMAX = 7600.0
#Need to tweak to prevent over fitting
BATCH_SIZE = 64
N_EPOCHS = 30
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE
#Change to your path
DATA_DIR = "/Users/sethwright/Documents/audio-model/data"
POS_DIR = os.path.join(DATA_DIR, "Training_POS")
NEG_DIR = os.path.join(DATA_DIR, "Training_NEG")

OUT_DIR = "output"
os.makedirs(OUT_DIR, exist_ok=True)



In [20]:
# -----------------------
# 1️⃣ FILE COLLECTION & DATA SPLIT
# -----------------------
pos_files = sorted(glob.glob(os.path.join(POS_DIR, "*.wav")))
neg_files = sorted(glob.glob(os.path.join(NEG_DIR, "*.wav")))

if not pos_files or not neg_files:
    raise SystemExit(f"No .wav files found in {POS_DIR} or {NEG_DIR}.")

all_files = pos_files + neg_files
all_labels = [1]*len(pos_files) + [0]*len(neg_files)  # 1=positive (wake word), 0=negative

# Train/test split
train_files, test_files, train_labels, test_labels = train_test_split(
    all_files, all_labels, test_size=0.15, stratify=all_labels, random_state=SEED)

# Train/val split
train_files, val_files, train_labels, val_labels = train_test_split(
    train_files, train_labels, test_size=0.1, stratify=train_labels, random_state=SEED)

print(f"Files: total={len(all_files)}, train={len(train_files)}, val={len(val_files)}, test={len(test_files)}")

# -----------------------
# 2️⃣ PREPROCESS HELPERS
# -----------------------
'''
def load_and_fix_length(path, target_sr=SAMPLE_RATE, target_len=NUM_SAMPLES):
    wav, sr = sf.read(path, dtype='float32')
    if wav.ndim > 1:
        wav = np.mean(wav, axis=1)
    if sr != target_sr:
        wav = tf.signal.resample(wav, int(len(wav) * target_sr / sr)).numpy()
    if len(wav) > target_len:
        start = random.randint(0, len(wav) - target_len)
        wav = wav[start:start + target_len]
    elif len(wav) < target_len:
        pad = target_len - len(wav)
        left = pad // 2
        right = pad - left
        wav = np.pad(wav, (left, right), mode='constant')
    return wav.astype(np.float32) '''
def load_and_fix_length(path, target_sr=SAMPLE_RATE, target_len=NUM_SAMPLES):
    try:
        wav, sr = sf.read(path, dtype='float32')
    except Exception as e:
        print(f"[WARN] Failed to read {path}: {e}. Replacing with silence.")
        wav = np.zeros(target_len, dtype=np.float32)
        sr = target_sr

    if wav.ndim > 1:
        wav = np.mean(wav, axis=1)

    # Resample if needed
    if sr != target_sr:
        try:
            wav = tf.signal.resample(wav, int(len(wav) * target_sr / sr)).numpy()
        except Exception as e:
            print(f"[WARN] Resample failed for {path}: {e}. Using silence.")
            wav = np.zeros(target_len, dtype=np.float32)

    # Fix length (crop or pad)
    if len(wav) > target_len:
        start = random.randint(0, len(wav) - target_len)
        wav = wav[start:start + target_len]
    elif len(wav) < target_len:
        pad = target_len - len(wav)
        left = pad // 2
        right = pad - left
        wav = np.pad(wav, (left, right), mode='constant')

    return wav.astype(np.float32)

def waveform_to_log_mel(waveform):
    x = tf.convert_to_tensor(waveform, dtype=tf.float32)
    x = tf.reshape(x, [NUM_SAMPLES])
    x = tf.concat([x[:1], x[1:] - 0.97 * x[:-1]], 0)
    stft = tf.signal.stft(
        x, frame_length=FRAME_LENGTH, frame_step=FRAME_STEP,
        fft_length=FFT_LENGTH, window_fn=tf.signal.hann_window)
    mag = tf.abs(stft)
    num_spectrogram_bins = mag.shape[-1]
    mel_weight = tf.signal.linear_to_mel_weight_matrix(
        NUM_MELS, num_spectrogram_bins, SAMPLE_RATE, FMIN, FMAX)
    mel = tf.matmul(mag, mel_weight)
    log_mel = tf.math.log(mel + 1e-6)
    mean = tf.math.reduce_mean(log_mel)
    std = tf.math.reduce_std(log_mel) + 1e-6
    log_mel = (log_mel - mean) / std
    return log_mel  # (time, mels)

def gen(files, labels):
    for p, l in zip(files, labels):
        yield p.encode("utf-8"), np.int64(l)

def _parse(path_bytes, label):
    path = path_bytes.numpy().decode("utf-8")
    wav = load_and_fix_length(path)
    spec = waveform_to_log_mel(wav).numpy()
    spec = np.expand_dims(spec, axis=-1).astype(np.float32)
    return spec, label

def tf_parse(path, label):
    spec, lab = tf.py_function(_parse, [path, label], [tf.float32, tf.int64])
    time_frames = 1 + (NUM_SAMPLES - FRAME_LENGTH) // FRAME_STEP
    spec.set_shape([time_frames, NUM_MELS, 1])
    lab = tf.cast(lab, tf.int64)
    lab.set_shape([])
    return spec, lab



# -----------------------
# 3️⃣ DATASET BUILDERS
# -----------------------
def make_train_dataset(files, labels, batch_size):
    ds = tf.data.Dataset.from_tensor_slices(
        (files, tf.cast(labels, tf.int64))   # ✅ force int64 here
    )
    ds = ds.shuffle(buffer_size=len(files), seed=SEED)
    ds = ds.map(tf_parse, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(AUTOTUNE)
    return ds


def make_dataset(files, labels, shuffle=False):
    ds = tf.data.Dataset.from_generator(
        lambda: gen(files, labels),
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(), dtype=tf.int64))
    )
    if shuffle:
        ds = ds.shuffle(buffer_size=len(files), seed=SEED)
    ds = ds.map(tf_parse, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

# -----------------------
# 4️⃣ BUILD TRAIN / VAL / TEST DATASETS
# -----------------------
train_ds = make_train_dataset(train_files, train_labels, batch_size=BATCH_SIZE)
val_ds   = make_dataset(val_files, val_labels)
test_ds  = make_dataset(test_files, test_labels)

# -----------------------
# 5️⃣ CLASS WEIGHTS (10:1)
# label 1 = wake word, 0 = negative
# -----------------------
class_weights = {0: 1.0, 1: 3.0}
print("Class weights:", class_weights)

Files: total=36037, train=27567, val=3064, test=5406
Class weights: {0: 1.0, 1: 3.0}


In [21]:

# -----------------------
# MODEL
# -----------------------
time_frames = 1 + (NUM_SAMPLES - FRAME_LENGTH) // FRAME_STEP
input_shape = (time_frames, NUM_MELS, 1)

def build_model(input_shape):
    #logistic regression model
    inputs = tf.keras.Input(shape=input_shape)
    x = tf.keras.layers.Conv2D(8, (3,3), padding='same', activation='relu')(inputs)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.MaxPool2D((2,2))(x)

    x = tf.keras.layers.Conv2D(16, (3,3), padding='same', activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.MaxPool2D((2,2))(x)

    x = tf.keras.layers.Conv2D(32, (3,3), padding='same', activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.MaxPool2D((2,2))(x)

    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    return tf.keras.Model(inputs, outputs)

model = build_model(input_shape)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
model.summary()

# -----------------------
# TRAIN
# -----------------------
# -----------------------
# TRAIN
# -----------------------
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),

    # ✅ Updated to save in .keras format (no unsupported args)
    tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(OUT_DIR, "best_model"),
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,   # must be False to save full model
    )
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=N_EPOCHS,
    class_weight=class_weights,   # ✅ apply 10:1 weighting
    callbacks=callbacks,          # ✅ early stopping, learning rate, etc.
)

# -----------------------
# EVALUATE
# -----------------------
print("\nEvaluation on test set:")
res = model.evaluate(test_ds)
print("Test results (loss, accuracy, auc):", res)

y_true, y_pred = [], []
for x_batch, y_batch in test_ds:
    preds = model.predict(x_batch)
    y_true.extend(y_batch.numpy())
    y_pred.extend(preds.flatten())

y_pred_bin = [1 if p >= 0.5 else 0 for p in y_pred]
from sklearn.metrics import confusion_matrix, classification_report
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_bin))
print("\nClassification Report:")
print(classification_report(y_true, y_pred_bin, digits=4))

# -----------------------
# SAVE MODELS
# -----------------------
saved_model_dir = os.path.join(OUT_DIR, "saved_model3.keras")
model.save(saved_model_dir)  # ✅ new API for TensorFlow SavedModel export
print("✅ Exported TensorFlow SavedModel to:", saved_model_dir)






Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 98, 40, 1)]       0         
                                                                 
 conv2d_6 (Conv2D)           (None, 98, 40, 8)         80        
                                                                 
 batch_normalization_6 (Bat  (None, 98, 40, 8)         32        
 chNormalization)                                                
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 49, 20, 8)         0         
 g2D)                                                            
                                                                 
 conv2d_7 (Conv2D)           (None, 49, 20, 16)        1168      
                                                                 
 batch_normalization_7 (Bat  (None, 49, 20, 16)        64  

INFO:tensorflow:Assets written to: output/best_model/assets


Epoch 2/30


INFO:tensorflow:Assets written to: output/best_model/assets


Epoch 3/30
Epoch 4/30
Epoch 5/30


INFO:tensorflow:Assets written to: output/best_model/assets


Epoch 6/30
Epoch 7/30
Epoch 8/30


INFO:tensorflow:Assets written to: output/best_model/assets


Epoch 9/30
Epoch 10/30
Epoch 11/30

Evaluation on test set:
Test results (loss, accuracy, auc): [0.06468230485916138, 0.979097306728363, 0.9869830012321472]

Confusion Matrix:
[[4817   71]
 [  46  472]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9905    0.9855    0.9880      4888
           1     0.8692    0.9112    0.8897       518

    accuracy                         0.9784      5406
   macro avg     0.9299    0.9483    0.9389      5406
weighted avg     0.9789    0.9784    0.9786      5406

✅ Exported TensorFlow SavedModel to: output/saved_model3.keras


In [None]:
# -----------------------
# CONVERT TO TFLITE (INT8)
# -----------------------
def representative_dataset_gen():
    for i, path in enumerate(train_files[:200]):
        wav = load_and_fix_length(path)
        spec = waveform_to_log_mel(wav).numpy().astype(np.float32)
        spec = np.expand_dims(spec, (0, -1))
        yield [spec]

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()
tflite_path = os.path.join(OUT_DIR, "heychef_int8.tflite")
with open(tflite_path, "wb") as f:
    f.write(tflite_model)
print("Saved quantized TFLite model to:", tflite_path)

print("\n✅ All done. Models and logs are in:", OUT_DIR)

In [24]:
# -----------------------
# CONVERT TO TFLITE (FLOAT32)
# -----------------------

keras_model_path = "output/saved_model3.keras"  # folder, not a file
converter = tf.lite.TFLiteConverter.from_saved_model(keras_model_path)

# Keep it as float32 for now (no quantization)
converter.optimizations = []  # no quantization
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]

tflite_model = converter.convert()

tflite_path = os.path.join(OUT_DIR, "Sheila_float32.tflite")
with open(tflite_path, "wb") as f:
    f.write(tflite_model)

print("✅ Saved FLOAT32 TFLite model to:", tflite_path)

OSError: SavedModel file does not exist at: output/saved_model3.keras/{saved_model.pbtxt|saved_model.pb}

In [27]:
# Load trained Keras model
# -----------------------
saved_model_dir = "output/saved_model3"  # no .keras
model.save(saved_model_dir, save_format="tf")
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
print("✅ Model loaded:")
# Convert directly from Keras model in memory


converter.optimizations = []  # float32
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]

tflite_model = converter.convert()

tflite_path = os.path.join(OUT_DIR, "sheila_float32.tflite")
with open(tflite_path, "wb") as f:
    f.write(tflite_model)

print("✅ Saved FLOAT32 TFLite model to:", tflite_path)


INFO:tensorflow:Assets written to: output/saved_model3/assets


INFO:tensorflow:Assets written to: output/saved_model3/assets


✅ Model loaded:


2025-11-23 17:58:44.681742: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2025-11-23 17:58:44.681761: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.


✅ Saved FLOAT32 TFLite model to: output/sheila_float32.tflite


2025-11-23 17:58:44.681885: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: output/saved_model3
2025-11-23 17:58:44.683059: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2025-11-23 17:58:44.683064: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: output/saved_model3
2025-11-23 17:58:44.686357: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2025-11-23 17:58:44.738560: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: output/saved_model3
2025-11-23 17:58:44.751050: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 69168 microseconds.
