In [80]:
import json
import glob
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

In [81]:
DATA_DIR = "data/" # e.g., "data/"
CLASS_0_FILES = glob.glob(f"{DATA_DIR}/data_label_0*.jsonl")
CLASS_1_FILES = glob.glob(f"{DATA_DIR}/data_label_1*.jsonl")

In [82]:
MAX_SEQ_LEN = 198  # e.g., 25 seconds at 4Hz
NUM_FEATURES = 202 # Adjust based on your final feature set

In [83]:
def extract_features(line_json):
    """
    Turns raw JSON into a compact feature vector.
    WE MUST REPLICATE THIS LOGIC EXACTLY IN C++ LATER.
    """
    try:
        data = json.loads(line_json)
        feats = []

        # --- 1. Thermal Sensors (Reduce 192 pixels to 6 values) ---
        # We care about: Is there a hot spot? (Max) Is the whole area hot? (Mean)
        for pos in ['left', 'center', 'right']:
            pixels = np.array(data['thermal'][pos])
            feats.append(np.max(pixels))  # Feature 0, 2, 4: Max Temp
            feats.append(np.mean(pixels)) # Feature 1, 3, 5: Avg Temp

        # --- 2. Radar (Log Scale) ---
        # Energy is huge (3,000,000), so we use log() to squash it to ~15.0
        feats.append(np.log1p(data['radar']['left']['energy']))
        feats.append(data['radar']['left']['range'])

        feats.append(np.log1p(data['radar']['right']['energy']))
        feats.append(data['radar']['right']['range'])

        # --- 3. Mic ---
        feats.append(data['mic']['left'])
        feats.append(data['mic']['right'])

        # Total Features = 6 (Thermal) + 4 (Radar) + 2 (Mic) = 12
        return np.array(feats, dtype=np.float32)

    except (KeyError, ValueError, json.JSONDecodeError):
        return None

In [84]:
# --- Load Data ---
all_X = []
all_y = []

# Load Class 0
for f_path in CLASS_0_FILES:
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None] # Remove errors
        if len(seq) > 0:
            all_X.append(np.array(seq))
            all_y.append(0)

# Load Class 1
for f_path in CLASS_1_FILES:
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None]
        if len(seq) > 0:
            all_X.append(np.array(seq))
            all_y.append(1)

# --- Pad Sequences ---
# Makes every recording exactly 20 steps long
X_padded = pad_sequences(all_X, maxlen=MAX_SEQ_LEN, dtype='float32', padding='pre', truncating='post')
y = np.array(all_y)

# --- Normalize (CRITICAL FOR ESP32) ---
# Calculate Mean and Std on the flattened data
# We use manual calculation so we can print it easily for C++
X_flat = np.concatenate(X_padded, axis=0)
mean_vals = np.mean(X_flat, axis=0)
std_vals = np.std(X_flat, axis=0) + 0.0001 # Avoid divide by zero

# Apply Normalization
X_norm = (X_padded - mean_vals) / std_vals

# --- PRINT VALUES FOR C++ ---
print("\n" + "="*40)
print("COPY THIS INTO YOUR ESP32 CODE")
print("="*40)
print(f"const float MEAN_VALS[] = {{ {', '.join([f'{x:.4f}' for x in mean_vals])} }};")
print(f"const float STD_VALS[]  = {{ {', '.join([f'{x:.4f}' for x in std_vals])} }};")
print("="*40 + "\n")

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42, shuffle=True)


COPY THIS INTO YOUR ESP32 CODE
const float MEAN_VALS[] = { 26.8168, 23.8184, 27.1421, 24.3267, 34.6056, 24.6114, 1.8039, 0.1024, 8.8036, 5.0907, 0.0019, 0.0022 };
const float STD_VALS[]  = { 3.5088, 1.2978, 3.1518, 1.6688, 5.4812, 1.5281, 4.9213, 0.6076, 7.0024, 5.2913, 0.0023, 0.0026 };



In [85]:
model = Sequential([
    Input(shape=(MAX_SEQ_LEN, 12)), # 12 Features per step

    # Simple Dense layer per timestep
    Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),

    # Find the strongest signal across the time sequence
    GlobalMaxPooling1D(),

    # Final Decision
    Dropout(0.3),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train
model.fit(X_train, y_train, epochs=60, batch_size=8, validation_data=(X_test, y_test))

# Save and Convert (Quantized)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Representative dataset for quantization
def representative_dataset():
    for i in range(min(100, len(X_train))):
        yield [X_train[i].reshape(1, MAX_SEQ_LEN, 12).astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()

# Write to file
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

print("Model saved as model.tflite. Run 'xxd -i model.tflite > model.h' to convert.")

Epoch 1/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.5714 - loss: 3.7691 - val_accuracy: 0.5000 - val_loss: 2.2453
Epoch 2/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - accuracy: 0.5714 - loss: 1.9429 - val_accuracy: 0.5000 - val_loss: 2.1945
Epoch 3/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step - accuracy: 0.5714 - loss: 3.0648 - val_accuracy: 0.5000 - val_loss: 2.1431
Epoch 4/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.5714 - loss: 3.0215 - val_accuracy: 0.5000 - val_loss: 2.0936
Epoch 5/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - accuracy: 0.5714 - loss: 3.6737 - val_accuracy: 0.5000 - val_loss: 2.0438
Epoch 6/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step - accuracy: 0.5714 - loss: 1.0288 - val_accuracy: 0.5000 - val_loss: 1.9979
Epoch 7/60
[1m1/1[0m [32m━━━━━━━━━━━━━━━



In [89]:
model.predict(X_test[1].reshape(1, MAX_SEQ_LEN, 12))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step


array([[0.76173896]], dtype=float32)

In [88]:
y_test

array([1, 0])

In [91]:
import json
import glob
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# --- CONFIGURATION ---
DATA_DIR = "data/"
CLASS_0_FILES = glob.glob(f"{DATA_DIR}/data_label_0*.jsonl")
CLASS_1_FILES = glob.glob(f"{DATA_DIR}/data_label_1*.jsonl")

# Keep your length if you want, but 198 is very long for 4Hz (50 seconds).
# If your events are shorter, consider lowering this to 50.
MAX_SEQ_LEN = 198
AUGMENTATION_FACTOR = 50 # <--- NEW: Create 50 fake copies for every 1 real file

def extract_features(line_json):
    """ Turns raw JSON into a compact feature vector (12 features). """
    try:
        data = json.loads(line_json)
        feats = []

        # 1. Thermal (6 values)
        for pos in ['left', 'center', 'right']:
            pixels = np.array(data['thermal'][pos])
            feats.append(np.max(pixels))
            feats.append(np.mean(pixels))

        # 2. Radar (4 values)
        feats.append(np.log1p(data['radar']['left']['energy']))
        feats.append(data['radar']['left']['range'])
        feats.append(np.log1p(data['radar']['right']['energy']))
        feats.append(data['radar']['right']['range'])

        # 3. Mic (2 values)
        feats.append(data['mic']['left'])
        feats.append(data['mic']['right'])

        return np.array(feats, dtype=np.float32)

    except (KeyError, ValueError, json.JSONDecodeError):
        return None

# --- 1. LOAD REAL DATA ---
print("Loading data files...")
real_X = []
real_y = []

# Load Class 0
for f_path in CLASS_0_FILES:
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None]
        if len(seq) > 0:
            real_X.append(np.array(seq))
            real_y.append(0.0)

# Load Class 1
for f_path in CLASS_1_FILES:
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None]
        if len(seq) > 0:
            real_X.append(np.array(seq))
            real_y.append(1.0)

# Pad to fixed length
X_padded_real = pad_sequences(real_X, maxlen=MAX_SEQ_LEN, dtype='float32', padding='pre', truncating='post')
y_real = np.array(real_y)

print(f"Loaded {len(X_padded_real)} real samples.")

# --- 2. AUGMENTATION ENGINE (THE MISSING PIECE) ---
print(f"Generating synthetic data (x{AUGMENTATION_FACTOR})...")
aug_X = []
aug_y = []

for i in range(len(X_padded_real)):
    original_sample = X_padded_real[i] # Shape (198, 12)
    label = y_real[i]

    # Add the original
    aug_X.append(original_sample)
    aug_y.append(label)

    # Create clones with random noise
    for _ in range(AUGMENTATION_FACTOR):
        # Add random jitter (Gaussian noise)
        noise = np.random.normal(0, 0.05, original_sample.shape)

        # Scale slightly (simulate hotter/colder environment)
        scale = np.random.uniform(0.95, 1.05)

        new_sample = (original_sample * scale) + noise
        aug_X.append(new_sample)
        aug_y.append(label)

X_final = np.array(aug_X, dtype=np.float32)
y_final = np.array(aug_y, dtype=np.float32)

print(f"Total Training Samples after Augmentation: {len(X_final)}")

# --- 3. NORMALIZE ---
# Calculate stats on the AUGMENTED data
X_flat = X_final.reshape(-1, 12)
mean_vals = np.mean(X_flat, axis=0)
std_vals = np.std(X_flat, axis=0) + 0.0001

X_norm = (X_final - mean_vals) / std_vals

# Print for ESP32
print("\n" + "="*40)
print("COPY THIS INTO YOUR ESP32 CODE")
print("="*40)
print(f"const float MEAN_VALS[] = {{ {', '.join([f'{x:.4f}' for x in mean_vals])} }};")
print(f"const float STD_VALS[]  = {{ {', '.join([f'{x:.4f}' for x in std_vals])} }};")
print("="*40 + "\n")

# Split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y_final, test_size=0.2, random_state=42, shuffle=True)

# --- 4. MODEL ---
model = Sequential([
    Input(shape=(MAX_SEQ_LEN, 12)),

    # Slightly simpler model for stability
    Dense(16, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    GlobalMaxPooling1D(),

    Dropout(0.2),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train
print("Starting Training...")
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))

# --- 5. EXPORT ---
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

def representative_dataset():
    for i in range(min(100, len(X_train))):
        yield [X_train[i].reshape(1, MAX_SEQ_LEN, 12).astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()

with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

print("Done! Download model.tflite and convert with xxd.")

Loading data files...
Loaded 9 real samples.
Generating synthetic data (x50)...
Total Training Samples after Augmentation: 459

COPY THIS INTO YOUR ESP32 CODE
const float MEAN_VALS[] = { 26.7362, 23.7486, 27.0604, 24.2537, 34.5040, 24.5382, 1.7974, 0.1017, 8.7762, 5.0751, 0.0020, 0.0023 };
const float STD_VALS[]  = { 3.5612, 1.4530, 3.2117, 1.7664, 5.5560, 1.6480, 4.9065, 0.6057, 6.9886, 5.2805, 0.0500, 0.0498 };

Starting Training...
Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.5530 - loss: 3.4067 - val_accuracy: 0.5435 - val_loss: 1.3607
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5702 - loss: 1.7164 - val_accuracy: 0.5326 - val_loss: 0.7876
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5367 - loss: 0.9898 - val_accuracy: 0.6304 - val_loss: 0.5273
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/s



In [93]:
model.predict(X_test[1].reshape(1, MAX_SEQ_LEN, 12))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step


array([[0.11562948]], dtype=float32)

In [95]:
# calculate accuracy
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Accuracy: 0.9347826086956522


In [96]:
!xxd -i model.tflite > model.h