In [2]:
import json
import glob
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, BatchNormalization, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
import os

In [None]:
# # Convert dataset from old format (../data/dataset/animal_*/ and ../data/dataset/human_*/)
# # to new format (data/data_label_0*.jsonl and data/data_label_1*.jsonl)
# # Each session directory becomes ONE file with all datapoints, removing timestamp

# import os
# from pathlib import Path

# # Create output directory
# os.makedirs('data', exist_ok=True)

# data_index = 0

# # For Google Colab or other environments, try multiple possible paths
# possible_paths = [
#     os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'dataset')),  # Standard relative
#     '/content/data/dataset',  # Google Colab default
#     os.path.expanduser('~/Workspace/Projects/Agronauts/firmware-sensing-l1-l2/data/dataset'),  # Full home path
#     '../data/dataset',  # Relative from notebook
# ]

# old_data_dir = None
# for path in possible_paths:
#     if os.path.exists(path):
#         old_data_dir = path
#         print(f"Found data directory: {old_data_dir}")
#         break

# if old_data_dir is None:
#     print("ERROR: Could not find data directory!")
#     print(f"Current working directory: {os.getcwd()}")
#     print("Tried paths:")
#     for p in possible_paths:
#         print(f"  - {p} (exists: {os.path.exists(p)})")
# else:
#     # Process animal directories (label=1)
#     animal_dirs = sorted([d for d in os.listdir(old_data_dir) if 'animal' in d.lower() and os.path.isdir(os.path.join(old_data_dir, d))])
#     print(f"Found {len(animal_dirs)} animal sessions")

#     for animal_dir in animal_dirs:
#         data_jsonl_path = os.path.join(old_data_dir, animal_dir, 'data.jsonl')
#         if os.path.exists(data_jsonl_path):
#             output_path = f'data/data_label_1_{data_index}.jsonl'
#             with open(data_jsonl_path, 'r') as f_in:
#                 with open(output_path, 'w') as f_out:
#                     for line in f_in:
#                         # Parse, remove timestamp, write back
#                         data = json.loads(line)
#                         data_no_timestamp = {k: v for k, v in data.items() if k != 'timestamp'}
#                         f_out.write(json.dumps(data_no_timestamp) + '\n')
#             data_index += 1

#     # Process human directories (label=0)
#     human_dirs = sorted([d for d in os.listdir(old_data_dir) if 'human' in d.lower() and os.path.isdir(os.path.join(old_data_dir, d))])
#     print(f"Found {len(human_dirs)} human sessions")

#     for human_dir in human_dirs:
#         data_jsonl_path = os.path.join(old_data_dir, human_dir, 'data.jsonl')
#         if os.path.exists(data_jsonl_path):
#             output_path = f'data/data_label_0_{data_index}.jsonl'
#             with open(data_jsonl_path, 'r') as f_in:
#                 with open(output_path, 'w') as f_out:
#                     for line in f_in:
#                         # Parse, remove timestamp, write back
#                         data = json.loads(line)
#                         data_no_timestamp = {k: v for k, v in data.items() if k != 'timestamp'}
#                         f_out.write(json.dumps(data_no_timestamp) + '\n')
#             data_index += 1

#     print(f"\nConversion complete! Created {data_index} files in data/")
#     print(f"Animal files: data_label_1_*.jsonl")
#     print(f"Human files: data_label_0_*.jsonl")

Found data directory: /Users/wanghley/Workspace/Projects/Agronauts/firmware-sensing-l1-l2/data/dataset
Found 24 animal sessions
Found 18 human sessions

Conversion complete! Created 42 files in data/
Animal files: data_label_1_*.jsonl
Human files: data_label_0_*.jsonl


In [9]:
# --- CONFIGURATION ---
DATA_DIR = "../data/"
CLASS_0_FILES = glob.glob(f"{DATA_DIR}/data_label_0*.jsonl")
CLASS_1_FILES = glob.glob(f"{DATA_DIR}/data_label_1*.jsonl")

print(f"Found {len(CLASS_0_FILES)} class 0 files and {len(CLASS_1_FILES)} class 1 files")

# Keep your length if you want, but 198 is very long for 4Hz (50 seconds).
# If your events are shorter, consider lowering this to 50.
MAX_SEQ_LEN = 198
AUGMENTATION_FACTOR = 50 # <--- NEW: Create 50 fake copies for every 1 real file

def extract_features(line_json):
    """ Turns raw JSON into a compact feature vector (12 features). """
    try:
        data = json.loads(line_json)
        feats = []

        # 1. Thermal (6 values)
        for pos in ['left', 'center', 'right']:
            pixels = np.array(data['thermal'][pos])
            feats.append(np.max(pixels))
            feats.append(np.mean(pixels))

        # 2. Radar (4 values) - Handle both old format (radar.left/right) and new format (mmWave.R1/R2)
        if 'mmWave' in data:
            # New format with mmWave
            r1_energy = data['mmWave']['R1'].get('energy', 0)
            r1_range = data['mmWave']['R1'].get('range', 0)
            r2_energy = data['mmWave']['R2'].get('energy', 0)
            r2_range = data['mmWave']['R2'].get('range', 0)
        else:
            # Old format with radar
            r1_energy = data['radar']['left'].get('energy', 0)
            r1_range = data['radar']['left'].get('range', 0)
            r2_energy = data['radar']['right'].get('energy', 0)
            r2_range = data['radar']['right'].get('range', 0)
        
        feats.append(np.log1p(r1_energy))
        feats.append(r1_range)
        feats.append(np.log1p(r2_energy))
        feats.append(r2_range)

        # 3. Mic (2 values)
        feats.append(data['mic']['left'])
        feats.append(data['mic']['right'])

        return np.array(feats, dtype=np.float32)

    except (KeyError, ValueError, json.JSONDecodeError) as e:
        return None

# --- 1. LOAD REAL DATA ---
print("Loading data files...")
real_X = []
real_y = []

# Load Class 0
for f_path in CLASS_0_FILES:
    print(f"  Loading {f_path}...")
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None]
        if len(seq) > 0:
            real_X.append(np.array(seq))
            real_y.append(0.0)
            print(f"    Loaded {len(seq)} frames")

# Load Class 1
for f_path in CLASS_1_FILES:
    print(f"  Loading {f_path}...")
    with open(f_path, 'r') as f:
        seq = [extract_features(line) for line in f]
        seq = [s for s in seq if s is not None]
        if len(seq) > 0:
            real_X.append(np.array(seq))
            real_y.append(1.0)
            print(f"    Loaded {len(seq)} frames")

if len(real_X) == 0:
    print("ERROR: No data loaded! Check your data files.")
else:
    # Pad to fixed length
    X_padded_real = pad_sequences(real_X, maxlen=MAX_SEQ_LEN, dtype='float32', padding='pre', truncating='post')
    y_real = np.array(real_y)

    print(f"Loaded {len(X_padded_real)} real samples.")

Found 23 class 0 files and 28 class 1 files
Loading data files...
  Loading ../data/data_label_0_24.jsonl...
    Loaded 38 frames
  Loading ../data/data_label_0_26.jsonl...
    Loaded 36 frames
  Loading ../data/data_label_0_39.jsonl...
    Loaded 37 frames
  Loading ../data/data_label_0_41.jsonl...
    Loaded 38 frames
  Loading ../data/data_label_0_27.jsonl...
    Loaded 37 frames
  Loading ../data/data_label_0_20251116_160914.jsonl...
    Loaded 206 frames
  Loading ../data/data_label_0_25.jsonl...
    Loaded 37 frames
  Loading ../data/data_label_0_40.jsonl...
    Loaded 39 frames
  Loading ../data/data_label_0_38.jsonl...
    Loaded 38 frames
  Loading ../data/data_label_0_36.jsonl...
    Loaded 36 frames
  Loading ../data/data_label_0_20251116_144753.jsonl...
    Loaded 203 frames
  Loading ../data/data_label_0_34.jsonl...
    Loaded 37 frames
  Loading ../data/data_label_0_20251116_143649.jsonl...
    Loaded 207 frames
  Loading ../data/data_label_0_29.jsonl...
    Loaded 36 fra

In [10]:

# --- 2. AUGMENTATION ENGINE (THE MISSING PIECE) ---
print(f"Generating synthetic data (x{AUGMENTATION_FACTOR})...")
aug_X = []
aug_y = []

for i in range(len(X_padded_real)):
    original_sample = X_padded_real[i] # Shape (198, 12)
    label = y_real[i]

    # Add the original
    aug_X.append(original_sample)
    aug_y.append(label)

    # Create clones with random noise
    for _ in range(AUGMENTATION_FACTOR):
        # Add random jitter (Gaussian noise)
        noise = np.random.normal(0, 0.05, original_sample.shape)

        # Scale slightly (simulate hotter/colder environment)
        scale = np.random.uniform(0.95, 1.05)

        new_sample = (original_sample * scale) + noise
        aug_X.append(new_sample)
        aug_y.append(label)

X_final = np.array(aug_X, dtype=np.float32)
y_final = np.array(aug_y, dtype=np.float32)

print(f"Total Training Samples after Augmentation: {len(X_final)}")

Generating synthetic data (x50)...
Total Training Samples after Augmentation: 2601
Total Training Samples after Augmentation: 2601


In [11]:

    # --- 3. NORMALIZE ---
    # Calculate stats on the AUGMENTED data
    X_flat = X_final.reshape(-1, 12)
    mean_vals = np.mean(X_flat, axis=0)
    std_vals = np.std(X_flat, axis=0) + 0.0001

    X_norm = (X_final - mean_vals) / std_vals

    # Print for ESP32
    print("\n" + "="*40)
    print("COPY THIS INTO YOUR ESP32 CODE")
    print("="*40)
    print(f"const float MEAN_VALS[] = {{ {', '.join([f'{x:.4f}' for x in mean_vals])} }};")
    print(f"const float STD_VALS[]  = {{ {', '.join([f'{x:.4f}' for x in std_vals])} }};")
    print("="*40 + "\n")


COPY THIS INTO YOUR ESP32 CODE
const float MEAN_VALS[] = { 7.5716, 6.6067, 7.6019, 6.6546, 8.8731, 6.7688, 0.3585, 0.0220, 2.4497, 1.4558, 0.0004, 0.0005 };
const float STD_VALS[]  = { 11.5571, 10.4781, 11.6282, 10.5532, 14.0965, 10.7311, 2.3099, 0.2908, 5.4092, 3.7156, 0.0496, 0.0496 };



In [13]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y_final, test_size=0.2, random_state=42, shuffle=True)

# --- 4. MODEL ---
model = Sequential([
    Input(shape=(MAX_SEQ_LEN, 12)),

    # Slightly simpler model for stability
    Dense(16, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    GlobalMaxPooling1D(),

    Dropout(0.2),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [15]:
# Train
print("Starting Training...")
model.fit(X_train, y_train, epochs=15, batch_size=16, validation_data=(X_test, y_test))

Starting Training...
Epoch 1/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8058 - loss: 0.4808 - val_accuracy: 0.9002 - val_loss: 0.3743
Epoch 2/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8058 - loss: 0.4808 - val_accuracy: 0.9002 - val_loss: 0.3743
Epoch 2/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8197 - loss: 0.4380 - val_accuracy: 0.8906 - val_loss: 0.3665
Epoch 3/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8197 - loss: 0.4380 - val_accuracy: 0.8906 - val_loss: 0.3665
Epoch 3/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8293 - loss: 0.4246 - val_accuracy: 0.8906 - val_loss: 0.3574
Epoch 4/15
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8293 - loss: 0.4246 - val_accuracy: 0.8906 - val_loss: 0.3574
Epoch 4

<keras.src.callbacks.history.History at 0x152aaa560>

In [16]:
# --- 5. EXPORT ---
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

def representative_dataset():
    for i in range(min(100, len(X_train))):
        yield [X_train[i].reshape(1, MAX_SEQ_LEN, 12).astype(np.float32)]

converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_model = converter.convert()

with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

print("Done! Download model.tflite and convert with xxd.")

INFO:tensorflow:Assets written to: /var/folders/r5/m5977dy13hd_9klxgsmylm740000gn/T/tmpd8glf5rv/assets


INFO:tensorflow:Assets written to: /var/folders/r5/m5977dy13hd_9klxgsmylm740000gn/T/tmpd8glf5rv/assets


Saved artifact at '/var/folders/r5/m5977dy13hd_9klxgsmylm740000gn/T/tmpd8glf5rv'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 198, 12), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  5681495056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681484320: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681488720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681494880: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681488896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681493296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681491888: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681795264: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681492592: TensorSpec(shape=(), dtype=tf.resource, name=None)
  5681791392: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1763655801.226263  633305 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1763655801.226276  633305 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-11-20 11:23:21.226646: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /var/folders/r5/m5977dy13hd_9klxgsmylm740000gn/T/tmpd8glf5rv
2025-11-20 11:23:21.227269: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-11-20 11:23:21.227277: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /var/folders/r5/m5977dy13hd_9klxgsmylm740000gn/T/tmpd8glf5rv
I0000 00:00:1763655801.232449  633305 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled
2025-11-20 11:23:21.233389: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-11-20 11:23:21.268762: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /var/folder

Done! Download model.tflite and convert with xxd.


fully_quantize: 0, inference_type: 6, input_inference_type: INT8, output_inference_type: INT8
2025-11-20 11:23:21.880187: W tensorflow/compiler/mlir/lite/flatbuffer_export.cc:3705] Skipping runtime version metadata in the model. This will be generated by the exporter.


In [17]:
!xxd -i model.tflite > model.h