In [19]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import ast
import time
from scipy.fftpack import fft, fftfreq
from scipy.signal import spectrogram
import io, base64
from PIL import Image
import cv2
from ultralytics import YOLO

In [20]:
# Cell 2: Audio Transformation

def compute_fft(samples, sampling_rate=8000):
    """Compute the one-sided magnitude spectrum of one frame of real samples.

    Args:
        samples: Sequence of real-valued audio samples for a single frame.
        sampling_rate: Sampling rate used to scale the frequency axis
            (the bin spacing is ``sampling_rate / len(samples)``).

    Returns:
        dict with two equal-length NumPy arrays:
            ``'freqs'``      - frequency of each real-FFT bin,
            ``'magnitudes'`` - absolute value of each real-FFT coefficient.
        Both arrays are empty when ``samples`` is empty.
    """
    samples = np.asarray(samples)
    n = len(samples)
    if n == 0:
        # np.fft.rfftfreq(0, ...) divides by zero and np.fft.rfft rejects a
        # zero-length input, so an empty frame is answered with an empty
        # spectrum instead of an exception.
        return {'freqs': np.array([], dtype=float), 'magnitudes': np.array([], dtype=float)}
    freqs = np.fft.rfftfreq(n, d=1 / sampling_rate)
    magnitudes = np.abs(np.fft.rfft(samples))
    return {'freqs': freqs, 'magnitudes': magnitudes}


def classify_voice_or_noise(freqs, magnitudes, voice_freq_range=(500, 3500), energy_threshold=15000000):
    """Label a spectrum 'Voice' or 'Noise' from the magnitude summed over a band.

    Args:
        freqs: Frequency of each spectrum bin.
        magnitudes: Magnitude of each spectrum bin (same length as ``freqs``).
        voice_freq_range: ``(low, high)`` inclusive band whose magnitudes are summed.
        energy_threshold: The summed magnitude must be strictly greater than
            this value for the spectrum to count as 'Voice'.

    Returns:
        'Voice' or 'Noise'.
    """
    bin_freqs = np.array(freqs)
    bin_mags = np.array(magnitudes)
    band_low = voice_freq_range[0]
    band_high = voice_freq_range[1]
    in_band = (bin_freqs >= band_low) & (bin_freqs <= band_high)
    band_energy = bin_mags[in_band].sum()
    if band_energy > energy_threshold:
        return 'Voice'
    return 'Noise'


def detect_cat_voice(classification, freqs, magnitudes, freq_range=(400, 700), harmonic_range=(200, 1000), harmonic_threshold=15):
    """Flag a 'Voice' frame as cat voice from a band-sum / band-peak ratio.

    Args:
        classification: Frame label; anything other than 'Voice' yields 0.
        freqs: Frequency of each spectrum bin.
        magnitudes: Magnitude of each spectrum bin (same length as ``freqs``).
        freq_range: ``(low, high)`` inclusive band whose maximum magnitude is
            used as the reference peak.
        harmonic_range: ``(low, high)`` inclusive band whose magnitudes are summed.
        harmonic_threshold: The frame is flagged when
            ``sum(harmonic band) / peak`` is strictly below this value.

    Returns:
        1 when the frame is flagged as cat voice, otherwise 0. Frames with no
        bin inside ``freq_range`` or with a zero reference peak return 0.
    """
    if classification != 'Voice':
        return 0
    freqs = np.array(freqs)
    magnitudes = np.array(magnitudes)
    primary = magnitudes[(freqs >= freq_range[0]) & (freqs <= freq_range[1])]
    if primary.size == 0:
        return 0
    peak = primary.max()
    if peak == 0:
        # A zero peak makes the ratio below undefined (0/0 or x/0); answer 0
        # explicitly instead of dividing by zero and relying on nan/inf.
        return 0
    harmonic = magnitudes[(freqs >= harmonic_range[0]) & (freqs <= harmonic_range[1])].sum()
    return 1 if (harmonic / peak) < harmonic_threshold else 0


def detect_human_voice(classification, freqs, magnitudes, freq_range=(150, 600), harmonic_range=(2000, 4000), harmonic_threshold=15):
    """Flag a 'Voice' frame as human voice from band energy and harmonic content.

    Args:
        classification: Frame label; anything other than 'Voice' yields 0.
        freqs: Frequency of each spectrum bin.
        magnitudes: Magnitude of each spectrum bin (same length as ``freqs``).
        freq_range: ``(low, high)`` inclusive band used for the normalised energy.
        harmonic_range: ``(low, high)`` inclusive band whose magnitudes are summed.
        harmonic_threshold: The summed harmonic-band magnitude must be strictly
            greater than this value.

    Returns:
        1 when both conditions hold - mean squared magnitude of the primary
        band divided by the largest squared magnitude of the whole spectrum is
        above 0.1, and the harmonic-band sum is above ``harmonic_threshold`` -
        otherwise 0. Frames with no bin inside ``freq_range`` or with an
        all-zero spectrum return 0.
    """
    if classification != 'Voice':
        return 0
    freqs = np.array(freqs)
    magnitudes = np.array(magnitudes)
    primary = magnitudes[(freqs >= freq_range[0]) & (freqs <= freq_range[1])]
    if primary.size == 0:
        return 0
    peak_power = (magnitudes ** 2).max()
    if peak_power == 0:
        # An all-zero spectrum would make the normalisation below a 0/0
        # division; there is no energy at all, so this is not a voice frame.
        return 0
    norm_energy = (primary ** 2).mean() / peak_power
    harmonic = magnitudes[(freqs >= harmonic_range[0]) & (freqs <= harmonic_range[1])].sum()
    return 1 if (norm_energy > 0.1 and harmonic > harmonic_threshold) else 0


def calculate_meow_loudness(is_cat, magnitudes):
    """Bucket the mean spectrum magnitude of a cat-voice frame into a loudness label.

    Args:
        is_cat: Cat-voice flag; any value other than 1 yields 'none'.
        magnitudes: Magnitude of each spectrum bin.

    Returns:
        'none' when the frame is not a cat-voice frame; otherwise 'low' for a
        mean below 40000, 'medium' for a mean up to and including 60000, and
        'high' above that.
    """
    if is_cat == 1:
        mean_magnitude = np.array(magnitudes).mean()
        if mean_magnitude < 40000:
            label = 'low'
        elif mean_magnitude <= 60000:
            label = 'medium'
        else:
            label = 'high'
        return label
    return 'none'


def calculate_dominant_frequency(freqs, magnitudes):
    """Return the frequency of the bin with the largest magnitude.

    Args:
        freqs: Frequency of each spectrum bin.
        magnitudes: Magnitude of each spectrum bin.

    Returns:
        The entry of ``freqs`` at the position of the first maximum of
        ``magnitudes``, or 0 when either input is empty.
    """
    bin_freqs = np.array(freqs)
    bin_mags = np.array(magnitudes)
    if bin_freqs.size != 0 and bin_mags.size != 0:
        strongest = np.argmax(bin_mags)
        return bin_freqs[strongest]
    return 0


def transform_audio(path):
    """Build the per-frame audio feature table from a staged audio CSV.

    The CSV is read with pandas; its ``audio_samples`` column is parsed with
    ``ast.literal_eval`` and each parsed value is passed through
    ``compute_fft``. The spectrum then drives the classification, cat / human
    voice flags, meow loudness and dominant frequency.

    Args:
        path: Location of the CSV; it must provide ``frame_id``, ``timestamp``
            and ``audio_samples`` columns.

    Returns:
        DataFrame with columns ``frame_id``, ``timestamp``, ``fft_result``,
        ``classification``, ``is_cat_voice``, ``is_human_voice``,
        ``meow_loudness`` and ``dominant_frequency``.
    """
    staged = pd.read_csv(path)
    staged['audio_samples'] = staged['audio_samples'].apply(ast.literal_eval)
    spectra = staged['audio_samples'].apply(compute_fft)

    def _classify(spectrum):
        return classify_voice_or_noise(spectrum['freqs'], spectrum['magnitudes'])

    def _cat_flag(row):
        spectrum = row['fft_result']
        return detect_cat_voice(row['classification'], spectrum['freqs'], spectrum['magnitudes'])

    def _human_flag(row):
        spectrum = row['fft_result']
        return detect_human_voice(row['classification'], spectrum['freqs'], spectrum['magnitudes'])

    def _loudness(row):
        return calculate_meow_loudness(row['is_cat_voice'], row['fft_result']['magnitudes'])

    def _dominant(spectrum):
        return calculate_dominant_frequency(spectrum['freqs'], spectrum['magnitudes'])

    # Columns are added in dependency order: the voice flags read the
    # classification, and the loudness reads the cat-voice flag.
    table = staged[['frame_id', 'timestamp']].copy()
    table['fft_result'] = spectra
    table['classification'] = spectra.apply(_classify)
    table['is_cat_voice'] = table.apply(_cat_flag, axis=1)
    table['is_human_voice'] = table.apply(_human_flag, axis=1)
    table['meow_loudness'] = table.apply(_loudness, axis=1)
    table['dominant_frequency'] = spectra.apply(_dominant)
    return table

# Run the audio transformation on the staged audio CSV; the resulting feature
# table is reused by the mart-assembly cell further down.
trans_audio_features = transform_audio('stg_audio_data.csv')

In [21]:
# Preview the first rows of the audio feature table.
trans_audio_features.head()

Unnamed: 0,frame_id,timestamp,fft_result,classification,is_cat_voice,is_human_voice,meow_loudness,dominant_frequency
0,0,2025-07-23 11:20:32.897810,"{'freqs': [0.0, 4.0, 8.0, 12.0, 16.0, 20.0, 24...",Noise,0,0,none,0.0
1,1,2025-07-23 11:20:32.922406,"{'freqs': [0.0, 4.0, 8.0, 12.0, 16.0, 20.0, 24...",Noise,0,0,none,0.0
2,2,2025-07-23 11:20:32.960113,"{'freqs': [0.0, 4.0, 8.0, 12.0, 16.0, 20.0, 24...",Noise,0,0,none,0.0
3,3,2025-07-23 11:20:32.981657,"{'freqs': [0.0, 4.0, 8.0, 12.0, 16.0, 20.0, 24...",Noise,0,0,none,0.0
4,4,2025-07-23 11:20:33.047383,"{'freqs': [0.0, 4.0, 8.0, 12.0, 16.0, 20.0, 24...",Noise,0,0,none,0.0


In [22]:
# Cell 3: IMU Transformation

def unwrap_yaw(yaw_list):
    """Remove wrap-around jumps from a sequence of yaw angles given in degrees.

    The values are converted to radians, passed through ``np.unwrap`` and
    converted back to degrees.

    Args:
        yaw_list: Sequence of yaw angles in degrees.

    Returns:
        NumPy float array of the unwrapped angles in degrees.
    """
    yaw_deg = np.array(yaw_list, dtype=float)
    yaw_rad = np.radians(yaw_deg)
    continuous_rad = np.unwrap(yaw_rad)
    return np.degrees(continuous_rad)


def avg_intra_yaw_diff(yaw_list):
    """Mean step between consecutive unwrapped yaw readings of one frame.

    Args:
        yaw_list: Sequence of yaw angles in degrees.

    Returns:
        Mean of the successive differences of the unwrapped angles, or 0.0
        when there are fewer than two readings.
    """
    unwrapped = unwrap_yaw(yaw_list)
    if unwrapped.size <= 1:
        return 0.0
    steps = np.diff(unwrapped)
    return steps.mean()


def compute_rotation_speed(yaw_list, prev_avg):
    """Compare this frame's average yaw step with the previous frame's.

    Args:
        yaw_list: Sequence of yaw angles in degrees for the current frame.
        prev_avg: Average yaw step of the previous frame, or None when there
            is no previous frame.

    Returns:
        Tuple ``(abs(change), current_average, change)`` where ``change`` is
        the current average minus ``prev_avg`` (0.0 when ``prev_avg`` is None).
    """
    current_avg = avg_intra_yaw_diff(yaw_list)
    change = 0.0 if prev_avg is None else current_avg - prev_avg
    return abs(change), current_avg, change


def compute_movement_intensity(dy, dp, dr):
    """Euclidean norm of the three per-frame deltas.

    Args:
        dy: Yaw delta.
        dp: Pitch delta.
        dr: Roll delta.

    Returns:
        ``sqrt(dy**2 + dp**2 + dr**2)``.
    """
    sum_of_squares = dy ** 2 + dp ** 2 + dr ** 2
    return np.sqrt(sum_of_squares)


def compute_balance_state(pitch, roll, intensity, max_tilt=15, max_intensity=0.3):
    """Decide whether a frame counts as balanced.

    Args:
        pitch: Pitch reading of the frame.
        roll: Roll reading of the frame.
        intensity: Movement intensity of the frame.
        max_tilt: Exclusive upper bound applied to both ``abs(pitch)`` and
            ``abs(roll)``. Defaults to the previously hard-coded 15.
        max_intensity: Exclusive upper bound on ``intensity``. Defaults to the
            previously hard-coded 0.3.

    Returns:
        Truthy when pitch and roll are both within ``max_tilt`` and the
        intensity is below ``max_intensity``; falsy otherwise.
    """
    return abs(pitch) < max_tilt and abs(roll) < max_tilt and intensity < max_intensity


def compute_cat_interaction(intensity, threshold=10):
    """Flag a frame as a cat interaction when its movement intensity is high.

    Args:
        intensity: Movement intensity of the frame.
        threshold: Exclusive lower bound on ``intensity``. Defaults to the
            previously hard-coded 10.

    Returns:
        True when ``intensity`` is strictly greater than ``threshold``.
    """
    return intensity > threshold


def compute_is_rest(intensity, threshold=1):
    """Flag a frame as resting when its movement intensity is low.

    Args:
        intensity: Movement intensity of the frame.
        threshold: Exclusive upper bound on ``intensity``. Defaults to the
            previously hard-coded 1.

    Returns:
        True when ``intensity`` is strictly less than ``threshold``.
    """
    return intensity < threshold


def process_frame(row, state):
    """Derive the IMU features of one frame and roll the running state forward.

    Args:
        row: Mapping-like frame record providing ``frame_id``, ``timestamp``,
            ``yaw`` (sequence of readings), ``pitch`` and ``roll``.
        state: Mutable dict holding ``prev_avg_yaw``, ``prev_pitch`` and
            ``prev_roll`` from the previous frame (None / missing on the first
            frame). It is updated in place with the current frame's values.

    Returns:
        dict with ``frame_id``, ``timestamp``, ``rotation_speed``,
        ``movement_intensity``, ``balance_state``,
        ``cat_interaction_detected``, ``is_rest``, ``delta_yaw``,
        ``delta_pitch`` and ``delta_roll``.
    """
    rotation, yaw_avg, yaw_delta = compute_rotation_speed(row['yaw'], state.get('prev_avg_yaw'))
    pitch = row['pitch']
    roll = row['roll']

    # Deltas against the previous frame; the first frame has nothing to
    # compare with and therefore reports 0.0.
    last_pitch = state.get('prev_pitch')
    pitch_delta = 0.0 if last_pitch is None else pitch - last_pitch
    last_roll = state.get('prev_roll')
    roll_delta = 0.0 if last_roll is None else roll - last_roll

    intensity = compute_movement_intensity(yaw_delta, pitch_delta, roll_delta)
    balanced = compute_balance_state(pitch, roll, intensity)
    interacting = compute_cat_interaction(intensity)
    resting = compute_is_rest(intensity)

    # Remember this frame so the next call can compute its deltas.
    state['prev_avg_yaw'] = yaw_avg
    state['prev_pitch'] = pitch
    state['prev_roll'] = roll

    features = {
        'frame_id': row['frame_id'],
        'timestamp': row['timestamp'],
        'rotation_speed': rotation,
        'movement_intensity': intensity,
        'balance_state': balanced,
        'cat_interaction_detected': interacting,
        'is_rest': resting,
        'delta_yaw': yaw_delta,
        'delta_pitch': pitch_delta,
        'delta_roll': roll_delta,
    }
    return features


def process_imu_live(df):
    """Run ``process_frame`` over every row of ``df`` in order.

    A single state dict is shared across the rows, so each frame is compared
    with the one processed immediately before it.

    Args:
        df: DataFrame whose rows carry the fields ``process_frame`` reads.

    Returns:
        DataFrame with one row of features per input row.
    """
    running_state = {'prev_avg_yaw': None, 'prev_pitch': None, 'prev_roll': None}
    per_frame = [process_frame(frame, running_state) for _, frame in df.iterrows()]
    return pd.DataFrame(per_frame)


def transform_imu(path):
    """Build the per-frame IMU feature table from a staged IMU CSV.

    The ``yaw``, ``pitch`` and ``roll`` columns are parsed with
    ``ast.literal_eval``. ``yaw`` keeps the parsed value as is; for ``pitch``
    and ``roll`` a parsed list is reduced to its first element.

    Args:
        path: Location of the CSV; it must provide ``frame_id``,
            ``timestamp``, ``yaw``, ``pitch`` and ``roll`` columns.

    Returns:
        DataFrame produced by ``process_imu_live`` for the prepared frames.
    """
    def _first_if_list(value):
        return value[0] if isinstance(value, list) else value

    staged = pd.read_csv(path)
    staged['yaw'] = staged['yaw'].apply(ast.literal_eval)
    for angle in ('pitch', 'roll'):
        staged[angle] = staged[angle].apply(ast.literal_eval).apply(_first_if_list)
    prepared = staged[['frame_id', 'timestamp', 'yaw', 'pitch', 'roll']]
    return process_imu_live(prepared)

# Run the IMU transformation on the staged IMU CSV; the resulting feature
# table is reused by the mart-assembly cell further down.
trans_imu_features = transform_imu('stg_imu_data.csv')

In [23]:
# Preview the first rows of the IMU feature table.
trans_imu_features.head()

Unnamed: 0,frame_id,timestamp,rotation_speed,movement_intensity,balance_state,cat_interaction_detected,is_rest,delta_yaw,delta_pitch,delta_roll
0,0,2025-07-23 11:20:32.905018,0.0,0.0,True,False,True,0.0,0.0,0.0
1,1,2025-07-23 11:20:33.111430,0.0,0.420476,False,False,True,0.0,0.18,-0.38
2,2,2025-07-23 11:20:33.324075,0.0075,0.380206,False,False,True,-0.0075,-0.38,-0.01
3,3,2025-07-23 11:20:33.540590,0.0025,0.164031,True,False,True,-0.0025,0.13,0.1
4,4,2025-07-23 11:20:33.754385,0.0075,0.145795,True,False,True,0.0075,0.04,0.14


In [24]:
# Cell 4: Visual Transformation

def load_staging_csv(path):
    """Read a staging CSV, keeping the ``frame_data`` column as raw text.

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame as parsed by ``pd.read_csv`` with ``frame_data`` converted
        through ``str``.
    """
    keep_as_text = {'frame_data': str}
    return pd.read_csv(path, converters=keep_as_text)

def jpeg_b64_to_rgb_ndarray(b64, img_size=640):
    """Decode a base64-encoded image into a square RGB array.

    Args:
        b64: Base64 text of the encoded image bytes.
        img_size: Side length, in pixels, of the square the image is resized to.

    Returns:
        NumPy array of the image after conversion to RGB and resizing to
        ``(img_size, img_size)`` with Lanczos resampling.
    """
    encoded_bytes = base64.b64decode(b64)
    target_size = (img_size, img_size)
    with Image.open(io.BytesIO(encoded_bytes)) as picture:
        rgb_picture = picture.convert('RGB')
        resized = rgb_picture.resize(target_size, Image.LANCZOS)
        return np.array(resized)

def transform_visual(path, img_size=640, conf_thr=0.05, device='cpu'):
    """Run the YOLO detector over every staged frame and tabulate the results.

    The 'yolov8n.pt' model is loaded once per call, moved to ``device`` and
    fused; its overrides restrict predictions to class index 15 at
    confidence ``conf_thr``. Every returned box is labelled 'cat'.

    Args:
        path: Location of the staging CSV; rows must provide ``frame_id``,
            ``timestamp`` and base64 ``frame_data``.
        img_size: Side length the frames are resized to and the ``imgsz``
            passed to the model.
        conf_thr: Confidence value written to the model overrides.
        device: Device the model is moved to.

    Returns:
        DataFrame with one row per frame: ``frame_id``, ``timestamp``,
        ``is_cat_detected`` (0/1), ``cat_confidence`` (highest box confidence,
        0.0 without boxes), ``inference_time`` (the result's
        ``speed['inference']`` entry, or None when the result has no ``speed``
        attribute) and ``raw_detection`` (list of per-box dicts).
    """
    detector = YOLO('yolov8n.pt').to(device)
    detector.fuse()
    detector.overrides['conf'] = conf_thr
    detector.overrides['classes'] = [15]

    staged = load_staging_csv(path)
    records = []
    for _, frame in staged.iterrows():
        pixels = jpeg_b64_to_rgb_ndarray(frame['frame_data'], img_size)
        picture = Image.fromarray(pixels)
        result = detector(picture, imgsz=img_size, verbose=False)[0]
        found = result.boxes.cpu()
        corners = found.xyxy
        detections = pd.DataFrame({
            'xmin': corners[:, 0].numpy(),
            'ymin': corners[:, 1].numpy(),
            'xmax': corners[:, 2].numpy(),
            'ymax': corners[:, 3].numpy(),
            'confidence': found.conf.numpy(),
            'class': found.cls.numpy().astype(int),
            'name': ['cat'] * len(found),
        })
        record = {
            'frame_id': int(frame['frame_id']),
            'timestamp': frame['timestamp'],
            'is_cat_detected': int(len(detections) > 0),
            'cat_confidence': float(detections['confidence'].max()) if len(detections) else 0.0,
            'inference_time': result.speed['inference'] if hasattr(result, 'speed') else None,
            'raw_detection': detections.to_dict('records'),
        }
        records.append(record)
    return pd.DataFrame(records)

# Run the visual transformation on the staged frame CSV; the resulting
# detection table is reused by the mart-assembly cell further down.
trans_visual_cat_detection = transform_visual('stg_visual_data.csv')

YOLOv8n summary (fused): 72 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs


In [25]:
# Preview the first rows of the visual detection table.
trans_visual_cat_detection.head()

Unnamed: 0,frame_id,timestamp,is_cat_detected,cat_confidence,inference_time,raw_detection
0,0,2025-07-23 11:20:33.288155,0,0.0,64.26375,[]
1,1,2025-07-23 11:20:33.531604,0,0.0,51.638375,[]
2,2,2025-07-23 11:20:33.755833,0,0.0,52.656291,[]
3,3,2025-07-23 11:20:33.992241,0,0.0,53.889541,[]
4,4,2025-07-23 11:20:34.212651,0,0.0,43.873541,[]


In [26]:
# Cell 5: Mart Layer Assembly

def _first_cat_center(detections):
    """Return the box centre ``(x, y)`` of the first 'cat' detection, else ``(nan, nan)``."""
    for det in detections:
        if det.get('name') == 'cat':
            center_x = (det['xmin'] + det['xmax']) / 2
            center_y = (det['ymin'] + det['ymax']) / 2
            return center_x, center_y
    return np.nan, np.nan


def _axis_shift(current, previous):
    """Return ``current - previous`` when both are finite, else 0."""
    if np.isfinite(current) and np.isfinite(previous):
        return current - previous
    return 0


def _direction_label(dx, dy):
    """Map a centre shift to a direction label; the horizontal shift takes precedence."""
    if dx < -2:
        return 'left'
    if dx > 2:
        return 'right'
    if dy < -2:
        return 'towards'
    if dy > 2:
        return 'away'
    return 'stationary'


def _distance_label(dy):
    """Map the vertical centre shift to a distance-change label."""
    if dy < -2:
        return 'closer'
    if dy > 2:
        return 'farther'
    return 'no_change'


def build_mrt_experiences(aud_df, imu_df, vis_df, N_FRAMES=12):
    """Assemble one 'experience' row per visual frame from trailing windows.

    For every distinct ``frame_id`` in ``vis_df`` (ascending), the last
    ``N_FRAMES`` rows with ``frame_id`` up to and including that id are taken
    from each of the three tables. When the visual window holds fewer than
    ``N_FRAMES`` rows, a placeholder row is emitted whose fields other than
    ``experience_id`` are NaN.

    Args:
        aud_df: Audio features with ``frame_id``, ``is_cat_voice``,
            ``is_human_voice`` and ``meow_loudness``.
        imu_df: IMU features with ``frame_id`` and, optionally,
            ``movement_intensity`` and ``cat_interaction_detected``.
        vis_df: Visual detections with ``frame_id``, ``timestamp`` and
            ``raw_detection`` (list of dicts with ``name`` and box corners).
        N_FRAMES: Window length in rows.

    Returns:
        DataFrame with one row per distinct visual ``frame_id``.
    """
    nan_fields = (
        'last_experience_id_array', 'timestamp', 'is_cat_voice', 'is_human_voice',
        'human_voice_sequence', 'cat_voice_sequence', 'meow_loudness', 'cat_detected',
        'cat_position_x', 'cat_position_y', 'cat_movement_direction',
        'cat_activity_level', 'cat_distance_change', 'movement_intensity',
        'cat_interaction_detected',
    )
    records = []
    for frame in sorted(vis_df['frame_id'].unique()):
        audio_win = aud_df[aud_df['frame_id'] <= frame].tail(N_FRAMES)
        imu_win = imu_df[imu_df['frame_id'] <= frame].tail(N_FRAMES)
        visual_win = vis_df[vis_df['frame_id'] <= frame].tail(N_FRAMES)

        if len(visual_win) < N_FRAMES:
            # Not enough history yet: keep the id, blank out everything else.
            placeholder = {'experience_id': frame}
            placeholder.update(dict.fromkeys(nan_fields, np.nan))
            records.append(placeholder)
            continue

        # Audio: which frames of the window carried cat / human voice.
        cat_mask = audio_win['is_cat_voice'].fillna(False).astype(bool)
        human_mask = audio_win['is_human_voice'].fillna(False).astype(bool)
        human_frames = audio_win.loc[human_mask, 'frame_id'].tolist()
        cat_frames = audio_win.loc[cat_mask, 'frame_id'].tolist()
        loudness = audio_win['meow_loudness']
        typical_loudness = np.nan if loudness.empty else loudness.mode().iloc[0]

        # IMU: window average intensity and whether any frame flagged interaction.
        if 'movement_intensity' in imu_win:
            mean_intensity = imu_win['movement_intensity'].mean()
        else:
            mean_intensity = np.nan
        if 'cat_interaction_detected' in imu_win:
            interaction = bool(imu_win['cat_interaction_detected'].any())
        else:
            interaction = False

        # Visual: cat presence anywhere in the window, position in the newest
        # frame, and shift relative to the frame just before it.
        newest = visual_win.iloc[-1]
        seen_cat = any(
            any(det.get('name') == 'cat' for det in dets)
            for dets in visual_win['raw_detection']
        )
        cat_x, cat_y = _first_cat_center(newest['raw_detection'])
        if len(visual_win) > 1:
            prev_x, prev_y = _first_cat_center(visual_win.iloc[-2]['raw_detection'])
        else:
            prev_x, prev_y = np.nan, np.nan
        dx = _axis_shift(cat_x, prev_x)
        dy = _axis_shift(cat_y, prev_y)
        activity = 'moving' if max(abs(dx), abs(dy)) > 3 else 'still'

        records.append({
            'experience_id': frame,
            'last_experience_id_array': visual_win['frame_id'].tolist(),
            'timestamp': newest['timestamp'],
            'is_cat_voice': bool(cat_mask.any()),
            'is_human_voice': bool(human_mask.any()),
            'human_voice_sequence': human_frames,
            'cat_voice_sequence': cat_frames,
            'meow_loudness': typical_loudness,
            'cat_detected': seen_cat,
            'cat_position_x': cat_x,
            'cat_position_y': cat_y,
            'cat_movement_direction': _direction_label(dx, dy),
            'cat_activity_level': activity,
            'cat_distance_change': _distance_label(dy),
            'movement_intensity': mean_intensity,
            'cat_interaction_detected': interaction,
        })
    return pd.DataFrame(records)

In [27]:
# Assemble the mart table from the three feature tables built in the earlier
# cells, then display it as the cell output.
mrt_experiences = build_mrt_experiences(trans_audio_features, trans_imu_features, trans_visual_cat_detection)
mrt_experiences

Unnamed: 0,experience_id,last_experience_id_array,timestamp,is_cat_voice,is_human_voice,human_voice_sequence,cat_voice_sequence,meow_loudness,cat_detected,cat_position_x,cat_position_y,cat_movement_direction,cat_activity_level,cat_distance_change,movement_intensity,cat_interaction_detected
0,0,,,,,,,,,,,,,,,
1,1,,,,,,,,,,,,,,,
2,2,,,,,,,,,,,,,,,
3,3,,,,,,,,,,,,,,,
4,4,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,"[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77]",2025-07-23 11:20:51.981008,False,True,[73],[],none,False,,,stationary,still,no_change,1.550708,False
78,78,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78]",2025-07-23 11:20:52.195276,False,True,[73],[],none,False,,,stationary,still,no_change,1.391877,False
79,79,"[68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]",2025-07-23 11:20:52.417412,False,True,[73],[],none,False,,,stationary,still,no_change,1.347606,False
80,80,"[69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80]",2025-07-23 11:20:52.656378,False,True,[73],[],none,False,,,stationary,still,no_change,1.241126,False


In [28]:
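# [REDACTED] This cell held what appears to be a GitHub personal access token; it was removed. Revoke that token and load credentials from the environment (e.g. os.environ) instead of storing them in the notebook.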