# Process original dataset

In [45]:
import os
import re
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta

In [None]:
def find_all_dirs(root_dir):
    person_dirs = []
    for dirpath, dirnames, _ in os.walk(root_dir):
        for dirname in dirnames:
            if dirname.startswith("person"):
                person_dirs.append(os.path.join(dirpath, dirname))
    return sorted(person_dirs)

def get_meta(person_dir: str):
    person_id = os.path.basename(person_dir)[-3:]
    meta_path = f"/fast/dataset/baby-motion/dataset/00_sequence/person{person_id}/{person_id}_meta.csv"
    gender = ""
    age = None
    try:
        meta_df = pd.read_csv(meta_path, header=None, names=["key", "value"], encoding='utf-8-sig')
        meta_df["key"] = meta_df["key"].str.strip().str.lower()
        meta_df["value"] = meta_df["value"].astype(str).str.strip()
        data = dict(zip(meta_df["key"], meta_df["value"]))
        gender = data.get("gender", "").lower()
        age_match = re.search(r'\d+', data.get("age", ""))
        if age_match:
            age = int(age_match.group())
    except Exception as e:
        print(f"[Warning] Failed to read meta file: {e}")
    return person_id, gender, age

def get_label_in_csv(person_dir: str):
    labels = []
    person_id, gender, age = get_meta(person_dir)
    label_path = os.path.join(person_dir, f"{person_id}_label.csv")
    data_path = os.path.join(person_dir, f"{person_id}.csv")
    try:
        df = pd.read_csv(label_path, header=None)
    except:
        return [], data_path
    for _, row in df.iterrows():
        try:
            if ' ' in row[0]:  # (datetime_start, datetime_end, action)
                t0 = datetime.strptime(row[0], "%Y/%m/%d %H:%M:%S")
                t1 = datetime.strptime(row[1], "%Y/%m/%d %H:%M:%S")
                action = row[2]
            else:  # (date, t_start, t_end, action)
                t0 = datetime.strptime(row[0] + " " + row[1], "%Y/%m/%d %H:%M:%S")
                t1 = datetime.strptime(row[0] + " " + row[2], "%Y/%m/%d %H:%M:%S")
                action = row[3]
            if not action:
                continue
            label = {
                "person_id": person_id,
                "gender": gender,
                "age": age,
                "dt_start": t0.strftime("%Y-%m-%d %H:%M:%S"),
                "dt_end": t1.strftime("%Y-%m-%d %H:%M:%S"),
                "dur": (t1 - t0).total_seconds(),
                "action": action
            }
            labels.append(label)
        except:
            continue
    return labels, data_path

def get_label_in_filename(person_dir: str, max_gap_seconds: int = 60):
    person_id, gender, age = get_meta(person_dir)
    data_path = os.path.join(person_dir, f"{person_id}.csv")
    action = os.path.basename(os.path.dirname(person_dir))
    labels = []
    try:
        df = pd.read_csv(data_path, encoding="utf-8-sig")
        time_series = df.iloc[:, 0].dropna().astype(str)
        timestamps = [datetime.strptime(ts, "%Y-%m-%d %H:%M:%S.%f") for ts in time_series]
        start_idx = 0
        for i in range(1, len(timestamps)):
            gap = (timestamps[i] - timestamps[i - 1]).total_seconds()
            if gap > max_gap_seconds:
                t_start = timestamps[start_idx].replace(microsecond=0)
                t_end = timestamps[i - 1]
                if t_end.microsecond > 0:
                    t_end = t_end.replace(microsecond=0) + timedelta(seconds=1)
                else:
                    t_end = t_end.replace(microsecond=0)
                labels.append({
                    "person_id": person_id,
                    "gender": gender,
                    "age": age,
                    "dt_start": t_start.strftime("%Y-%m-%d %H:%M:%S"),
                    "dt_end": t_end.strftime("%Y-%m-%d %H:%M:%S"),
                    "dur": (t_end - t_start).total_seconds(),
                    "action": action,
                })
                start_idx = i
        t_start = timestamps[start_idx].replace(microsecond=0)
        t_end = timestamps[-1]
        if t_end.microsecond > 0:
            t_end = t_end.replace(microsecond=0) + timedelta(seconds=1)
        else:
            t_end = t_end.replace(microsecond=0)
        if (t_end - t_start).total_seconds() > 1:
            labels.append({
                # "person_id": person_id,
                "gender": gender,
                "age": age,
                # "dt_start": t_start.strftime("%Y-%m-%d %H:%M:%S"),
                # "dt_end": t_end.strftime("%Y-%m-%d %H:%M:%S"),
                "dur": (t_end - t_start).total_seconds(),
                "action": action,
            })
        return labels, data_path
    except Exception as e:
        print(f"[Warning] Failed to parse file: {e}")
    return labels, data_path

In [41]:
get_label_in_csv("/fast/dataset/baby-motion/dataset/00_sequence/person001")

([{'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:39:37',
   'dt_end': '2022-08-28 11:39:39',
   'dur': 2.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:40:03',
   'dt_end': '2022-08-28 11:40:27',
   'dur': 24.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:41:42',
   'dt_end': '2022-08-28 11:42:00',
   'dur': 18.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:42:20',
   'dt_end': '2022-08-28 12:12:09',
   'dur': 1789.0,
   'action': 'face-down'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 12:12:10',
   'dt_end': '2022-08-28 12:12:22',
   'dur': 12.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 12:12:22',
   'dt_end': '20

In [48]:
get_label_in_filename("/fast/dataset/baby-motion/dataset/01_move/crawl/person001")

([{'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 13:10:18',
   'dt_end': '2022-08-28 13:10:22',
   'dur': 4.0,
   'action': 'crawl'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-09-03 19:11:56',
   'dt_end': '2022-09-03 19:12:08',
   'dur': 12.0,
   'action': 'crawl'}],
 '/fast/dataset/baby-motion/dataset/01_move/crawl/person001/001.csv')

In [60]:
input_dirs = find_all_dirs(root_dir="/fast/dataset/baby-motion/dataset")
data_save_dir = "/fast/workspace/robinson/CodeSource/babycare/data_origin/sequence"
label_save_dir = "/fast/workspace/robinson/CodeSource/babycare/data_origin/label"
os.makedirs(data_save_dir, exist_ok=True)
os.makedirs(label_save_dir, exist_ok=True)

index = 0
for person_dir in tqdm(input_dirs):
    try:
        labels, data_path = get_label_in_csv(person_dir) if "sequence" in person_dir else get_label_in_filename(person_dir)
        data_df = pd.read_csv(data_path)
        data_df['datetime'] = pd.to_datetime(data_df['datetime'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
        data_df = data_df.dropna(subset=['datetime'])
        for label in labels:
            try:
                t_start = pd.to_datetime(label['dt_start'])
                t_end = pd.to_datetime(label['dt_end'])
                segment = data_df[(data_df['datetime'] >= t_start) & (data_df['datetime'] <= t_end)]
                if segment.empty:
                    continue
                sequence_filename = f"{index:06d}.csv"
                segment[['accel_x', 'accel_y', 'accel_z']].to_csv(os.path.join(data_save_dir, sequence_filename), index=False)
                label_filename = f"{index:06d}_label.csv"
                pd.DataFrame([label]).to_csv(os.path.join(label_save_dir, label_filename), index=False)
                index += 1
            except Exception as e:
                print(f"[Warning] Failed to process segment: {e}")
    except Exception as e:
        print(f"[Warning] Failed to process person dir: {e}")
        continue

  6%|▌         | 5/83 [00:03<00:46,  1.68it/s]



 22%|██▏       | 18/83 [00:04<00:05, 11.04it/s]



 45%|████▍     | 37/83 [00:05<00:01, 29.48it/s]



 59%|█████▉    | 49/83 [00:05<00:01, 27.16it/s]



 71%|███████   | 59/83 [00:05<00:00, 25.93it/s]



100%|██████████| 83/83 [00:09<00:00,  8.37it/s]


# Check dataset format

In [12]:
import os
import pandas as pd

origin_sequence_dir = './data_origin/sequence'
origin_label_dir = './data_origin/label'
aug_sequence_dir = './data_aug/ChatGPT-o4-instructed/sequence'
aug_label_dir = './data_aug/ChatGPT-o4-instructed/label'

sampling_interval = 0.15  # s/frame
tolerance = 0.15

In [None]:
# check if the fromat correct

errors = []
sequence_files = sorted(f for f in os.listdir(aug_sequence_dir) if f.endswith('.csv'))
label_files = sorted(f for f in os.listdir(aug_label_dir) if f.endswith('.csv'))

for seq_file in sequence_files:
    base = os.path.splitext(seq_file)[0]
    label_file = f"{base}_label.csv"

    seq_path = os.path.join(aug_sequence_dir, seq_file)
    label_path = os.path.join(aug_label_dir, label_file)

    # check if label exist
    if not os.path.exists(label_path):
        errors.append(f"Label file missing for {seq_file}")
        continue

    # check sequence format
    try:
        df_seq = pd.read_csv(seq_path)
        if df_seq.shape[1] != 3:
            errors.append(f"{seq_file} should have 3 columns, found {df_seq.shape[1]}")
        if df_seq.isnull().any().any():
            errors.append(f"{seq_file} contains NaN values")
    except Exception as e:
        errors.append(f"Failed to read {seq_file}: {e}")

    # check label format
    try:
        df_label = pd.read_csv(label_path)
        expected_cols = ['gender', 'age', 'action', 'dur']
        if list(df_label.columns) != expected_cols:
            errors.append(f"{label_file} header mismatch. Expected {expected_cols}, got {list(df_label.columns)}")
        if df_label.shape[0] != 1:
            errors.append(f"{label_file} should contain exactly 1 row, found {df_label.shape[0]}")
    except Exception as e:
        errors.append(f"Failed to read {label_file}: {e}")
if errors:
    print("\n".join(errors))
else:
    print("All files passed the format check.")


All files passed.


In [None]:
# check if the duration plausible

errors = []
for filename in sorted(os.listdir(aug_sequence_dir)):
    if not filename.endswith(".csv"):
        continue

    base_id = os.path.splitext(filename)[0]  # A00001
    sequence_path = os.path.join(aug_sequence_dir, filename)
    label_path = os.path.join(aug_label_dir, f"{base_id}_label.csv")

    try:
        seq_df = pd.read_csv(sequence_path)
        label_df = pd.read_csv(label_path)

        num_frames = len(seq_df)
        duration = float(label_df.iloc[0]['dur'])

        expected_duration = num_frames * sampling_interval
        delta = abs(expected_duration - duration)

        if delta > tolerance:
            errors.append((base_id, num_frames, duration, expected_duration, delta))
    except Exception as e:
        print(f"can't load data of {base_id}: {e}")

if errors:
    print("These files' duration in label is implausible：")
    for base_id, frames, label_dur, expected_dur, delta in errors:
        print(f" - {base_id}: num_frames={frames}, duration={label_dur:.2f}s, expexted duration={expected_dur:.2f}s, delta={delta:.2f}s")
else:
    print("All duration in labels are plausible")

These files' duration in label is implausible：
 - A00001: num_frames=55, duration=8.10s, expexted duration=8.25s, delta=0.15s
 - A00002: num_frames=61, duration=9.00s, expexted duration=9.15s, delta=0.15s
 - A00004: num_frames=56, duration=8.25s, expexted duration=8.40s, delta=0.15s
 - A00015: num_frames=45, duration=6.60s, expexted duration=6.75s, delta=0.15s
 - A00019: num_frames=50, duration=7.35s, expexted duration=7.50s, delta=0.15s
 - A00022: num_frames=62, duration=9.00s, expexted duration=9.30s, delta=0.30s
 - A00023: num_frames=56, duration=8.25s, expexted duration=8.40s, delta=0.15s
 - A00026: num_frames=61, duration=9.00s, expexted duration=9.15s, delta=0.15s
 - A00029: num_frames=58, duration=9.00s, expexted duration=8.70s, delta=0.30s
 - A00030: num_frames=50, duration=7.20s, expexted duration=7.50s, delta=0.30s
 - A00034: num_frames=51, duration=7.80s, expexted duration=7.65s, delta=0.15s
 - A00036: num_frames=48, duration=7.50s, expexted duration=7.20s, delta=0.30s
 - A0

In [None]:
# detect action classes in labels

def collect_action_classes(label_root):
    all_actions = set()
    for filename in sorted(os.listdir(label_root)):
        if filename.endswith('.csv'):
            filepath = os.path.join(label_root, filename)
            try:
                df = pd.read_csv(filepath)
                if 'action' not in df.columns:
                    print(f"no action feature for {filename}")
                    continue
                actions = df['action'].dropna().unique()
                all_actions.update(actions)
            except Exception as e:
                print(f"fail to load {filename}: {e}")
    return sorted(list(all_actions))

original_actions = collect_action_classes(origin_label_dir)
augmented_actions = collect_action_classes(aug_label_dir)

print("\naction classes in origin dataset：", original_actions)
print("action classes in augmented dataset：", augmented_actions)


action classes in origin dataset： ['baby-food', 'bottle', 'breast', 'crawl', 'face-down', 'face-side', 'face-up', 'hold-horizontal', 'hold-vertical', 'piggyback', 'roll-over', 'sit-floor', 'sit-high-chair', 'sit-low-chair', 'stand', 'walk']
action classes in augmented dataset： ['baby-food', 'bottle', 'crawl', 'face-down', 'face-side', 'face-up', 'hold-horizontal', 'hold-vertical', 'piggyback', 'roll-over', 'sit-floor', 'sit-high-chair', 'sit-low-chair', 'stand', 'walk']


In [14]:
# detect num of data sample for specific action class

target_action = 'crawl' 

matched_filenames = []
for filename in sorted(os.listdir(origin_label_dir)):
    if filename.endswith('_label.csv'):
        filepath = os.path.join(origin_label_dir, filename)
        df = pd.read_csv(filepath)

        if target_action in df['action'].values:
            file_id = filename.replace('_label.csv', '')
            matched_filenames.append(file_id)

print(f"finded out {len(matched_filenames)} samples for action class '{target_action}':")
print(matched_filenames)

finded out 28 samples for action class 'crawl':
['000008', '000031', '000042', '000044', '000050', '000052', '000054', '000056', '000075', '000078', '000088', '000090', '000093', '000182', '000183', '000370', '000371', '000372', '000373', '000374', '000375', '000376', '000377', '000378', '000379', '000380', '000381', '000382']


# Split train-set and validation-set 

In [None]:
import os
import random

train_ratio = 0.8

data_root = os.path.dirname(origin_sequence_dir)
train_txt = os.path.join(data_root, 'train.txt')
val_txt = os.path.join(data_root, 'val.txt')

all_ids = sorted([
    f[:-4] for f in os.listdir(origin_sequence_dir)
    if f.endswith('.csv') and f.startswith('000')
])

random.seed(42)
random.shuffle(all_ids)
split_idx = int(len(all_ids) * train_ratio)
train_ids = sorted(all_ids[:split_idx])
val_ids = sorted(all_ids[split_idx:])

with open(train_txt, 'w') as f:
    f.writelines([id_ + '\n' for id_ in train_ids])
with open(val_txt, 'w') as f:
    f.writelines([id_ + '\n' for id_ in val_ids])

print(f"division completed：\ntrain-set（{len(train_ids)}）→ {train_txt}\nval-set（{len(val_ids)}）→ {val_txt}")

division completed：
train-set（440）→ ./data_origin/train.txt
val-set（110）→ ./data_origin/val.txt


In [19]:
import os
import random
import pandas as pd
from collections import defaultdict

data_root = os.path.dirname(origin_sequence_dir)
train_txt = os.path.join(data_root, 'train.txt')
val_txt = os.path.join(data_root, 'val.txt')
train_ratio = 0.8

# action -> [sample_id, ...]
action2ids = defaultdict(list)

for label_file in os.listdir(origin_label_dir):
    if not label_file.endswith('_label.csv'):
        continue

    sample_id = label_file.replace('_label.csv', '')
    label_path = os.path.join(origin_label_dir, label_file)

    try:
        df = pd.read_csv(label_path)
        if 'action' not in df.columns:
            continue
        action = df.loc[0, 'action']
        action2ids[action].append(sample_id)
    except Exception as e:
        print(f"[Warning] 跳过出错文件: {label_file} ({e})")


train_ids, val_ids = [], []

random.seed(42)
for action, ids in action2ids.items():
    random.shuffle(ids)
    split_idx = int(len(ids) * train_ratio)
    train_ids.extend(ids[:split_idx])
    val_ids.extend(ids[split_idx:])

train_ids.sort()
val_ids.sort()

with open(train_txt, 'w') as f:
    f.writelines([id_ + '\n' for id_ in sorted(train_ids)])
with open(val_txt, 'w') as f:
    f.writelines([id_ + '\n' for id_ in sorted(val_ids)])


print(f"division completed：")
for action in sorted(action2ids.keys()):
    total = len(action2ids[action])
    train_count = len([i for i in train_ids if i in action2ids[action]])
    val_count = len([i for i in val_ids if i in action2ids[action]])
    print(f"  - {action:<15} : train={train_count:<3}, val={val_count:<3}, total={total}")

division completed：
  - baby-food       : train=84 , val=21 , total=105
  - bottle          : train=25 , val=7  , total=32
  - breast          : train=0  , val=1  , total=1
  - crawl           : train=22 , val=6  , total=28
  - face-down       : train=11 , val=3  , total=14
  - face-side       : train=14 , val=4  , total=18
  - face-up         : train=9  , val=3  , total=12
  - hold-horizontal : train=50 , val=13 , total=63
  - hold-vertical   : train=56 , val=15 , total=71
  - piggyback       : train=35 , val=9  , total=44
  - roll-over       : train=33 , val=9  , total=42
  - sit-floor       : train=41 , val=11 , total=52
  - sit-high-chair  : train=8  , val=3  , total=11
  - sit-low-chair   : train=4  , val=1  , total=5
  - stand           : train=11 , val=3  , total=14
  - walk            : train=30 , val=8  , total=38


# Generate the user prompt

In [21]:
import os
import json
import pandas as pd


target_action = 'crawl'
explanation = '婴儿在地板上爬行'

train_txt_path = './data_origin/train.txt'
with open(train_txt_path, 'r') as f:
    train_ids = [line.strip() for line in f if line.strip()]

result_list = []
for seq_id in train_ids:
    label_path = os.path.join(origin_label_dir, f'{seq_id}_label.csv')
    seq_path = os.path.join(origin_sequence_dir, f'{seq_id}.csv')

    if not os.path.exists(label_path) or not os.path.exists(seq_path):
        continue

    label_df = pd.read_csv(label_path)
    if 'action' not in label_df.columns or label_df.iloc[0]['action'] != target_action:
        continue

    data_df = pd.read_csv(seq_path)
    if not all(col in data_df.columns for col in ['accel_x', 'accel_y', 'accel_z']):
        continue

    # calculate mean and std
    mean_series = data_df.mean()
    std_series = data_df.std()

    sample_stat = {
        "id": seq_id,
        "accel_x": f"{mean_series['accel_x']:.9f}±{std_series['accel_x']:.9f}",
        "accel_y": f"{mean_series['accel_y']:.9f}±{std_series['accel_y']:.9f}",
        "accel_z": f"{mean_series['accel_z']:.9f}±{std_series['accel_z']:.9f}"
    }

    result_list.append(sample_stat)

# 打印
print(f"(1)模拟动作：{explanation}(action=\"{target_action}\")")
print(f"(2)统计数据：\n{json.dumps(result_list[:], indent=2, ensure_ascii=False)}")
print("请参考以上信息生成符合要求的数据。")

(1)模拟动作：婴儿在地板上爬行(action="crawl")
(2)统计数据：
[
  {
    "id": "000031",
    "accel_x": "0.728625343±0.072465999",
    "accel_y": "0.111381894±0.190818416",
    "accel_z": "0.679442633±0.103526492"
  },
  {
    "id": "000044",
    "accel_x": "0.386851241±0.168611129",
    "accel_y": "-0.390523771±0.145563532",
    "accel_z": "0.747573480±0.161554191"
  },
  {
    "id": "000050",
    "accel_x": "0.581115723±0.101970513",
    "accel_y": "-0.092643738±0.278959798",
    "accel_z": "0.658294678±0.084085999"
  },
  {
    "id": "000052",
    "accel_x": "0.417088100±0.194192399",
    "accel_y": "-0.506596157±0.128982291",
    "accel_z": "0.653477260±0.251737379"
  },
  {
    "id": "000054",
    "accel_x": "0.484242031±0.173512301",
    "accel_y": "-0.465523856±0.123724930",
    "accel_z": "0.586447579±0.161683279"
  },
  {
    "id": "000075",
    "accel_x": "0.843816848±0.119261591",
    "accel_y": "-0.410522461±0.106638037",
    "accel_z": "0.347943987±0.143504045"
  },
  {
    "id": "000078",
   

In [None]:
# Generate blank csv files

num_files = 150

# ensure dir exist
os.makedirs(aug_sequence_dir, exist_ok=True)
os.makedirs(aug_label_dir, exist_ok=True)

for i in range(num_files):
    # with the name of Axxxxx.csv
    file_id = f"A{i:05d}"
    sequence_path = os.path.join(aug_sequence_dir, f"{file_id}.csv")
    label_path = os.path.join(aug_label_dir, f"{file_id}_label.csv")
    # blank sequence file
    if not os.path.exists(sequence_path):
        with open(sequence_path, 'w') as f:
            pass
    # blank label file
    if not os.path.exists(label_path):
        with open(label_path, 'w') as f:
            pass