# Process original dataset

In [45]:
import os
import re
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta

In [None]:
def find_all_dirs(root_dir):
    """Collect every directory under ``root_dir`` whose name starts with "person".

    The whole tree is walked with ``os.walk``; matches at any depth are
    returned as joined paths, sorted lexicographically.
    """
    matches = [
        os.path.join(parent, child)
        for parent, children, _files in os.walk(root_dir)
        for child in children
        if child.startswith("person")
    ]
    matches.sort()
    return matches

def get_meta(
    person_dir: str,
    meta_root: str = "/fast/dataset/baby-motion/dataset/00_sequence",
):
    """Look up a person's gender and age from their metadata CSV.

    The person id is the last three characters of the directory name. The
    metadata file is always read from
    ``<meta_root>/person<id>/<id>_meta.csv``, regardless of where
    ``person_dir`` itself lives.

    Args:
        person_dir: Path to a ``person<id>`` directory; a trailing path
            separator is tolerated.
        meta_root: Directory that holds the ``person<id>`` metadata folders.
            Defaults to the original dataset location.

    Returns:
        Tuple ``(person_id, gender, age)``. ``gender`` is a lower-cased
        string ("" when unavailable) and ``age`` is the first integer found
        in the "age" value (``None`` when unavailable).
    """
    # Strip a trailing separator first; otherwise basename() would be "" and
    # the person id would come out empty.
    person_id = os.path.basename(person_dir.rstrip(os.sep))[-3:]
    meta_path = os.path.join(meta_root, f"person{person_id}", f"{person_id}_meta.csv")
    gender = ""
    age = None
    try:
        # The file is a two-column key/value table without a header row.
        meta_df = pd.read_csv(meta_path, header=None, names=["key", "value"], encoding='utf-8-sig')
        meta_df["key"] = meta_df["key"].str.strip().str.lower()
        meta_df["value"] = meta_df["value"].astype(str).str.strip()
        data = dict(zip(meta_df["key"], meta_df["value"]))
        gender = data.get("gender", "").lower()
        # The age value may carry extra text; keep only the first run of digits.
        age_match = re.search(r'\d+', data.get("age", ""))
        if age_match:
            age = int(age_match.group())
    except Exception as e:
        # Best effort: metadata is optional, so report and fall back to defaults.
        print(f"[Warning] Failed to read meta file: {e}")
    return person_id, gender, age

def get_label_in_csv(person_dir: str):
    """Read ``<id>_label.csv`` in ``person_dir`` and build one record per row.

    Two row layouts are accepted, told apart by whether the first cell
    contains a space:

    * ``(datetime_start, datetime_end, action)``
    * ``(date, time_start, time_end, action)``

    Timestamps are parsed as ``%Y/%m/%d %H:%M:%S``. Rows that do not parse,
    and rows whose action is missing or blank, are skipped.

    Args:
        person_dir: Path to a ``person<id>`` directory.

    Returns:
        Tuple ``(labels, data_path)``. ``labels`` is a list of dicts with
        keys ``person_id``, ``gender``, ``age``, ``dt_start``, ``dt_end``,
        ``dur`` (seconds) and ``action``; it is empty when the label file
        cannot be read. ``data_path`` is ``<person_dir>/<id>.csv``.
    """
    person_id, gender, age = get_meta(person_dir)
    label_path = os.path.join(person_dir, f"{person_id}_label.csv")
    data_path = os.path.join(person_dir, f"{person_id}.csv")
    try:
        df = pd.read_csv(label_path, header=None)
    except Exception:
        # Best effort: a missing or unreadable label file means "no labels".
        return [], data_path

    ts_format = "%Y/%m/%d %H:%M:%S"
    labels = []
    for _, row in df.iterrows():
        try:
            if ' ' in row[0]:  # (datetime_start, datetime_end, action)
                t0 = datetime.strptime(row[0], ts_format)
                t1 = datetime.strptime(row[1], ts_format)
                action = row[2]
            else:  # (date, t_start, t_end, action)
                t0 = datetime.strptime(row[0] + " " + row[1], ts_format)
                t1 = datetime.strptime(row[0] + " " + row[2], ts_format)
                action = row[3]
        except (ValueError, TypeError, KeyError):
            # ValueError: text does not match the timestamp format.
            # TypeError: a cell is NaN/non-string. KeyError: column absent.
            continue
        # Empty cells are read as NaN, which is truthy, so test for it
        # explicitly instead of relying on `not action`.
        if pd.isna(action) or not str(action).strip():
            continue
        labels.append({
            "person_id": person_id,
            "gender": gender,
            "age": age,
            "dt_start": t0.strftime("%Y-%m-%d %H:%M:%S"),
            "dt_end": t1.strftime("%Y-%m-%d %H:%M:%S"),
            "dur": (t1 - t0).total_seconds(),
            "action": action,
        })
    return labels, data_path

def _make_segment_label(person_id, gender, age, action, first_ts, last_ts):
    """Build one label record for samples spanning ``first_ts``..``last_ts``.

    The start is truncated to the whole second and the end is rounded up to
    the next whole second when it has a fractional part.
    """
    t_start = first_ts.replace(microsecond=0)
    t_end = last_ts.replace(microsecond=0)
    if last_ts.microsecond > 0:
        t_end += timedelta(seconds=1)
    return {
        "person_id": person_id,
        "gender": gender,
        "age": age,
        "dt_start": t_start.strftime("%Y-%m-%d %H:%M:%S"),
        "dt_end": t_end.strftime("%Y-%m-%d %H:%M:%S"),
        "dur": (t_end - t_start).total_seconds(),
        "action": action,
    }


def get_label_in_filename(person_dir: str, max_gap_seconds: int = 60):
    """Derive labels for a recording whose action is given by its folder name.

    The action is the name of the directory that contains ``person_dir``.
    The first column of ``<person_dir>/<id>.csv`` is parsed as timestamps in
    ``%Y-%m-%d %H:%M:%S.%f`` format, and the recording is cut into segments
    wherever two consecutive samples are more than ``max_gap_seconds`` apart.

    Segments closed by a gap are always kept; the trailing segment is kept
    only when it lasts longer than one second.

    Args:
        person_dir: Path to a ``person<id>`` directory.
        max_gap_seconds: Gap between consecutive samples, in seconds, above
            which a new segment starts.

    Returns:
        Tuple ``(labels, data_path)``. ``labels`` is a list of dicts with
        keys ``person_id``, ``gender``, ``age``, ``dt_start``, ``dt_end``,
        ``dur`` (seconds) and ``action``; it is empty when the data file
        cannot be read or parsed.
    """
    person_id, gender, age = get_meta(person_dir)
    data_path = os.path.join(person_dir, f"{person_id}.csv")
    action = os.path.basename(os.path.dirname(person_dir))
    labels = []
    try:
        df = pd.read_csv(data_path, encoding="utf-8-sig")
        time_series = df.iloc[:, 0].dropna().astype(str)
        timestamps = [datetime.strptime(ts, "%Y-%m-%d %H:%M:%S.%f") for ts in time_series]
    except Exception as e:
        print(f"[Warning] Failed to parse file: {e}")
        return labels, data_path

    if not timestamps:
        # No samples, hence no segments.
        return labels, data_path

    start_idx = 0
    for i in range(1, len(timestamps)):
        gap = (timestamps[i] - timestamps[i - 1]).total_seconds()
        if gap > max_gap_seconds:
            labels.append(_make_segment_label(
                person_id, gender, age, action,
                timestamps[start_idx], timestamps[i - 1],
            ))
            start_idx = i

    # Trailing segment: carries the same keys as every other record, so
    # consumers can read `dt_start` / `dt_end` on all of them.
    last_label = _make_segment_label(
        person_id, gender, age, action,
        timestamps[start_idx], timestamps[-1],
    )
    if last_label["dur"] > 1:
        labels.append(last_label)
    return labels, data_path

In [41]:
get_label_in_csv("/fast/dataset/baby-motion/dataset/00_sequence/person001")

([{'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:39:37',
   'dt_end': '2022-08-28 11:39:39',
   'dur': 2.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:40:03',
   'dt_end': '2022-08-28 11:40:27',
   'dur': 24.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:41:42',
   'dt_end': '2022-08-28 11:42:00',
   'dur': 18.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 11:42:20',
   'dt_end': '2022-08-28 12:12:09',
   'dur': 1789.0,
   'action': 'face-down'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 12:12:10',
   'dt_end': '2022-08-28 12:12:22',
   'dur': 12.0,
   'action': 'roll-over'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 12:12:22',
   'dt_end': '20

In [48]:
get_label_in_filename("/fast/dataset/baby-motion/dataset/01_move/crawl/person001")

([{'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-08-28 13:10:18',
   'dt_end': '2022-08-28 13:10:22',
   'dur': 4.0,
   'action': 'crawl'},
  {'person_id': '001',
   'gender': 'female',
   'age': 16,
   'dt_start': '2022-09-03 19:11:56',
   'dt_end': '2022-09-03 19:12:08',
   'dur': 12.0,
   'action': 'crawl'}],
 '/fast/dataset/baby-motion/dataset/01_move/crawl/person001/001.csv')

In [60]:
# Cut every labelled interval out of each person's recording and store it as
# a numbered pair of files: <index>.csv (accelerometer columns) in
# data_save_dir and <index>_label.csv (the label record) in label_save_dir.
input_dirs = find_all_dirs(root_dir="/fast/dataset/baby-motion/dataset")
data_save_dir = "/fast/workspace/robinson/CodeSource/babycare/data_origin/sequence"
label_save_dir = "/fast/workspace/robinson/CodeSource/babycare/data_origin/label"
for out_dir in (data_save_dir, label_save_dir):
    os.makedirs(out_dir, exist_ok=True)

index = 0  # running sample number shared by all persons
for person_dir in tqdm(input_dirs):
    try:
        # Directories with "sequence" in their path carry a label CSV; for
        # the others the labels are derived from the folder name.
        if "sequence" in person_dir:
            labels, data_path = get_label_in_csv(person_dir)
        else:
            labels, data_path = get_label_in_filename(person_dir)

        data_df = pd.read_csv(data_path)
        data_df['datetime'] = pd.to_datetime(data_df['datetime'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
        # Rows whose timestamp failed to parse were coerced to NaT; drop them.
        data_df = data_df.dropna(subset=['datetime'])

        for label in labels:
            try:
                t_start = pd.to_datetime(label['dt_start'])
                t_end = pd.to_datetime(label['dt_end'])
                # Both interval ends are inclusive.
                in_window = (data_df['datetime'] >= t_start) & (data_df['datetime'] <= t_end)
                segment = data_df[in_window]
                if segment.empty:
                    continue

                sequence_filename = f"{index:06d}.csv"
                sequence_path = os.path.join(data_save_dir, sequence_filename)
                segment[['accel_x', 'accel_y', 'accel_z']].to_csv(sequence_path, index=False)

                label_filename = f"{index:06d}_label.csv"
                label_path = os.path.join(label_save_dir, label_filename)
                pd.DataFrame([label]).to_csv(label_path, index=False)

                # Advance only after both files were written successfully.
                index += 1
            except Exception as e:
                print(f"[Warning] Failed to process segment: {e}")
    except Exception as e:
        print(f"[Warning] Failed to process person dir: {e}")

  6%|▌         | 5/83 [00:03<00:46,  1.68it/s]



 22%|██▏       | 18/83 [00:04<00:05, 11.04it/s]



 45%|████▍     | 37/83 [00:05<00:01, 29.48it/s]



 59%|█████▉    | 49/83 [00:05<00:01, 27.16it/s]



 71%|███████   | 59/83 [00:05<00:00, 25.93it/s]



100%|██████████| 83/83 [00:09<00:00,  8.37it/s]


# Split the dataset into training and validation sets

In [None]:
import os
import random
import pandas as pd
from collections import defaultdict

# Root of the extracted dataset; train.txt / val.txt are written here.
origin_dir = './data_origin/'
# Sub-folders holding the numbered sequence files and their label files.
# The split cells below read these names, so they must be defined here.
origin_sequence_dir = os.path.join(origin_dir, 'sequence')
origin_label_dir = os.path.join(origin_dir, 'label')
# Fraction of samples assigned to the training set.
train_ratio = 0.8

In [None]:
# Plain random split: shuffle all sample ids with a fixed seed and write the
# first `train_ratio` share to train.txt and the remainder to val.txt.
train_txt = os.path.join(origin_dir, 'train.txt')
val_txt = os.path.join(origin_dir, 'val.txt')

# A sample id is the sequence file name without its ".csv" extension; only
# names beginning with "000" are considered.
all_ids = []
for fname in os.listdir(origin_sequence_dir):
    if fname.startswith('000') and fname.endswith('.csv'):
        all_ids.append(fname[:-4])
all_ids.sort()

random.seed(42)
random.shuffle(all_ids)
split_idx = int(len(all_ids) * train_ratio)
train_ids = all_ids[:split_idx]
train_ids.sort()
val_ids = all_ids[split_idx:]
val_ids.sort()

# One id per line.
with open(train_txt, 'w') as f:
    f.writelines(id_ + '\n' for id_ in train_ids)
with open(val_txt, 'w') as f:
    f.writelines(id_ + '\n' for id_ in val_ids)

print(f"division completed：\ntrain-set（{len(train_ids)}）→ {train_txt}\nval-set（{len(val_ids)}）→ {val_txt}")

division completed：
train-set（440）→ ./data_origin/train.txt
val-set（110）→ ./data_origin/val.txt


In [None]:
# Stratified split: samples are grouped by action and each group is split
# with the same train ratio, then train.txt / val.txt are written next to the
# sequence folder.
data_root = os.path.dirname(origin_sequence_dir)
train_txt = os.path.join(data_root, 'train.txt')
val_txt = os.path.join(data_root, 'val.txt')
train_ratio = 0.8

# action -> [sample_id, ...]
action2ids = defaultdict(list)

label_suffix = '_label.csv'
# os.listdir returns entries in arbitrary order; sort them so the seeded
# shuffle below produces the same split on every machine.
for label_file in sorted(os.listdir(origin_label_dir)):
    if not label_file.endswith(label_suffix):
        continue

    sample_id = label_file[:-len(label_suffix)]
    label_path = os.path.join(origin_label_dir, label_file)

    try:
        df = pd.read_csv(label_path)
        if 'action' not in df.columns:
            continue
        action = df.loc[0, 'action']
    except Exception as e:
        print(f"[Warning] 跳过出错文件: {label_file} ({e})")
        continue

    # A NaN action cannot be sorted alongside string actions in the summary
    # below, so such samples are left out of the split.
    if pd.isna(action):
        print(f"[Warning] Missing action, skipped: {label_file}")
        continue
    action2ids[action].append(sample_id)


train_ids, val_ids = [], []
# action -> (train_count, val_count); sample ids are unique per file, so the
# slice sizes are exactly the per-action counts.
split_counts = {}

random.seed(42)
# Visit actions in sorted order so the shuffles consume the random stream in
# a fixed sequence.
for action in sorted(action2ids):
    ids = action2ids[action]
    random.shuffle(ids)
    split_idx = int(len(ids) * train_ratio)
    train_ids.extend(ids[:split_idx])
    val_ids.extend(ids[split_idx:])
    split_counts[action] = (split_idx, len(ids) - split_idx)

train_ids.sort()
val_ids.sort()

with open(train_txt, 'w') as f:
    f.writelines(id_ + '\n' for id_ in train_ids)
with open(val_txt, 'w') as f:
    f.writelines(id_ + '\n' for id_ in val_ids)


print("division completed：")
for action in sorted(action2ids):
    total = len(action2ids[action])
    train_count, val_count = split_counts[action]
    print(f"  - {action:<15} : train={train_count:<3}, val={val_count:<3}, total={total}")

division completed：
  - baby-food       : train=84 , val=21 , total=105
  - bottle          : train=25 , val=7  , total=32
  - breast          : train=0  , val=1  , total=1
  - crawl           : train=22 , val=6  , total=28
  - face-down       : train=11 , val=3  , total=14
  - face-side       : train=14 , val=4  , total=18
  - face-up         : train=9  , val=3  , total=12
  - hold-horizontal : train=50 , val=13 , total=63
  - hold-vertical   : train=56 , val=15 , total=71
  - piggyback       : train=35 , val=9  , total=44
  - roll-over       : train=33 , val=9  , total=42
  - sit-floor       : train=41 , val=11 , total=52
  - sit-high-chair  : train=8  , val=3  , total=11
  - sit-low-chair   : train=4  , val=1  , total=5
  - stand           : train=11 , val=3  , total=14
  - walk            : train=30 , val=8  , total=38
