In [None]:
import os
import pandas as pd

In [None]:
# Parameters
# Root folder of the original dataset (passed to find_pair_dirs below).
origin_dir = './data_origin/'
# Root folder that holds the augmented datasets (passed to find_pair_dirs below).
aug_dir = './data_aug/'
# Name of the augmentation run; find_pair_dirs uses it as a sub-folder name.
aug_method = 'ChatGPT-o4-instructed_v2'

# Time covered by one sequence row; the duration check multiplies it by the row count.
sampling_interval = 0.15  # s/frame
# Largest accepted absolute gap between the computed duration and the label's 'dur' value.
tolerance = 0.15

def find_pair_dirs(base_dir: str, aug_method: str = None):
    """Return (and create if needed) the sequence/label folders of a dataset.

    Args:
        base_dir: Root folder of the dataset.
        aug_method: Optional sub-folder name under ``base_dir``. When given,
            the pair lives in ``base_dir/aug_method``; otherwise directly in
            ``base_dir``.

    Returns:
        Tuple ``(sequence_dir, label_dir)``.

    Side effects:
        Both folders are created when they do not exist yet.
    """
    # Both folders hang off the same root. The label path used to be built
    # from the module-level aug_dir/origin_dir and ignored base_dir.
    dataset_root = os.path.join(base_dir, aug_method) if aug_method else base_dir
    sequence_dir = os.path.join(dataset_root, "sequence")
    label_dir = os.path.join(dataset_root, "label")
    os.makedirs(sequence_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)
    return sequence_dir, label_dir

# Resolve the sequence/label folders of both datasets.
# Note: find_pair_dirs also creates the folders when they are missing.
origin_sequence_dir, origin_label_dir = find_pair_dirs(origin_dir)
aug_sequence_dir, aug_label_dir = find_pair_dirs(aug_dir, aug_method)

# Check the augmented dataset format

In [None]:
# Check that every augmented sequence/label pair has the expected format:
#   * sequence CSV: exactly 3 columns, no NaN values
#   * label CSV: header gender,age,action,dur and exactly 1 data row
# Files without a partner are reported in both directions.

errors = []
sequence_files = sorted(f for f in os.listdir(aug_sequence_dir) if f.endswith('.csv'))
label_files = sorted(f for f in os.listdir(aug_label_dir) if f.endswith('.csv'))

expected_cols = ['gender', 'age', 'action', 'dur']
# Label names implied by the sequence files; used afterwards to spot orphan labels.
expected_label_files = set()

for seq_file in sequence_files:
    base = os.path.splitext(seq_file)[0]
    label_file = f"{base}_label.csv"
    expected_label_files.add(label_file)

    seq_path = os.path.join(aug_sequence_dir, seq_file)
    label_path = os.path.join(aug_label_dir, label_file)

    # check if label exist
    if not os.path.exists(label_path):
        errors.append(f"Label file missing for {seq_file}")
        continue

    # check sequence format
    try:
        df_seq = pd.read_csv(seq_path)
        if df_seq.shape[1] != 3:
            errors.append(f"{seq_file} should have 3 columns, found {df_seq.shape[1]}")
        if df_seq.isnull().any().any():
            errors.append(f"{seq_file} contains NaN values")
    except Exception as e:
        # Unreadable files (e.g. empty CSVs) are reported instead of aborting the scan.
        errors.append(f"Failed to read {seq_file}: {e}")

    # check label format
    try:
        df_label = pd.read_csv(label_path)
        if list(df_label.columns) != expected_cols:
            errors.append(f"{label_file} header mismatch. Expected {expected_cols}, got {list(df_label.columns)}")
        if df_label.shape[0] != 1:
            errors.append(f"{label_file} should contain exactly 1 row, found {df_label.shape[0]}")
    except Exception as e:
        errors.append(f"Failed to read {label_file}: {e}")

# Report labels that no sequence file refers to (label_files was previously unused).
for label_file in label_files:
    if label_file not in expected_label_files:
        errors.append(f"Sequence file missing for {label_file}")

if errors:
    print("\n".join(errors))
else:
    print("All files passed the format check.")


In [None]:
# Check that each label's 'dur' value is plausible for its sequence length:
# |num_frames * sampling_interval - dur| must not exceed `tolerance`.

errors = []      # (base_id, num_frames, duration, expected_duration, delta) per implausible file
unchecked = []   # ids whose sequence or label could not be loaded, hence never verified
for filename in sorted(os.listdir(aug_sequence_dir)):
    if not filename.endswith(".csv"):
        continue

    base_id = os.path.splitext(filename)[0]  # A00001
    sequence_path = os.path.join(aug_sequence_dir, filename)
    label_path = os.path.join(aug_label_dir, f"{base_id}_label.csv")

    try:
        seq_df = pd.read_csv(sequence_path)
        label_df = pd.read_csv(label_path)

        num_frames = len(seq_df)
        duration = float(label_df.iloc[0]['dur'])

        expected_duration = num_frames * sampling_interval
        delta = abs(expected_duration - duration)

        if delta > tolerance:
            errors.append((base_id, num_frames, duration, expected_duration, delta))
    except Exception as e:
        # Missing/empty/malformed files end up here; remember them so the
        # summary below does not claim they were verified.
        print(f"can't load data of {base_id}: {e}")
        unchecked.append(base_id)

if errors:
    print("These files' duration in label is implausible：")
    for base_id, frames, label_dur, expected_dur, delta in errors:
        print(f" - {base_id}: num_frames={frames}, duration={label_dur:.2f}s, expected duration={expected_dur:.2f}s, delta={delta:.2f}s")
elif unchecked:
    # Nothing implausible among the readable files, but not everything was checked.
    print(f"No implausible durations found, but {len(unchecked)} file(s) could not be checked")
else:
    print("All duration in labels are plausible")

# Check the `action` feature

In [None]:
# detect action classes in labels

def collect_action_classes(label_root):
    """Gather the distinct non-null `action` values from the CSV files in a folder.

    Every ``*.csv`` file directly inside ``label_root`` is read. Files without
    an ``action`` column and files that cannot be loaded are reported with a
    printed message and skipped.

    Returns:
        Sorted list of the unique action values.
    """
    seen = set()
    csv_names = [name for name in sorted(os.listdir(label_root)) if name.endswith('.csv')]
    for filename in csv_names:
        try:
            table = pd.read_csv(os.path.join(label_root, filename))
            if 'action' in table.columns:
                seen.update(table['action'].dropna().unique())
            else:
                print(f"no action feature for {filename}")
        except Exception as e:
            print(f"fail to load {filename}: {e}")
    return sorted(seen)

# Collect the action classes of both datasets and print them one after the
# other so they can be compared by eye.
original_actions = collect_action_classes(origin_label_dir)
augmented_actions = collect_action_classes(aug_label_dir)

print("in origin dataset：", original_actions)
print("in augmented dataset：", augmented_actions)

In [None]:
# Count the original samples whose label contains one specific action class.
target_action = 'crawl'

label_suffix = '_label.csv'
matched_filenames = []
for label_name in sorted(os.listdir(origin_label_dir)):
    # Only label files are of interest here.
    if not label_name.endswith(label_suffix):
        continue

    label_table = pd.read_csv(os.path.join(origin_label_dir, label_name))
    if target_action not in label_table['action'].values:
        continue

    # Keep the bare sample id (file name without the label suffix).
    matched_filenames.append(label_name.replace(label_suffix, ''))

print(f"finded out {len(matched_filenames)} samples for action class '{target_action}':")
print(matched_filenames)

# Generate the user prompt

In [None]:
import os
import json
import pandas as pd


# Build the user prompt: mean±std of the accelerometer axes for every training
# sample of one action class, printed as JSON between fixed prompt lines.
target_action = 'crawl'
explanation = '婴儿在地板上爬行'
accel_cols = ['accel_x', 'accel_y', 'accel_z']

# Derived from origin_dir so the split file follows the dataset root
# (resolves to './data_origin/train.txt' with the notebook's parameters).
train_txt_path = os.path.join(origin_dir, 'train.txt')
with open(train_txt_path, 'r') as f:
    train_ids = [line.strip() for line in f if line.strip()]

result_list = []
for seq_id in train_ids:
    label_path = os.path.join(origin_label_dir, f'{seq_id}_label.csv')
    seq_path = os.path.join(origin_sequence_dir, f'{seq_id}.csv')

    # Ids without both files are silently skipped.
    if not (os.path.exists(label_path) and os.path.exists(seq_path)):
        continue

    label_df = pd.read_csv(label_path)
    # A header-only label has no first row; skip it instead of raising IndexError.
    if 'action' not in label_df.columns or label_df.empty:
        continue
    if label_df.iloc[0]['action'] != target_action:
        continue

    data_df = pd.read_csv(seq_path)
    if not all(col in data_df.columns for col in accel_cols):
        continue

    # calculate mean and std on the accelerometer columns only, so an extra
    # non-numeric column cannot make DataFrame.mean()/std() fail.
    accel_df = data_df[accel_cols]
    mean_series = accel_df.mean()
    std_series = accel_df.std()

    sample_stat = {"id": seq_id}
    for col in accel_cols:
        sample_stat[col] = f"{mean_series[col]:.9f}±{std_series[col]:.9f}"

    result_list.append(sample_stat)

# Print the prompt
print(f"(1)模拟动作：{explanation}(action=\"{target_action}\")")
print(f"(2)统计数据：\n{json.dumps(result_list, indent=2, ensure_ascii=False)}")
print("请参考以上信息生成符合要求的数据。")

In [None]:
def generate_blank_csv_files(aug_sequence_dir, aug_label_dir, num_files=150):
    """Create empty sequence/label CSV pairs numbered after the existing ones.

    Files are named ``A<index>.csv`` (sequence) and ``A<index>_label.csv``
    (label), with the index zero-padded to five digits. Numbering continues
    after the highest index found in either folder, or starts at 0 when no
    such file exists.

    Args:
        aug_sequence_dir: Existing folder that receives the sequence files.
        aug_label_dir: Existing folder that receives the label files.
        num_files: Number of pairs to create; values <= 0 create nothing.

    Returns:
        None. The created range is reported with ``print``.
    """
    import re  # local import: only this helper needs it

    if num_files <= 0:
        print("no blank file pairs generated (num_files <= 0)")
        return

    # Match only real sample files. Slicing f[1:6] crashed on names such as
    # 'Anotes.csv' and truncated indices with more than five digits.
    id_pattern = re.compile(r"A(\d+)(?:_label)?\.csv")
    used_indices = []
    # Scan both folders so an orphan label can never collide with a new id.
    for directory in (aug_sequence_dir, aug_label_dir):
        for name in os.listdir(directory):
            match = id_pattern.fullmatch(name)
            if match:
                used_indices.append(int(match.group(1)))
    max_index = max(used_indices) + 1 if used_indices else 0

    for i in range(num_files):
        file_id = f"A{max_index + i:05d}"
        sequence_path = os.path.join(aug_sequence_dir, f"{file_id}.csv")
        label_path = os.path.join(aug_label_dir, f"{file_id}_label.csv")

        # Never truncate a file that is already present.
        if not os.path.exists(sequence_path):
            with open(sequence_path, 'w'):
                pass
        if not os.path.exists(label_path):
            with open(label_path, 'w'):
                pass

    print(f"✅ have generated {num_files} blank file pairs:\nfrom A{max_index:05d} to A{max_index + num_files - 1:05d}")

In [None]:
# Add 10 blank sequence/label file pairs to the augmented dataset.
# NOTE: this writes to disk and is not idempotent; every run of the cell
# appends another 10 pairs after the current highest file index.
generate_blank_csv_files(aug_sequence_dir, aug_label_dir, num_files=10)