In [32]:
import os
import pandas
from tqdm import tqdm

def get_dir_tree(root_dir):
    """Build a nested dict that mirrors the directory layout under ``root_dir``.

    Every directory visited by ``os.walk`` is represented by a dict. Its
    ``'files'`` entry holds the list of file names reported for that
    directory; every other entry maps a sub-directory name to that
    sub-directory's own dict.

    Note: a sub-directory literally named ``files`` collides with the
    ``'files'`` entry, so such a layout cannot be represented.

    Args:
        root_dir: Path of the directory to walk.

    Returns:
        dict: The nested tree, or an empty dict if ``os.walk`` yields nothing.
    """
    root_node = {}
    for current_dir, _subdirs, file_names in os.walk(root_dir):
        relative = os.path.relpath(current_dir, root_dir)
        # "." denotes the root itself, which has no path components to descend.
        components = [] if relative == "." else relative.split(os.sep)
        node = root_node
        for component in components:
            node = node.setdefault(component, {})
        node['files'] = file_names
    return root_node

def extract_mp4_files(tree, parent_path=""):
    """Collect the paths of all ``.mp4`` files stored in a directory tree dict.

    Args:
        tree: Nested dict in which ``'files'`` holds a directory's file names
            and every other dict-valued entry is a sub-directory.
        parent_path: Path prefix joined in front of every returned entry;
            an empty string means no prefix.

    Returns:
        list: Paths of the files whose name ends with ``.mp4`` (compared
        case-insensitively). Files of the current level come first, followed
        by those of each sub-directory in the dict's iteration order.
    """
    def _prefixed(name):
        # Skip the join at the top level so results carry no leading separator.
        return os.path.join(parent_path, name) if parent_path else name

    # Files that live directly in this directory.
    found = [
        _prefixed(file_name)
        for file_name in tree.get('files', [])
        if file_name.lower().endswith('.mp4')
    ]
    # Recurse into every sub-directory.
    for entry_name, subtree in tree.items():
        if entry_name == 'files' or not isinstance(subtree, dict):
            continue
        found += extract_mp4_files(subtree, _prefixed(entry_name))
    return found

In [46]:
# Participant directories to scan.
p_list = ["P01", "P02", "P03", "P04", "P06", "P07", "P09", "P11", "P12",
          "P22", "P23", "P25", "P26", "P27", "P28", "P30", "P33", "P34", "P35", "P36", "P37"]
# Directory that contains one sub-directory per participant.
EK100_VIDEO_ROOT = "/mnt/vmlqnap02/dataset/EK100/video_ht256px"

# os.walk yields nothing for a missing directory, so without this check an
# unavailable dataset root would silently produce an empty file list.
if not os.path.isdir(EK100_VIDEO_ROOT):
    raise FileNotFoundError(f"Dataset root not found: {EK100_VIDEO_ROOT}")

all_files = []
for participant_id in tqdm(p_list):
    tree = get_dir_tree(f"{EK100_VIDEO_ROOT}/{participant_id}")
    mp4_files = extract_mp4_files(tree)
    # Rebuild each id as "<participant>_<token>", where <token> is the text
    # after the first "_" in the relative path, cut at the next "_" or ".".
    # NOTE(review): assumes every .mp4 path contains an underscore — a path
    # without one raises IndexError; confirm against the dataset layout.
    mp4_files = [
        f"{participant_id}_{path.split('_')[1].split('.')[0]}"
        for path in mp4_files
    ]
    all_files.extend(mp4_files)

print(f"Total MP4 files found: {len(all_files)}")

100%|██████████| 21/21 [00:00<00:00, 938.92it/s]

Total MP4 files found: 268





In [55]:
import pandas as pd
# Fraction of the filtered rows written to the train file; the rest go to test.
TRAIN_FRACTION = 0.8

# Load EPIC_100_retrieval_train.csv from ./csv.
df = pd.read_csv("./csv/EPIC_100_retrieval_train.csv")
# Keep only the rows whose video_id is listed in all_files.
df_filtered = df[df['video_id'].isin(all_files)]
if df_filtered.empty:
    # Stop here so the output CSVs are not overwritten with header-only files.
    raise ValueError(
        "No rows of EPIC_100_retrieval_train.csv match the video ids in all_files."
    )

# Split by row position: the first 80% of df_filtered is train, the remaining 20% is test.
# NOTE(review): rows are neither shuffled nor grouped by video_id, so the rows
# of one video can fall on both sides of the boundary — confirm this is intended.
train_size = int(len(df_filtered) * TRAIN_FRACTION)
df_train = df_filtered.iloc[:train_size]
df_test = df_filtered.iloc[train_size:]

# Save train and test to separate CSV files.
df_train.to_csv("./csv/EPIC_100_retrieval_train_filtered.csv", index=False)
df_test.to_csv("./csv/EPIC_100_retrieval_test_filtered.csv", index=False)

# Report the number of rows in each split.
print(f"Train set size: {len(df_train)}")
print(f"Test set size: {len(df_test)}")
print("Filtered CSV files have been saved.")
print("Process completed successfully.")

Train set size: 29964
Test set size: 7491
Filtered CSV files have been saved.
Process completed successfully.
