# Dataset Ordering Notebook
The goal of this notebook is to reorganize the EgoExo4D dataset from its downloaded folder structure into one that is more suitable for training.

## Parse each subfolder's name

In [9]:
import os
from pathlib import Path

def get_subfolders(root_dir):
    """Return the names of the directories located directly inside ``root_dir``.

    Args:
        root_dir: Path (``str`` or ``os.PathLike``) of the directory to scan.

    Returns:
        Alphabetically sorted list of sub-directory names (names only, not
        full paths). Regular files found in ``root_dir`` are skipped.

    Raises:
        FileNotFoundError: If ``root_dir`` does not exist.
        NotADirectoryError: If ``root_dir`` is not a directory.
    """
    # A plain os.listdir() would also return regular files (e.g. a stray
    # metadata.csv). Sorting makes the result independent of the order in
    # which the filesystem happens to list its entries.
    with os.scandir(root_dir) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())

# Run the listing on the training assets directory.
TRAIN_ASSETS_DIR = "/home/thibault/Documents/Code/Models/SkateFormer/assets/train/"
subfolders = get_subfolders(TRAIN_ASSETS_DIR)

Match each subfolder's name with its corresponding parent task label.

In [27]:
import json

def read_json(file_path):
    """Load and return the content of a JSON file.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the JSON file to read.

    Returns:
        The decoded JSON document (``dict``, ``list``, ... depending on the
        file's top-level value).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Be explicit about UTF-8: without it open() falls back to the locale's
    # preferred encoding, which can mis-decode non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the take metadata file (takes.json) of the EgoExo4D annotations.
TAKES_JSON_PATH = "/media/thibault/DATA/these_thibault/Dataset/data/EgoExo4D/annotations/metadata/takes.json"
task_data = read_json(TAKES_JSON_PATH)

def match_name_label(data):
    """Build a lookup table from take name to parent task name.

    Args:
        data: Iterable of mappings; each one is read through ``.get`` for the
            keys ``"take_name"`` and ``"parent_task_name"``.

    Returns:
        ``dict`` mapping every ``take_name`` to its ``parent_task_name``.
        A key that is absent from an entry yields ``None``; when the same
        take name occurs several times the last occurrence wins.
    """
    return {
        entry.get("take_name"): entry.get("parent_task_name")
        for entry in data
    }

# Lookup table take_name -> parent_task_name, passed to the copy step below.
label_data = match_name_label(task_data)


Copy each egocentric video into the folder of its corresponding task label.


In [28]:
import shutil
import csv

def move_and_rename_videos(source_dir,dest_dir, pattern, label_data, output_csv):
    """Copy matching videos into per-label folders and log the renames to a CSV.

    ``source_dir`` is walked recursively. Every file whose name contains
    ``pattern`` is copied to ``<dest_dir>/<label>/<take>.mp4``, where
    ``<take>`` is the name of the parent of the directory that holds the file
    and ``<label>`` is looked up in ``label_data``. Despite the function's
    name the source files are copied, not moved: they are left in place.

    Because the new file name only depends on the take, several matching
    files belonging to the same take overwrite each other at the destination.

    Args:
        source_dir: Root directory scanned for videos.
        dest_dir: Root directory receiving one sub-folder per label. It is
            created if it does not exist yet.
        pattern: Substring a file name must contain to be processed.
        label_data: Mapping take name -> label. Takes that are missing from
            the mapping, or mapped to ``None``/an empty label, are stored
            under ``"Unknown"``.
        output_csv: Path of the CSV log to write (overwritten if present).

    Returns:
        None. The results are the copied files and the CSV log, whose columns
        are ``Unique ID``, ``Original Name``, ``New Name`` and ``Label``.

    Raises:
        IndexError: If a matching file lives in a directory whose path has
            fewer than two components.
        OSError: If a file cannot be copied or the CSV cannot be written.
    """
    # One (unique_id, original_name, new_name, label) tuple per copied file.
    renamed_files = []

    for root, dirnames, filenames in os.walk(source_dir):
        # Sorting in place fixes the traversal order of os.walk, so the
        # unique IDs are the same from one run to the next.
        dirnames.sort()
        for filename in sorted(filenames):
            if pattern not in filename:
                continue

            src_path = os.path.join(root, filename)

            # The take name is the parent of the directory holding the file.
            class_name = root.split(os.sep)[-2]

            # `or` also covers takes that are present but carry a null/empty
            # label, which would otherwise break the path join below.
            label = label_data.get(class_name) or "Unknown"

            # The copy is named after its take.
            new_filename = f"{class_name}.mp4"

            # exist_ok avoids re-listing dest_dir for every file and also
            # creates dest_dir itself on the first run.
            label_dir = os.path.join(dest_dir, label)
            os.makedirs(label_dir, exist_ok=True)
            dest_path = os.path.join(label_dir, new_filename)

            # Copy (non-destructive): the source tree is left untouched.
            shutil.copy(src_path, dest_path)

            # The number of files logged so far doubles as the next unique ID.
            renamed_files.append((len(renamed_files), filename, new_filename, label))

    # Write the log; newline='' is what the csv module expects for its files.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Unique ID", "Original Name", "New Name", "Label"])
        writer.writerows(renamed_files)
        
# Input: the directory containing the videos.
source_directory = Path('/home/thibault/Documents/Code/Models/SkateFormer/assets/test/')
# Output: root directory handed to the function as `dest_dir`.
dest_path = "/media/thibault/DATA/these_thibault/Dataset/data/EgoExo4D/dataset/test/"
# Only files whose name contains this substring are processed.
pattern = '_214'
# Path to the JSON file containing class labels (not passed to the call below).
labels_json_path = '/media/thibault/DATA/these_thibault/Dataset/data/EgoExo4D/annotations/labels.json'
# The CSV log is written inside the source directory.
output_csv_file = source_directory /'metadata.csv'

move_and_rename_videos(
    source_dir=source_directory,
    dest_dir=dest_path,
    pattern=pattern,
    label_data=label_data,
    output_csv=output_csv_file,
)
