# EgoExo Manifest Generator

Generates a set of EgoExo manifest files from the given S3 directory using smart defaults.

### Pre-requisites:
- Jupyter kernel with boto3 and pandas installed
- You have run `aws configure` and set up your AWS keys
- (Highly recommended) Recorded files are already uploaded to s3
- ROOT_DIR has the structure: `<ROOT_DIR>/<take_id>/(ego|exo)/<camera_id>/<video files>`

### How To Use:
1. Set the variables in the cell below.
2. Run all. This will output manifest files to `<OUTPUT_DIRECTORY>/manifest_<MANIFEST_VERSION>/`.
3. Edit the .csv files to add additional metadata and ensure correctness.
4. Continue the instructions in the [Post-Capture Processing Doc](https://docs.google.com/document/d/1QY3GFSpia9kvFJVvzsRic6Wx0Qya-RiicIrSObi6vZY/edit?usp=sharing)

The following files are generated:
- take_metadata.csv
- video_metadata.csv
- video_component_file.csv
- validation_output.json

Note: It is **safe** to modify manifest files after generation and re-run.
This notebook will retain existing rows, and only add new ones that didn't exist.

Note: Most cells in this notebook are uneditable for safety. Set `"editable": true` in a cell's metadata if you want to customize it.

In [None]:
# User configuration: set these values before running the rest of the notebook.

# Root directory that is scanned for takes. A path that does not start with
# "s3://" is listed from the local filesystem instead.
ROOT_DIR = "<full s3 path to root data directory>" # e.g. s3://<university_bucket>/egoexo/data/
# Exo camera folder names whose rows get has_walkaround=True in video_metadata.csv.
WALKAROUND_CAM_NAMES = ["mobile"] # Exo Cameras that will always have walkarounds
# Value written to has_walkaround for every ego camera row.
EGO_HAS_WALKAROUND = True # True if Ego Cameras all have walkarounds (this should be the case)
# Manifest files are written to <OUTPUT_DIRECTORY>/manifest_<MANIFEST_VERSION>/.
OUTPUT_DIRECTORY = "<full path to output directory here>" # e.g. /Users/<username>/egoexo/
MANIFEST_VERSION = "v1"

In [None]:
# Imports
import os
import boto3
import pandas as pd

In [None]:
# Utilities
# Shared boto3 S3 resource handle; ls() uses it to enumerate the objects in a bucket.
s3 = boto3.resource("s3")

def is_s3_path(path):
    """Return True when *path* begins with the ``s3://`` URI scheme."""
    s3_scheme = 's3://'
    return path.startswith(s3_scheme)

def split_s3_path(s3_path):
    """Split an S3 URI into a ``(bucket, key)`` tuple.

    Surrounding whitespace and the ``s3://`` scheme are removed first. The
    first ``/``-separated segment is the bucket; the remaining segments,
    re-joined with ``/``, form the key (an empty string when there are none).
    """
    without_scheme = s3_path.strip().replace("s3://", "")
    bucket, *key_segments = without_scheme.split("/")
    return bucket, "/".join(key_segments)

def ls(path):
    """List the names of the immediate children of a directory.

    Args:
        path: Either an ``s3://bucket/prefix`` URI or a local filesystem path.

    Returns:
        A sorted list of child names (files and sub-folders, without any
        parent path). For an S3 path with no objects under the prefix the
        list is empty. For a local path that is a regular file, ``None``.
    """
    if is_s3_path(path):
        bucket, key = split_s3_path(path)
        # An empty key means the bucket root: keep the prefix empty, because a
        # prefix of "/" would only match keys that literally start with "/".
        if key and not key.endswith("/"):
            key = key + "/"
        names = set()
        for obj in s3.Bucket(bucket).objects.filter(Prefix=key):
            # Strip only the leading prefix; str.replace would also delete any
            # later occurrence of the same text inside the object key.
            name = obj.key[len(key):].split("/")[0]
            # An object whose key equals the prefix itself (a "folder"
            # placeholder) yields an empty name and is not a child entry.
            if name:
                names.add(name)
        # Sort so results do not depend on set iteration order.
        return sorted(names)
    if os.path.isfile(path):
        return None
    return sorted(os.listdir(path))

def is_video(path):
    """Return True when the file name of *path* ends in .mp4 or .vrs (case-insensitive)."""
    lowered_name = os.path.basename(path).lower()
    return lowered_name.endswith((".mp4", ".vrs"))

## Generate take_metadata.csv

In [None]:
# Every top-level entry under ROOT_DIR is a candidate take id.
take_ids = ls(ROOT_DIR)
take_ids

In [None]:
# Keep only the entries that contain an "ego" and/or "exo" sub-folder.
# ls() returns None for a local regular file, so `or []` makes stray files
# under ROOT_DIR get filtered out instead of raising a TypeError in set().
clean_take_ids = [
    x for x in take_ids
    if {'ego', 'exo'} & set(ls(os.path.join(ROOT_DIR, x)) or [])
]
clean_take_ids

In [None]:
# Build one take_metadata row per take. number_videos is the total count of
# video files found in all ego and exo camera folders of that take.
take_metadata_records = []
for take_id in clean_take_ids:
    root = os.path.join(ROOT_DIR, take_id)
    # ls() returns None for a local regular file; `or []` keeps stray files
    # (e.g. .DS_Store) from raising when they are iterated as if they were folders.
    root_ls = ls(root) or []
    ego_cameras = (ls(os.path.join(root, "ego")) or []) if "ego" in root_ls else []
    exo_cameras = (ls(os.path.join(root, "exo")) or []) if "exo" in root_ls else []

    video_count = 0
    for view, cameras in (("ego", ego_cameras), ("exo", exo_cameras)):
        for cam in cameras:
            cam_files = ls(os.path.join(root, view, cam)) or []
            video_count += sum(1 for x in cam_files if is_video(x))

    take_metadata_records.append(
        {
            "university_take_id": take_id,
            # Local runs have no S3 location yet, so a placeholder is written.
            "university_video_folder_path": root if is_s3_path(ROOT_DIR) else "s3://please-upload-me",
            "number_videos": video_count,
            # Left empty here; filled in by hand after generation.
            "recording_participant_id": None,
            "physical_setting_id": None,
            "video_scenario_ids": None,
        }
    )

take_metadata = pd.DataFrame.from_records(take_metadata_records)
take_metadata

## Generate video_metadata.csv

In [None]:
# Build one video_metadata row per camera folder that contains at least one
# video file. Ego rows come before exo rows within each take.
video_metadata_records = []
for take_id in clean_take_ids:
    root = os.path.join(ROOT_DIR, take_id)
    # ls() returns None for a local regular file; `or []` keeps stray files
    # (e.g. .DS_Store) from raising when they are iterated as if they were folders.
    root_ls = ls(root) or []
    ego_cameras = (ls(os.path.join(root, "ego")) or []) if "ego" in root_ls else []
    exo_cameras = (ls(os.path.join(root, "exo")) or []) if "exo" in root_ls else []

    for view, cameras in (("ego", ego_cameras), ("exo", exo_cameras)):
        is_ego = view == "ego"
        for cam in cameras:
            cam_root = os.path.join(root, view, cam)
            component_count = sum(1 for x in (ls(cam_root) or []) if is_video(x))
            # Skip any 'cameras' with 0 videos. This removes extraneous files like .DS_Store
            if component_count == 0:
                continue
            video_metadata_records.append(
                {
                    "university_take_id": take_id,
                    "university_video_id": cam,
                    "number_video_components": component_count,
                    "is_ego": is_ego,
                    # Ego cameras share one global setting; exo cameras are
                    # matched by folder name against WALKAROUND_CAM_NAMES.
                    "has_walkaround": EGO_HAS_WALKAROUND if is_ego else cam in WALKAROUND_CAM_NAMES,
                    "is_redacted": False,
                    "includes_audio": True,
                    "device_id": cam,
                    # Left empty here; filled in by hand after generation.
                    "recording_participant_id": None,
                    "video_device_settings": None,
                }
            )

video_metadata = pd.DataFrame.from_records(video_metadata_records)
video_metadata

## Generate video_component_file.csv

In [None]:
# Build one video_component_file row per video file. Files within a camera
# folder are sorted by name, and component_index is the position in that order.
video_component_file_records = []
for take_id in clean_take_ids:
    root = os.path.join(ROOT_DIR, take_id)
    # ls() returns None for a local regular file; `or []` keeps stray files
    # (e.g. .DS_Store) from raising when they are iterated as if they were folders.
    root_ls = ls(root) or []
    ego_cameras = (ls(os.path.join(root, "ego")) or []) if "ego" in root_ls else []
    exo_cameras = (ls(os.path.join(root, "exo")) or []) if "exo" in root_ls else []

    for view, cameras in (("ego", ego_cameras), ("exo", exo_cameras)):
        for cam in cameras:
            cam_root = os.path.join(root, view, cam)
            video_files = sorted(x for x in (ls(cam_root) or []) if is_video(x))
            for i, filename in enumerate(video_files):
                video_component_file_records.append(
                    {
                        "university_take_id": take_id,
                        "university_video_id": cam,
                        # Path relative to the take folder: <ego|exo>/<camera>/<file>
                        "video_component_relative_path": f"{view}/{cam}/{filename}",
                        "component_index": i,
                        "is_redacted": False,
                        # Left empty here; filled in by hand after generation.
                        "component_metadata": None,
                        "deidentification_metadata": None,
                    }
                )

video_component_file = pd.DataFrame.from_records(video_component_file_records)
video_component_file

## Write new manifest files to Output Directory

In [None]:
manifest_dir = os.path.join(OUTPUT_DIRECTORY, f"manifest_{MANIFEST_VERSION}")
# For every manifest file: the freshly generated rows and the columns that
# together uniquely identify a row.
file_dfs = {
    "take_metadata.csv": {
        "data": take_metadata,
        "primary_keys": ["university_take_id"]
    },
    "video_metadata.csv": {
        "data": video_metadata,
        "primary_keys": ["university_take_id", "university_video_id"]
    },
    "video_component_file.csv": {
        "data": video_component_file,
        "primary_keys": ["university_take_id", "university_video_id", "video_component_relative_path"]
    }
}

os.makedirs(manifest_dir, exist_ok=True)
for filename, spec in file_dfs.items():
    filepath = os.path.join(manifest_dir, filename)
    data, primary_keys = spec['data'], spec['primary_keys']

    # If the file already exists, keep all of its rows untouched and append
    # only the generated rows whose primary key is not present yet.
    if os.path.isfile(filepath):
        # Read key columns as strings: the generated ids are strings, and type
        # inference would turn an id like "001" into the integer 1, which both
        # corrupts the stored id and makes the key comparison fail.
        preexisting_data = pd.read_csv(
            filepath, dtype={key: str for key in primary_keys}
        )
        if data.empty:
            # Nothing was generated (the frame may not even have the key
            # columns), so the existing file content is kept as-is.
            data = preexisting_data
        else:
            # Anti-join on the key columns only. Because no other columns take
            # part in the merge, no _x/_y suffixed columns are ever created.
            existing_keys = preexisting_data[primary_keys].drop_duplicates()
            merged = data.merge(
                existing_keys, on=primary_keys, how="left", indicator=True
            )
            new_rows = merged[merged["_merge"] == "left_only"].drop(columns="_merge")
            data = pd.concat([preexisting_data, new_rows])
        print(f"Retained pre-existing data for {filename}")

    # Let pandas open the file itself so line endings are handled correctly
    # on every platform.
    data.to_csv(filepath, index=False)

print(f"Successfully wrote {len(file_dfs)} files to {manifest_dir}")