## Preparation of Ego2Exo Dataset for Video Domain Adaptation

In [8]:
import json
import os
from collections import defaultdict, Counter
import random
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Dataset root directory (machine-specific absolute path). The annotation JSON
# files and takes.json read in later cells are resolved relative to it.
root_dir = "/newdata/tarun/datasets/ego4d_new/"

In [33]:
import hashlib

def get_hash(input_string, size=8):
    """Map a string to an integer derived from its MD5 digest.

    The first ``size`` hexadecimal characters of the digest are kept and
    interpreted as a base-16 number, which is returned as an ``int``.
    """
    digest = hashlib.md5(input_string.encode())
    prefix = digest.hexdigest()[:size]
    return int(prefix, base=16)

def valid_exo(names):
    """Return, in input order, the names that start with an accepted exo-camera prefix."""
    exo_prefixes = ("cam", "gp01", "gp02", "gp03", "gp04", "gp06")
    kept = []
    for name in names:
        if name.startswith(exo_prefixes):
            kept.append(name)
    return kept

In [5]:
# Read one take UID per line. Blank lines (e.g. a trailing newline at the end
# of the file) are dropped so they cannot become empty-string UIDs that fail
# the annotation lookups performed on every entry of `takes`.
with open("take_ids.txt") as fh:
    takes = [line.strip() for line in fh if line.strip()]

## Extract text for all segments

In [12]:
## arrange the keysteps into <take_uid:list(segments)>

keystep_path = os.path.join(root_dir, "annotations/keystep_train.json")
# Parse inside a context manager so the file handle is closed right after loading.
with open(keystep_path) as fh:
    keysteps_json = json.load(fh)['annotations']

keystep_segments = {}
for k in takes:
    # A take UID listed twice in `takes` would otherwise be processed twice downstream.
    assert k not in keystep_segments, "duplicate take uid: {}".format(k)
    keystep_segments[k] = keysteps_json[k]['segments']

In [13]:
## arrange the atomic descriptions into <take_uid:dict(annotator_id:list(desc))>

atomic_path = os.path.join(root_dir, "annotations/atomic_descriptions_train.json")
# Parse inside a context manager so the file handle is closed right after loading.
with open(atomic_path) as fh:
    atomic_json = json.load(fh)['annotations']

atomic_annotations = {}
for k in takes:
    # If an annotator_id appears more than once for a take, the last entry wins.
    atomic_annotations[k] = {
        entry['annotator_id']: entry['descriptions'] for entry in atomic_json[k]
    }

### Match the keysteps to atomic annotations takewise. for each annotator from each take, do the following:
1. give the segment a unique id: this is a hash of the take and the segment_id within the take.
2. identify the start and end times of the segment.
3. from the atomic annotations, find all the instances which fall between this start and end time.
4. concatenate these statements end to end - this forms the textual description of the keystep per annotator.
5. if no annotator has an atomic action description between the start and end times of a keystep, then the textual description of that keystep is the concatenation of the closest action descriptions before and after the keystep, taken from one randomly sampled annotator.
6. note that a single atomic description can be matched to multiple keysteps, since the keystep segments can overlap. But this happens rarely: about 75 out of 4262 times or fewer.
7. To fuse texts from different annotators, we just concatenate them end to end - but there should be better ways of doing this. 

In [14]:
## Build, for every keystep segment of every take:
##   segment_to_text[hash]       -> dict(annotator_id: concatenated description text)
##   segment_to_meta[hash]       -> the segment dict, extended with take/segment ids and one exo view
##   segment_to_annotation[hash] -> the same segment dict object (registered before it is extended)
## where hash = str(get_hash(take_uid + segment index)).
segment_to_text = defaultdict(list) ## NOTE: the values stored below are dicts, not lists
segment_to_meta = dict()
segment_to_annotation = dict()

for take_uid in takes:
    segment_list = keystep_segments[take_uid] ## list
    annotator_dict = atomic_annotations[take_uid] ## dict: ann_id:list of descriptions.

    for idx, segment in enumerate(segment_list):
        segment_hash = str(get_hash(str(take_uid) + str(idx)))
        segment_to_annotation[segment_hash] = segment
        start_time = segment['start_time']
        end_time = segment['end_time']

        text_dict = dict()
        best_exo = []
        for aid, ann_desc in annotator_dict.items():
            ann_text = []
            for desc in ann_desc:
                if start_time <= desc['timestamp'] <= end_time:
                    ## this atomic description falls inside the segment - so should correspond to the action.
                    ann_text.append(desc['text'][2:]) ## Ignore the subject ID.
                    best_exo.append(desc['best_exo'])
            if ann_text:
                text_dict[aid] = " ".join(ann_text)

        ## Fallback: no (non-empty) text was found inside the segment. Borrow the
        ## neighbouring descriptions of one randomly drawn annotator instead.
        ## A take without any annotator is left alone here; best_exo stays empty
        ## and the segment is skipped below instead of crashing random.sample.
        len_text = sum(len(v) for v in text_dict.values())
        if len_text == 0 and annotator_dict:
            aid = random.sample(list(annotator_dict.keys()),1)[0]
            ann_desc = annotator_dict[aid]
            ann_text = []

            ## "Before" context: the description whose successor is the first one past start_time.
            ## NOTE(review): when no consecutive pair straddles start_time the LAST description
            ## is used, even if every description lies after the segment - confirm this is intended.
            for jdx in range(len(ann_desc)):
                if jdx==(len(ann_desc)-1) or (ann_desc[jdx]['timestamp'] < start_time and ann_desc[jdx+1]['timestamp'] > start_time):
                    ann_text.append(ann_desc[jdx]['text'][2:])
                    best_exo.append(ann_desc[jdx]['best_exo'])
                    break

            ## "After" context: the first description past end_time.
            if len(ann_desc) and (ann_desc[0]['timestamp'] > end_time):
                ann_text.append(ann_desc[0]['text'][2:])
                best_exo.append(ann_desc[0]['best_exo'])
            else:
                for jdx in range(1,len(ann_desc)):
                    if (ann_desc[jdx]['timestamp'] > end_time and ann_desc[jdx-1]['timestamp'] < end_time):
                        ann_text.append(ann_desc[jdx]['text'][2:])
                        best_exo.append(ann_desc[jdx]['best_exo'])
                        break

            if ann_text:
                text_dict[aid] = " ".join(ann_text)

        if len(best_exo) == 0:
            ## If no exo-view recorded for an ego-view segment, skip the segment.
            continue
        segment_to_text[segment_hash] = text_dict
        ## The segment dict is extended in place, so keystep_segments and
        ## segment_to_annotation see the added keys as well.
        segment.update({
            'take_uid'  : take_uid,
            'segment_id': idx,
            'best_exo' : random.sample(best_exo,1)[0],
        })

        segment_to_meta[segment_hash] = segment

## Extract labels at L1 hierarchy for all the segments

In [15]:
# Reload the keystep file to get both the per-take annotations and the taxonomy;
# the context manager closes the file handle once parsing is done.
with open(keystep_path) as fh:
    keysteps = json.load(fh)
annotations = keysteps['annotations']
taxonomy = keysteps['taxonomy']

# Scenarios kept for this dataset (keys of `taxonomy`).
cooking = ['Making Coffee latte',
 'Making Cucumber & Tomato Salad',
 'Cooking Scrambled Eggs',
 'Cooking an Omelet',
 'Making Milk Tea',
 'Making Sesame-Ginger Asian Salad',
 'Cooking Tomato & Eggs',
 'Making Chai Tea',
 'Cooking Noodles',
 'Cooking Sushi Rolls',
 'Cooking Pasta']

# scenario name -> {leaf step id -> name of its top-level ancestor}
scenario_wise_mapping = {}
for scenario in cooking:

    ## compute the parent tree.
    idx_to_parent = {}
    child_ids = []
    idx_to_name = {}
    for m in taxonomy[scenario].values():
        idx_to_name[m['id']] = m['name']
        idx_to_parent[m['id']] = m['parent_id']
        if m['is_leafnode']:
            child_ids.append(m['id'])

    ## map the child labels to the parents.
    mapping = {}
    for cid in child_ids:
        # Climb until reaching the node whose parent_id is 0
        # (presumably the taxonomy root id - verify against the annotation schema).
        curr = cid
        while idx_to_parent[curr] != 0:
            curr = idx_to_parent[curr]
        mapping[cid] = idx_to_name[curr]

    scenario_wise_mapping[scenario] = mapping

In [16]:
# segment hash -> keystep segment dict, extended in place with its top-level
# label name, take uid, index within the take and scenario.
segment_to_ann = {}
all_labels = []
for take_uid in takes:
    take_ann = annotations[take_uid]
    for seg_idx, keystep in enumerate(take_ann["segments"]):
        ## Some segments carry step ids of 10000 and above; those are skipped.
        if keystep['step_id'] >= 10000:
            continue
        l1_name = scenario_wise_mapping[take_ann["scenario"]][keystep['step_id']]
        all_labels.append(l1_name)
        keystep.update(
            l1_label_name=l1_name,
            take_uid=take_uid,
            segment_id=seg_idx,
            scenario=take_ann['scenario'],
        )
        seg_key = str(get_hash(str(take_uid) + str(seg_idx)))
        segment_to_ann[seg_key] = keystep

In [22]:
## Selected based on label frequency and distribution between train and validation sets.
## The position of a label in this list becomes its integer class id.
valid_labels = [
    "Clean up",
    "Prepare milk (boiled)",
    "Prepare ingredients",
    "Construct undressed salad",
    "Make dough",
    "Brew coffee (instant coffee)",
    "Prepare a skillet",
    "Make chai tea",
    "Make pasta",
    "Cook noodles in a skillet",
    "Get kitchenware & utensils",
    "Turn off the stove",
    "Check paper recipe",
    "Boil noodles in boiling water",
    "Brew coffee (manual pour-over)",
    "Make milk tea",
    "Mix noodles with sauce in a bowl",
    "Make salad",
    "Serve",
    "Cook",
    "Add spring onions",
    "Get Ingredients",
    "Add water",
    "Prepare dressing",
]

## Sanity check: number of distinct labels.
len(set(valid_labels))

24

In [23]:
# Class id <-> label name lookups; ids follow the order of valid_labels.
idx_to_label = dict(enumerate(valid_labels))
label_to_idx = {name: class_id for class_id, name in idx_to_label.items()}

## Create the json files.

In [24]:
# Accumulator for the category list and the per-domain sections built in later cells.
json_data = dict()

In [25]:
# One {"category_name", "category_id"} record per entry of idx_to_label.
categories = []
for cat_id, cat_name in idx_to_label.items():
    categories.append({"category_name": cat_name, "category_id": cat_id})
json_data['categories'] = categories

In [27]:
# Group segment hashes by top-level label, keeping only segments that also
# have metadata (segment_to_meta) and whose label is one of valid_labels.
label_to_seg = defaultdict(list)
for seg_id, seg_ann in segment_to_ann.items():
    if seg_id in segment_to_meta:
        label = seg_ann['l1_label_name']
        if label in valid_labels:
            label_to_seg[label].append(seg_id)

In [30]:
# Per-label random split of the segments: int(55%) go to "source", the rest to
# "target". The split is drawn from the global `random` state, so it only
# reproduces if that state is seeded beforehand.
source_ids = []
target_ids = []
for segments in label_to_seg.values():
    source = random.sample(list(segments), int(len(segments)*0.55))
    # Set membership keeps the complement computation linear in the label size.
    source_set = set(source)
    target = [s for s in segments if s not in source_set]

    source_ids.extend(source)
    target_ids.extend(target)
seg_ids = {
    "source" : source_ids,
    "target" : target_ids
}

In [31]:
# Per-segment lookups for the training splits. Every segment gets the same
# placeholder filename; labels outside label_to_idx map to -1.
fid_to_filename = dict.fromkeys(segment_to_ann, "file.jpg")
fid_to_category = {}
fid_to_label = {}
for seg_key, seg_record in segment_to_ann.items():
    l1_name = seg_record['l1_label_name']
    fid_to_category[seg_key] = l1_name
    fid_to_label[seg_key] = label_to_idx.get(l1_name, -1)

In [32]:
# Assemble the "source" and "target" sections of json_data. Each section holds
# four parallel lists (clips, annotations, metadata, descriptions) with one
# entry per segment hash in seg_ids[domain].
for domain in ['source', 'target']:
    all_ids = seg_ids[domain]

    ## images
    clips = [
        {"filename": fid_to_filename[fid], "id": int(fid)}
        for fid in all_ids
    ]

    ## annotations
    anns = [
        {
            "segment_id": int(fid),
            "category": fid_to_label[fid],
            'class_name': fid_to_category[fid],
        }
        for fid in all_ids
    ]

    ## metadata
    meta = []
    for fid in all_ids:
        record = segment_to_meta[fid]
        meta.append({
            'segment_id': int(fid),
            'start_time': record['start_time'],
            'end_time': record['end_time'],
            'take_uid': record['take_uid'],
            'segment_index': record['segment_id'],
            'best_exo': record['best_exo'],
        })

    ## text description
    text_desc = []
    for fid in all_ids:
        per_annotator = segment_to_text[fid]
        text_desc.append({
            'segment_id': int(fid),
            # Annotator texts are joined back to back, with no separator.
            'text_caption': "".join(per_annotator.values()),
            'annotator_texts': dict(per_annotator),
        })

    json_data[domain] = {
        "clips": clips,
        "annotations": anns,
        "metadata": meta,
        "descriptions": text_desc,
    }

## Add Validation Data

In [37]:
take_path = os.path.join(root_dir, "takes.json")
# NOTE: `takes` is rebound here from the list of take UIDs read from
# take_ids.txt to the list of take records in takes.json; the cells above that
# iterate over UIDs will not work if re-run after this one.
# The context manager closes the file handle once parsing is done.
with open(take_path) as fh:
    takes = json.load(fh)
uid_to_take = {t["take_uid"]:t for t in takes}

In [38]:
val_keystep_path = os.path.join(root_dir, "annotations/keystep_val.json")
# Parse inside a context manager so the file handle is closed right after loading.
with open(val_keystep_path) as fh:
    val_annotations = json.load(fh)['annotations']
# Keep only the validation takes whose scenario is one of the `cooking` scenarios.
val_annotations = {k:v for k,v in val_annotations.items() if v['scenario'] in cooking}

In [41]:
# Same structure as scenario_wise_mapping, restricted to step ids whose
# top-level label is one of valid_labels.
scenario_wise_mapping_pruned = {
    scenario_name: {
        step_id: l1_name
        for step_id, l1_name in leaf_map.items()
        if l1_name in valid_labels
    }
    for scenario_name, leaf_map in scenario_wise_mapping.items()
}

In [43]:
# Validation segments: segment hash -> segment dict, extended in place with its
# label, ids, scenario and a randomly drawn exo camera.
seg_to_ann = {}
for take_uid, take_ann in val_annotations.items():
    scenario = take_ann['scenario']

    # One exo camera is drawn per take (before looking at its segments) and is
    # shared by every kept segment of that take.
    camera_names = uid_to_take[take_uid]['frame_aligned_videos'].keys()
    chosen_exo = random.sample(valid_exo(camera_names), 1)[0]

    for seg_idx, keystep in enumerate(take_ann["segments"]):
        label_name = scenario_wise_mapping_pruned[scenario].get(keystep['step_id'])
        if label_name is None:
            # Step id has no label among the retained ones: drop the segment.
            continue
        seg_key = str(get_hash(str(take_uid) + str(seg_idx)))
        keystep.update({
            'l1_label_name': label_name,
            'l1_label_id': label_to_idx[label_name],
            'take_uid': take_uid,
            'segment_index': seg_idx,
            'scenario': take_ann['scenario'],
            # raw_cam_id is the integer value of the last two characters of the camera name.
            'best_exo': {
                'cam_id': chosen_exo,
                'raw_cam_id': str(int(chosen_exo[-2:])),
            },
        })
        seg_to_ann[seg_key] = keystep

In [44]:
# Per-segment lookups for the validation segments (these replace the training ones).
fid_to_filename = dict.fromkeys(seg_to_ann, "file.jpg")
fid_to_category = {}
fid_to_label = {}
for seg_key, seg_record in seg_to_ann.items():
    fid_to_category[seg_key] = seg_record['l1_label_name']
    fid_to_label[seg_key] = seg_record['l1_label_id']

In [45]:
## Build the validation section: the same four parallel lists as for training,
## with one entry per validation segment hash.
all_ids = list(seg_to_ann)

## images
clips = [
    {"filename": fid_to_filename[fid], "id": int(fid)}
    for fid in all_ids
]

## annotations
anns = [
    {
        "segment_id": int(fid),
        "category": fid_to_label[fid],
        'class_name': fid_to_category[fid],
    }
    for fid in all_ids
]

## metadata
meta = []
for fid in all_ids:
    record = seg_to_ann[fid]
    meta.append({
        'segment_id': int(fid),
        'start_time': record['start_time'],
        'end_time': record['end_time'],
        'take_uid': record['take_uid'],
        'segment_index': record['segment_index'],
        'best_exo': record['best_exo'],
    })

## text description: validation segments get an empty caption and no annotator texts.
text_desc = [
    {'segment_id': int(fid), 'text_caption': "", 'annotator_texts': {}}
    for fid in all_ids
]

## Both validation entries are separate dicts that reference the SAME four
## list objects.
val_section = {
    "clips": clips,
    "annotations": anns,
    "metadata": meta,
    "descriptions": text_desc,
}
json_data['source_val'] = val_section
json_data['target_val'] = dict(val_section)

In [48]:
# Final layout: the "source" sections become the exo splits and the "target"
# sections become the ego splits.
ego2exo = dict(
    categories=json_data['categories'],
    exo_train=json_data['source'],
    exo_val=json_data['source_val'],
    ego_train=json_data['target'],
    ego_val=json_data['target_val'],
)

## Add training frames and segment_ids

In [49]:
def get_index_mapping(n_frames, window_size=32, stride=16, fps=30, frames=True):
    """
    Enumerate the sliding windows laid over a video of ``n_frames`` frames.

    Windows are ``window_size`` frames long and start every ``stride`` frames,
    for as long as the whole window fits inside the video. When ``n_frames`` is
    not a multiple of ``stride``, one extra "loop-back" window ending on the
    last frame is appended.

    Returns dict(idx -> (start_frame, end_frame)), both bounds inclusive. With
    ``frames=False`` the bounds are divided by ``fps``.
    """
    span = window_size - 1

    # Regular windows: (0, 31), (16, 47), ... for the default arguments.
    windows = []
    first = 0
    while first + span < n_frames:
        windows.append((first, first + span))
        first += stride

    # Loop-back window anchored on the final frame, for the edge case.
    if n_frames % stride != 0:
        last = n_frames - 1
        windows.append((last - span, last))

    mapping = dict(enumerate(windows))
    if frames:
        return mapping
    return {key: (lo / fps, hi / fps) for key, (lo, hi) in mapping.items()}

def get_matching_indices(start, end, frame_idx, sec=True, fps=30):
    """
    Given a start and end time of a video clip, find what feature_ids correspond to that particular clip.
    There can be more than one feature_idx, so we return the list of all such indices.

    ``frame_idx`` maps consecutive integers starting at 0 to
    (start_frame, end_frame) pairs. When ``sec`` is true, ``start`` and ``end``
    are multiplied by ``fps`` before being compared against those frame bounds.

    Returns an empty list when ``frame_idx`` is empty or when the clip starts
    after the end of the last window, instead of raising KeyError.
    """
    if sec:
        start = start*fps
        end = end*fps

    n_windows = len(frame_idx)

    # Skip the windows that end before the clip starts. The bound check keeps
    # the scan inside the mapping when the clip lies past the last window.
    idx = 0
    while idx < n_windows and frame_idx[idx][1] < start:
        idx += 1

    # Collect the windows that begin before the clip ends.
    matching_list = []
    while idx < n_windows and frame_idx[idx][0] < end:
        matching_list.append(idx)
        idx += 1

    return matching_list

def get_frame_indices(start, end, window_size=32, stride=16, sec=True, fps=30):
    """
    List the (first_frame, last_frame) windows to forward-pass for one clip.

    With ``sec`` true, ``start`` and ``end`` are multiplied by ``fps`` and
    truncated to ints first. Windows of ``window_size`` frames are laid out
    from ``start`` every ``stride`` frames while they fit inside the clip. If
    the last fitting window does not end exactly on ``end`` (or no window fits
    at all), a final window ending on ``end`` is appended; its start is clamped
    at frame 0.
    """
    if sec:
        start, end = int(start * fps), int(end * fps)

    span = window_size - 1
    windows = []

    lo, hi = start, start + span
    while hi <= end:
        windows.append((lo, hi))
        if hi == end:
            # The clip is covered exactly; no trailing window is needed.
            return windows
        lo += stride
        hi = lo + span

    # The next regular window would overshoot the clip: anchor one on `end` instead.
    windows.append((max(0, end - span), end))
    return windows

In [50]:
# take uid -> feature-window index mapping. The frame count is derived from the
# take duration at a hard-coded 30 frames per second.
uid_to_frame = {}
for t in takes:
    uid_to_frame[t["take_uid"]] = get_index_mapping(int(t['duration_sec'] * 30))

In [54]:
# For every split, replace the placeholder "clips" list with one record per
# segment holding: the video file path, the feature file path, the indices of
# the feature windows overlapping the segment, and the frame windows to read.
feature_root = "features/omnivore_video/"
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val']:
    
    # Keyed by segment_id, so repeated ids within a split collapse to one entry.
    segid_to_meta = {m['segment_id']:m for m in ego2exo[split]["metadata"]}
    
    features = []
    for segid,meta in segid_to_meta.items():

        take_uid = meta["take_uid"]
        if take_uid not in uid_to_take:
            # No record for this take in takes.json: the segment gets no clip entry.
            continue

        if "ego" in split:
            # Ego splits: pick at random among the cameras whose name contains "aria" or "gp05".
            all_cameras = uid_to_take[take_uid]["frame_aligned_videos"].keys()
            ego_camera = [a for a in all_cameras if "aria" in a or "gp05" in a]
            ego_camera = random.choice(ego_camera)
            if "aria" in ego_camera:
                # "aria" cameras use their 'rgb' stream; the file name includes the stream id.
                cam = "{}_{}".format(ego_camera, "rgb")
                stream_info = uid_to_take[take_uid]["frame_aligned_videos"][ego_camera]['rgb']
                filepath = "{}/frame_aligned_videos/downscaled/448/{}_{}.mp4".format(uid_to_take[take_uid]["root_dir"], stream_info['cam_id'], stream_info['stream_id'])
            elif "gp05" in ego_camera:
                # "gp05" cameras use stream '0'; the file name is the camera id alone.
                cam = "{}_0".format(ego_camera)
                stream_info = uid_to_take[take_uid]["frame_aligned_videos"][ego_camera]['0']
                filepath = "{}/frame_aligned_videos/downscaled/448/{}.mp4".format(uid_to_take[take_uid]["root_dir"], stream_info['cam_id'])
            else:
                raise ValueError
        else:
            # Exo splits: use the segment's recorded best exo camera when there is one.
            if (meta['best_exo'] is None) or (meta['best_exo']['cam_id'] is None):
                ## choose a random id
                all_cameras = uid_to_take[take_uid]["frame_aligned_videos"].keys()
                exo_cameras = [a for a in all_cameras if a.startswith(("gp01","gp02","gp03","gp04","gp06","cam"))]
                exo_choice = random.sample(exo_cameras,1)[0]
                cam = "{}_0".format(exo_choice)
            else:
                ## choose the best id
                cam = "{}_0".format(meta['best_exo']['cam_id']) 
                exo_choice = meta['best_exo']['cam_id']
                # NOTE(review): `cam` (used in feature_file_name) is formatted before this
                # substitution, so a "gp05" best view keeps "gp05_0" in the feature file name
                # while the video path below is taken from "gp04" - confirm this is intended.
                if "gp05" == exo_choice:
                    exo_choice = "gp04"
            stream_info = uid_to_take[take_uid]["frame_aligned_videos"][exo_choice]['0']
            filepath = "{}/frame_aligned_videos/downscaled/448/{}.mp4".format(uid_to_take[take_uid]["root_dir"], stream_info['cam_id'])

        features.append({
            'id' : segid,
            'video_file_name': filepath,
            'feature_file_name' : os.path.join(feature_root, "{}_{}.pt".format(take_uid,cam)),
            # Segment times are passed as seconds (sec=True) to both helpers.
            'feature_indices' : get_matching_indices(meta["start_time"], meta["end_time"], uid_to_frame[meta["take_uid"]], sec=True),
            'frame_indices' : get_frame_indices(meta["start_time"], meta["end_time"], sec=True)
        })
    
    print("{},{}".format(split, len(features)))
    ego2exo[split]['clips'] = features

ego_train,4100
ego_val,3168
exo_train,4986
exo_val,3168


In [56]:
## normalize segments across all the domains and data.
## Within each split, every list other than "clips" is filtered down to the
## segment ids that have a clip entry, so all lists of a split end up the same length.
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val']:
    print(split)
    # A set makes each membership test constant-time instead of a scan over all clips.
    valid_ids = {c["id"] for c in ego2exo[split]["clips"]}

    for ks in list(ego2exo[split].keys()):
        if ks == "clips":
            continue
        old_list = ego2exo[split][ks]
        new_list = [o for o in old_list if o["segment_id"] in valid_ids]
        ego2exo[split][ks] = new_list
        assert len(new_list) == len(ego2exo[split]['clips'])

ego_train
ego_val
exo_train
exo_val


In [57]:
# Write the assembled dataset description to ego2exo.json in the working directory.
with open("ego2exo.json", "w") as out_file:
    json.dump(ego2exo, out_file, indent=4)