In [None]:
import json
import torch
import random
import os

In [None]:
def get_index_mapping(n_frames, window_size=32, stride=16, fps=30, frames=True):
    """
    Map every feature index to the window of frames used to compute that feature.

    Windows of `window_size` frames are laid out every `stride` frames, starting at
    frame 0 (defaults: 0 -> (0, 31), 1 -> (16, 47), ...). If the last regular window
    does not reach the final frame, one extra "loop-back" window that ends exactly on
    the final frame is appended so the tail of the video is always covered.

    Args:
        n_frames: total number of frames in the video.
        window_size: number of frames per window.
        stride: offset in frames between the starts of consecutive windows.
        fps: frame rate, only used to convert to seconds when `frames` is False.
        frames: if True the boundaries are frame numbers, otherwise seconds.

    Returns:
        dict(key: idx, val: (start, end)) with inclusive boundaries. Empty when
        `n_frames` is not positive.

    Raises:
        ValueError: if `window_size` or `stride` is not positive.
    """
    if window_size <= 0 or stride <= 0:
        # A non-positive stride would never advance the window (infinite loop).
        raise ValueError("window_size and stride must be positive")

    mapping = dict()
    if n_frames <= 0:
        return mapping

    last_frame = n_frames - 1
    idx = 0
    starting_frame, ending_frame = 0, window_size - 1
    covered_until = -1  # last frame reached by a regular window so far
    while ending_frame <= last_frame:
        mapping[idx] = (starting_frame, ending_frame)
        covered_until = ending_frame
        starting_frame += stride
        ending_frame = starting_frame + window_size - 1
        idx += 1

    # Loop-back window: needed exactly when the regular windows stop short of the
    # last frame. Testing coverage directly (instead of n_frames % stride) stays
    # correct when window_size is not a multiple of stride, and also handles videos
    # shorter than a single window.
    if covered_until < last_frame:
        # Clamp at 0 so short videos never yield a negative start frame.
        mapping[idx] = (max(0, last_frame - (window_size - 1)), last_frame)

    if not frames:
        mapping = {k: (v[0] / fps, v[1] / fps) for k, v in mapping.items()}
    return mapping

def get_matching_indices(start, end, frame_idx, sec=True, fps=30):
    """
    Find the feature indices whose frame windows overlap a clip.

    Args:
        start: clip start, in seconds if `sec` is True, otherwise in frames.
        end: clip end, in the same unit as `start`.
        frame_idx: mapping idx -> (start_frame, end_frame), indexable by
            consecutive integers starting at 0.
        sec: whether `start` and `end` are given in seconds.
        fps: frame rate used to convert seconds to frames.

    Returns:
        List of all matching feature indices (there can be more than one). The list
        is empty when the clip begins after the last window or `frame_idx` is empty.
    """
    if sec:
        start = start*fps
        end = end*fps

    n_windows = len(frame_idx)
    matching_list = []

    # Skip the windows that finish before the clip begins. The bounds check keeps a
    # clip that starts past the final window from indexing beyond the mapping.
    idx = 0
    while (idx < n_windows) and (frame_idx[idx][1] < start):
        idx += 1

    # Collect every following window that starts before the clip ends.
    while (idx < n_windows) and (frame_idx[idx][0] < end):
        matching_list.append(idx)
        idx += 1

    return matching_list

def get_frame_indices(start, end, window_size=32, stride=16, sec=True, fps=30):
    """
    List the (first_frame, last_frame) windows that must be forward passed to
    compute the features of one clip.

    Windows of `window_size` frames are taken every `stride` frames from the clip
    start. When the next window would run past the clip end, a final window ending
    exactly on `end` is emitted instead; its start is `window_size - 1` frames
    earlier (never below 0), so it may begin before `start`.

    `start` and `end` are seconds when `sec` is True (truncated to whole frames
    using `fps`), otherwise frame numbers.
    """
    if sec:
        start = int(start*fps)
        end = int(end*fps)

    span = window_size - 1
    windows = []
    lo, hi = start, start + span

    while True:
        if hi > end:
            # Window overshoots the clip: emit the end-aligned window and stop.
            windows.append((max(0, end - span), end))
            break
        windows.append((lo, hi))
        if hi == end:
            # The clip end is covered exactly; no extra window is needed.
            break
        lo += stride
        hi = lo + span

    return windows

In [None]:
# Load the take records; the context manager closes the file handle once parsing is done.
with open("/newdata/tarun/datasets/ego4d/takes.json") as fh:
    takes = json.load(fh)
# Lookup table: take_uid -> full take record.
uid_to_take = {t["take_uid"]:t for t in takes}

In [None]:
# Per-take lookup: take_uid -> {feature_idx: (start_frame, end_frame)}.
# The frame count is the take duration in seconds times 30, truncated to an int.
uid_to_frame = {}
for t in takes:
    n_frames = int(t['duration_sec'] * 30)
    uid_to_frame[t["take_uid"]] = get_index_mapping(n_frames)

In [None]:
# Directory prefix that is joined with "<take_uid>_<cam>.pt" to form each feature file name.
feature_root = "features/omnivore_video/"
# JSON metadata file: loaded below, then written back with the added 'clips' entries.
data_file = "../metadata/ego4d_cooking.json"

In [None]:
# Load the split metadata; the context manager closes the file handle once parsing is done.
with open(data_file) as fh:
    ego4d = json.load(fh)

In [None]:
# Build one clip record per segment (video path, feature path, feature indices and
# frame windows) for every split and store the list under ego4d[split]['clips'].
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']:

    # Keyed by segment_id, so a repeated id keeps only its last metadata entry.
    segid_to_meta = {}
    for m in ego4d[split]["metadata"]:
        segid_to_meta[m['segment_id']] = m

    features = []
    for segid, meta in segid_to_meta.items():

        take_uid = meta["take_uid"]
        take = uid_to_take[take_uid]
        aligned_videos = take["frame_aligned_videos"]
        video_dir = f"takes/{take['root_dir']}/frame_aligned_videos/downscaled/448"

        if "ego" in split:
            # Ego view: first camera whose name contains "aria", using its rgb stream.
            ego_camera = [a for a in aligned_videos.keys() if "aria" in a][0]
            cam = f"{ego_camera}_rgb"
            stream_info = aligned_videos[ego_camera]['rgb']
            filepath = f"{video_dir}/{stream_info['cam_id']}_{stream_info['stream_id']}.mp4"
        else:
            best_exo = meta['best_exo']
            if (best_exo is None) or (best_exo['cam_id'] is None):
                ## no best camera recorded: choose a random exo camera id
                exo_cameras = [a for a in aligned_videos.keys() if a.startswith(("gp","cam"))]
                exo_choice = random.sample(exo_cameras,1)[0]
            else:
                ## choose the best id
                exo_choice = best_exo['cam_id']
            cam = f"{exo_choice}_0"
            stream_info = aligned_videos[exo_choice]['0']
            filepath = f"{video_dir}/{stream_info['cam_id']}.mp4"

        start_time, end_time = meta["start_time"], meta["end_time"]
        clip = {
            'id' : segid,
            'video_file_name': filepath,
            'feature_file_name' : os.path.join(feature_root, f"{take_uid}_{cam}.pt"),
            'feature_indices' : get_matching_indices(start_time, end_time, uid_to_frame[take_uid], sec=True),
            'frame_indices' : get_frame_indices(start_time, end_time, sec=True)
        }
        features.append(clip)

    ego4d[split]['clips'] = features

In [None]:
# Write the augmented metadata back to data_file, i.e. the same path it was loaded
# from: this overwrites the input file in place.
with open(data_file, "w") as fh:
    json.dump(ego4d, fh, indent=4)

### Verification: sanity-check the generated clip records

In [1]:
import torch
import json

In [None]:
# Reload the take records; the context manager closes the file handle once parsing is done.
with open("/newdata/tarun/datasets/ego4d/takes.json") as fh:
    takes = json.load(fh)
# Lookup table: take_uid -> full take record.
id_to_take = {t["take_uid"]:t for t in takes}

In [None]:
id_to_take

In [2]:
# Reload the metadata written above; the context manager closes the file handle.
with open("../metadata/ego4d_cooking.json") as fh:
    ego4d = json.load(fh)

In [None]:
# Second-to-last "_"-separated piece of every ego_train_extra feature file name.
files = []
for f in ego4d["ego_train_extra"]["clips"]:
    name_parts = f["feature_file_name"].split("_")
    files.append(name_parts[-2])

In [None]:
# Print the video file of each exo_train clip whose second-to-last "_"-separated
# feature-file-name piece contains "gp".
for f in ego4d["exo_train"]["clips"]:
    camera_token = f["feature_file_name"].split("_")[-2]
    if "gp" not in camera_token:
        continue
    print(f["video_file_name"])

In [None]:
set(files)

In [None]:
ego4d["exo_train_extra"]["clips"][10]

In [None]:
# For every split: check that each clip's video and feature files exist, then that
# its feature indices select at least one row of the stored feature tensor.
dataset_root = "/newdata/tarun/datasets/ego4d/"
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']:
    files = ego4d[split]['clips']
    for f in files:
        video_path = os.path.join(dataset_root, f['video_file_name'])
        feature_path = os.path.join(dataset_root, f['feature_file_name'])

        # Report the first clip with a missing file and stop checking this split.
        if not (os.path.exists(video_path) and os.path.exists(feature_path)):
            print(f)
            break

        feat = torch.load(feature_path)
        assert len(f['feature_indices']) >= 1
        assert len(f['frame_indices']) >= 1

        # Average the selected feature rows into a single segment-level feature.
        index_tensor = torch.tensor(f['feature_indices'], dtype=torch.long)
        segment_feat = feat[index_tensor].mean(0).squeeze()
        assert len(segment_feat) >= 1

In [None]:
# Map each take uid (the part of the feature file's base name before the first "_")
# to one of its feature files; later clips overwrite earlier ones for the same uid.
uid_to_featfile = {}
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']:
    files = ego4d[split]['clips']
    for f in files:
        feature_file = f['feature_file_name']
        base_name = feature_file.split("/")[-1]
        uid_to_featfile[base_name.split("_")[0]] = feature_file

# The stored feature tensor must have exactly one row per window in the index mapping.
from tqdm import tqdm
for t, filename in tqdm(uid_to_featfile.items()):
    frame_len = len(uid_to_frame[t])
    feat = torch.load(os.path.join("/newdata/tarun/datasets/ego4d/", filename))
    assert len(feat) == frame_len

In [None]:
# Every entry stored under a split must have the same number of items.
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']:
    key_lens = {len(entries) for entries in ego4d[split].values()}
    assert len(key_lens) == 1

In [None]:
# Segment metadata of the exo_val split, inspected in the cells below.
meta = ego4d['exo_val']['metadata']

In [None]:
# Length of every segment: (end_time - start_time) scaled by 30
# (presumably the frame rate, giving a length in frames -- TODO confirm).
duration = [(m["end_time"] - m["start_time"])*30 for m in meta]

In [None]:
sorted(duration)

In [None]:
# Print every segment whose start and end times are identical.
zero_length = [m for m in meta if m["end_time"] == m["start_time"]]
for m in zero_length:
    print(m)

In [None]:
# Print the ego_train_extra clip record(s) with this specific segment id.
target_id = 1908762465
for c in ego4d['ego_train_extra']['clips']:
    if c["id"] != target_id:
        continue
    print(c)

In [8]:
# Video file name of every clip, across all splits (duplicates included).
all_videos = [
    f['video_file_name']
    for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']
    for f in ego4d[split]['clips']
]

In [9]:
# De-duplicate the video list. `all_videos` keeps holding *all* unique videos after
# this cell, so cells that count it do not depend on whether this cell ran first.
all_videos = set(all_videos)

# NOTE(review): only videos whose path contains "gp04" are exported even though the
# output file has a generic name -- confirm this filter is intended.
# Sorting makes the written order deterministic (set iteration order is not).
gp04_videos = sorted(v for v in all_videos if "gp04" in v)
with open("/newdata/tarun/datasets/ego4d/video_files.txt", "w") as fh:
    fh.write("\n".join(gp04_videos))

In [5]:
len(set(all_videos))

951

In [None]:
# Lookup table: segment_id -> class_name for the ego_train annotations.
id_to_annotation = dict(
    (seg["segment_id"], seg["class_name"])
    for seg in ego4d["ego_train"]["annotations"]
)

In [None]:
# Lookup table: segment_id -> text_caption for the ego_train descriptions.
id_to_text = dict(
    (seg["segment_id"], seg["text_caption"])
    for seg in ego4d["ego_train"]["descriptions"]
)

In [None]:
# Collect the number of feature indices of every clip and print the index tensors
# of clips that have four or fewer.
all_lens = []
for split in ['ego_train', 'ego_val', 'exo_train', 'exo_val', 'ego_train_extra', 'exo_train_extra']:
    files = ego4d[split]['clips']
    for f in files:
        index_tensor = torch.tensor(f['feature_indices'], dtype=torch.long)
        n_indices = len(index_tensor)
        all_lens.append(n_indices)
        if n_indices <= 4:
            print(index_tensor)

In [None]:
len(all_lens)

In [None]:
sum(torch.Tensor(all_lens) <= 8)

## Merge the extra training splits and export the final metadata

In [12]:
import json

In [13]:
# Load the metadata with clips; the context manager closes the file handle.
with open("../metadata/ego4d_cooking.json") as fh:
    ego4d = json.load(fh)

In [14]:
ego4d.keys()

dict_keys(['categories', 'ego_train', 'ego_val', 'ego_train_extra', 'exo_train', 'exo_val', 'exo_train_extra'])

In [15]:
ego4d['ego_train'].keys(), ego4d['ego_train_extra'].keys()

(dict_keys(['clips', 'annotations', 'metadata', 'descriptions']),
 dict_keys(['clips', 'annotations', 'metadata', 'descriptions']))

In [16]:
# Append every list of the *_train_extra splits to the matching *_train split,
# then remove the extra splits. Not re-runnable: the extra splits are gone afterwards.
for view in ('ego', 'exo'):
    train_split = ego4d[view + '_train']
    extra_split = ego4d[view + '_train_extra']
    for subkeys in ['clips', 'annotations', 'metadata', 'descriptions']:
        train_split[subkeys].extend(extra_split[subkeys])
del ego4d["ego_train_extra"]
del ego4d["exo_train_extra"]

In [17]:
ego4d.keys()

dict_keys(['categories', 'ego_train', 'ego_val', 'exo_train', 'exo_val'])

In [21]:
# Print how many items ego_val holds under each of its four entries.
ego_val = ego4d['ego_val']
for subkeys in ['clips', 'annotations', 'metadata', 'descriptions']:
    n_items = len(ego_val[subkeys])
    print(n_items)

3147
3147
3147
3147


In [23]:
# Export the merged metadata (extra training splits folded into the train splits)
# to a new JSON file.
with open("../metadata/EgoExoDA.json", "w") as fh:
    json.dump(ego4d, fh, indent=4)