In [26]:
import json
from collections import defaultdict, Counter
import random
import matplotlib.pyplot as plt
import numpy as np

In [27]:
import hashlib

def get_hash(input_string, size=8):
    """Map a string to an integer taken from a prefix of its MD5 digest.

    Args:
        input_string: Text to hash; it is encoded with ``str.encode()``
            before being passed to MD5.
        size: Number of leading hexadecimal digits of the digest to keep
            (8 by default).

    Returns:
        The kept hexadecimal prefix interpreted as a base-16 integer.
    """
    md5_digest = hashlib.md5(input_string.encode())
    hex_prefix = md5_digest.hexdigest()[:size]
    return int(hex_prefix, base=16)

In [28]:
# Read the take UIDs, one per line, with surrounding whitespace removed.
with open("cooking_keystep_atomic_common_takes.txt") as takes_file:
    takes = [line.strip() for line in takes_file]

In [4]:
# Number of take UIDs read from the list file.
len(takes)

125

In [9]:
# Build one `egoexo` command line per take and save them all as a shell script.
download_cmd = "egoexo -o /newdata/tarun/datasets/ego4d --benchmarks keystep --parts takes --uids {} --yes\n"
write_str = "".join(download_cmd.format(take) for take in takes)
with open("takes_download.sh", "w") as script_file:
    script_file.write(write_str)

## Extract text for all segments

In [17]:
## arrange the keysteps into <take_uid:list(segments)>

# Open the annotation file in a context manager so the handle is closed as soon
# as parsing finishes instead of being left for the garbage collector.
with open("annotations/keystep_train.json") as keystep_file:
    keysteps_json = json.load(keystep_file)['annotations']

keystep_segments = {}
for k in takes:
    # A repeated take UID would silently overwrite an earlier entry.
    assert k not in keystep_segments, "duplicate take uid in takes: {}".format(k)
    keystep_segments[k] = keysteps_json[k]['segments']

In [18]:
## arrange the atomic descriptions into <take_uid:dict(annotator_id:list(desc))>

# Open the annotation file in a context manager so the handle is closed as soon
# as parsing finishes instead of being left for the garbage collector.
with open("annotations/atomic_descriptions_train.json") as atomic_file:
    atomic_json = json.load(atomic_file)['annotations']

atomic_annotations = {}
for k in takes:
    # One entry per annotator; if an annotator id repeats within a take,
    # the later entry replaces the earlier one.
    atomic_annotations[k] = {
        annotation['annotator_id']: annotation['descriptions']
        for annotation in atomic_json[k]
    }

### Match the keysteps to atomic annotations take-wise. For each annotator in each take, do the following:
1. give the segment a unique id: this is a hash of the take and the segment_id within the take.
2. identify the start and end times of the segment.
3. from the atomic annotations, find all the descriptions whose timestamp falls between this start and end time (both inclusive).
4. concatenate these statements end to end - this forms the textual description of the keystep per annotator.
5. if no annotator has an atomic action description between the start and end times of a keystep, then the textual description of that keystep is the concatenation of the closest action description before the keystep and the closest one after it, both taken from one randomly sampled annotator.
6. note that a single atomic description can be matched to multiple keysteps, since the keystep segments can overlap. But this happens rarely, around 75/4262 times or fewer.
7. To fuse texts from different annotators, we just concatenate them end to end - but there should be better ways of doing this. 

In [19]:
## Build, for every keystep segment, its per-annotator text and its metadata.
## All three dicts are keyed by the segment hash (a string).
segment_to_text = defaultdict(list) ## stored values are dicts of <annotator_id: text>
segment_to_meta = dict()
segment_to_annotation = dict()

# Cell-local, seeded generator: re-running this cell reproduces the same fallback
# annotators and best_exo picks, and the global `random` state is left untouched.
match_rng = random.Random(0)

for take_uid in takes:
    segment_list = keystep_segments[take_uid] ## list
    annotator_dict = atomic_annotations[take_uid] ## dict: ann_id:list of descriptions.

    for idx, segment in enumerate(segment_list):
        # NOTE(review): the id keeps only the default 8 hex digits of the hash, so
        # two segments could in principle collide and overwrite each other below.
        segment_hash = str(get_hash(str(take_uid) + str(idx)))
        segment_to_annotation[segment_hash] = segment
        start_time = segment['start_time']
        end_time = segment['end_time']

        text_dict = dict()
        best_exo = []
        for aid, ann_desc in annotator_dict.items():
            ## atomic descriptions whose timestamp falls inside the segment
            ## are taken to describe the keystep.
            inside = [desc for desc in ann_desc
                      if start_time <= desc['timestamp'] <= end_time]
            best_exo.extend(desc['best_exo'] for desc in inside)
            if inside:
                ## [2:] drops the leading subject ID from each description.
                text_dict[aid] = " ".join(desc['text'][2:] for desc in inside)

        len_text = sum(len(v) for v in text_dict.values())
        # Fallback: nothing matched for any annotator. Skip it when the take has
        # no annotators at all (sampling from an empty list used to raise).
        if len_text == 0 and annotator_dict:
            aid = match_rng.choice(list(annotator_dict.keys()))
            ann_desc = annotator_dict[aid]
            neighbours = []

            # Closest description BEFORE the segment. The timestamp check also
            # applies to the last element: previously the last description was
            # appended unconditionally, so a segment that precedes every
            # description received the final description as its "before" text.
            for jdx, desc in enumerate(ann_desc):
                is_last = jdx == len(ann_desc) - 1
                if desc['timestamp'] < start_time and (
                        is_last or ann_desc[jdx+1]['timestamp'] > start_time):
                    neighbours.append(desc)
                    break

            # Closest description AFTER the segment: the first element if it is
            # already past the end, otherwise the first one that crosses it.
            for jdx, desc in enumerate(ann_desc):
                if desc['timestamp'] > end_time and (
                        jdx == 0 or ann_desc[jdx-1]['timestamp'] < end_time):
                    neighbours.append(desc)
                    break

            if neighbours:
                text_dict[aid] = " ".join(desc['text'][2:] for desc in neighbours)
                best_exo.extend(desc['best_exo'] for desc in neighbours)

        segment_to_text[segment_hash] = text_dict
        # `segment` is the same dict object stored in keystep_segments, so this
        # update is visible there as well.
        segment.update({
            'take_uid'  : take_uid,
            'segment_id': idx,
            # Empty-dict fallback (the guard the original had commented out)
            # avoids a crash when no description supplied a best_exo entry.
            'best_exo' : match_rng.choice(best_exo) if best_exo else {},
        })

        segment_to_meta[segment_hash] = segment

In [None]:
# Persist the per-segment texts and metadata side by side in one JSON file.
text_payload = {'text':segment_to_text, 'metadata':segment_to_meta}
with open("cooking_keystep_text.json", "w") as out_file:
    json.dump(text_payload, out_file, indent=4)

## Extract labels at L1 hierarchy for all the segments

In [None]:
# Parse the keystep file inside a context manager so the handle is closed
# right after loading instead of being left open.
with open("annotations/keystep_train.json") as keystep_file:
    keysteps = json.load(keystep_file)
annotations = keysteps['annotations']
taxonomy = keysteps['taxonomy']

In [None]:
# Names of the cooking scenarios handled by this notebook.
cooking = [
    'Making Coffee latte',
    'Making Cucumber & Tomato Salad',
    'Cooking Scrambled Eggs',
    'Cooking an Omelet',
    'Making Milk Tea',
    'Making Sesame-Ginger Asian Salad',
    'Cooking Tomato & Eggs',
    'Making Chai Tea',
    'Cooking Noodles',
    'Cooking Sushi Rolls',
    'Cooking Pasta',
]

In [None]:
## For every cooking scenario, map each leaf node id of its taxonomy to the name
## of the ancestor that sits directly below parent id 0.
scenario_wise_mapping = {}
for scenario in cooking:
    nodes = list(taxonomy[scenario].values())

    ## lookup tables over this scenario's taxonomy nodes.
    idx_to_name = {node['id']: node['name'] for node in nodes}
    idx_to_parent = {node['id']: node['parent_id'] for node in nodes}
    child_ids = [node['id'] for node in nodes if node['is_leafnode']]

    ## climb from each leaf until the current node's parent id is 0.
    mapping = {}
    for cid in child_ids:
        ancestor = cid
        while True:
            parent = idx_to_parent[ancestor]
            if parent == 0:
                break
            ancestor = parent
        mapping[cid] = idx_to_name[ancestor]

    scenario_wise_mapping[scenario] = mapping

In [None]:
## Attach the remapped (L1) label name to every segment of every take.
## segment_to_ann is keyed by the same segment hash string used for the texts.
segment_to_ann = {}
all_labels = []
for take_uid in takes:
    ann = annotations[take_uid]
    # Every listed take must belong to one of the cooking scenarios. This replaces
    # the former `assert 5==6`, which failed without saying which take was at fault.
    assert ann["scenario"] in cooking, \
        "take {} has unexpected scenario {!r}".format(take_uid, ann["scenario"])

    leaf_to_l1 = scenario_wise_mapping[ann["scenario"]]
    for idx, seg in enumerate(ann["segments"]):
        segment_hash = str(get_hash(str(take_uid) + str(idx)))
        # Raises KeyError if the segment's step_id is not one of the mapped leaf ids.
        label_remapped_name = leaf_to_l1[seg['step_id']]
        all_labels.append(label_remapped_name)
        # Updates the segment dict in place, so `annotations` sees the new keys too.
        seg.update({
            'l1_label_name' : label_remapped_name,
            'take_uid'      : take_uid,
            'segment_id'    : idx,
            'scenario'      : ann['scenario']
        })
        segment_to_ann[segment_hash] = seg

In [None]:
# Collapse the collected label names to a sorted list of unique values.
all_labels = sorted(set(all_labels))



In [None]:
# for k,ann in segment_to_ann.items():
#     ann.update({
#         'l1_label' : label_to_idx[ann['l1_label_name']]
#     })

In [None]:
# Persist the label-augmented segment annotations under a single top-level key.
annotation_payload = {'annotation':segment_to_ann}
with open("cooking_keystep_annotation.json", "w") as out_file:
    json.dump(annotation_payload, out_file, indent=4)

In [None]:
# One label name per segment, in the iteration order of segment_to_ann.
label_list = []
for seg_ann in segment_to_ann.values():
    label_list.append(seg_ann["l1_label_name"])

In [None]:
# Segment count per label name; keys keep the order of first appearance in label_list.
label_dict = Counter()
label_dict.update(label_list)

In [None]:
# Keep only the labels that occur on at least 5 segments.
valid_labels = [name for name, count in label_dict.items() if count >= 5]

In [None]:
# Number of distinct labels in valid_labels.
len(set(valid_labels))

In [None]:
# Give the kept labels consecutive integer ids (list order), plus the inverse lookup.
idx_to_label = dict(enumerate(valid_labels))
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

## Create the text training files.

In [None]:
# Reload the two intermediate files written earlier. Context managers make sure
# both handles are closed instead of being left open after json.load returns.
with open("cooking_keystep_text.json") as text_file:
    text = json.load(text_file)
seg_to_text = text['text']
seg_to_meta = text['metadata']

with open("cooking_keystep_annotation.json") as annotation_file:
    seg_to_ann = json.load(annotation_file)['annotation']

In [None]:
# Container for the dataset description; starts out empty.
json_data = {}

In [None]:
# One record per kept label, pairing its name with its integer id.
categories = []
for idx, cname in idx_to_label.items():
    categories.append({"category_name":cname,  "category_id":idx})
json_data['categories'] = categories

In [None]:
# Group segment ids by label name, skipping segments whose label was filtered out.
label_to_seg = defaultdict(list)
for seg_id, ann in seg_to_ann.items():
    label_name = ann['l1_label_name']
    if label_name not in valid_labels:
        continue
    label_to_seg[label_name].append(seg_id)

In [None]:
## Split every label's segments into a source part and a target part.

# Cell-local, seeded generator: re-running this cell reproduces the same split,
# and the global `random` state is left untouched.
split_rng = random.Random(0)

source_ids = []
target_ids = []
for segments in label_to_seg.values():
    # 55% of the label's segments (rounded down) are sampled for the source
    # domain; whatever is left goes to the target domain, in original order.
    source = split_rng.sample(segments, int(len(segments)*0.55))
    # Set lookup instead of scanning the `source` list once per segment.
    source_lookup = set(source)
    target = [s for s in segments if s not in source_lookup]

    source_ids.extend(source)
    target_ids.extend(target)
seg_ids = {
    "source" : source_ids,
    "target" : target_ids
}

In [None]:
# Per-segment lookups: a constant filename for every id in seg_to_text, the label
# name, and the integer label id (-1 when the label has no id in label_to_idx).
fid_to_filename = dict.fromkeys(seg_to_text, "file.jpg")
fid_to_category = {}
fid_to_label = {}
for seg_key, seg_ann in seg_to_ann.items():
    label_name = seg_ann['l1_label_name']
    fid_to_category[seg_key] = label_name
    fid_to_label[seg_key] = label_to_idx.get(label_name, -1)

In [None]:
## Assemble the clip, annotation, metadata and description records of each domain.
## The four lists are filled in one pass, so record i of each list refers to the
## same segment.
for domain in ['source', 'target']:
    all_ids = seg_ids[domain]

    clips = []
    anns = []
    meta = []
    text_desc = []

    for fid in all_ids:
        # Ids are strings in the JSON-backed dicts but integers in the output records.
        segment_id = int(fid)
        meta_dict = seg_to_meta[fid]
        # Local names avoid clobbering the module-level `text` with a loop variable.
        annotator_texts = dict(seg_to_text[fid])

        ## images
        clips.append({
            "filename" : fid_to_filename[fid],
            "id"       : segment_id,
        })

        ## annotations
        anns.append({
            "segment_id" : segment_id,
            "category" : fid_to_label[fid],
            'class_name' : fid_to_category[fid]
        })

        ## metadata
        meta.append({
            'segment_id'    : segment_id,
            'start_time'  : meta_dict['start_time'],
            'end_time'    : meta_dict['end_time'],
            'take_uid'    : meta_dict['take_uid'],
            'segment_index' : meta_dict['segment_id'],
            'best_exo'    : meta_dict['best_exo']
        })

        ## text description
        text_desc.append({
            'segment_id' : segment_id,
            # Annotator texts are separated by a space; plain `+=` used to fuse
            # the last word of one annotator with the first word of the next.
            'text_caption' : " ".join(annotator_texts.values()),
            'annotator_texts' : annotator_texts
        })

    json_data[domain] = {
        "clips" : clips,
        "annotations" : anns,
        "metadata"    : meta,
        "descriptions" : text_desc
    }

In [None]:
## Write the assembled dataset description to disk.
## (Author's note: 22 categories from source and target, ~2000 segment clips from each.)
with open("ego4d_cooking.json", "w") as out_file:
    json.dump(json_data, out_file, indent=4)

## create text training files

In [17]:
# Load the dataset description inside a context manager so the file handle is
# closed right after parsing instead of being left open.
with open("ego4d_cooking.json") as ego4d_file:
    ego4d = json.load(ego4d_file)

In [18]:
## Write one ';'-separated line per (segment, annotator text):
## segment_id;text;class_name;category
# NOTE(review): the output directory is an absolute, user-specific path.
for domain in ['source', 'target']:
    split_path = "/home/tarun/LangBasedGeoDA/data/ego4d/{}_train_split.txt".format(domain)
    with open(split_path, "w") as fh:
        rows = []
        for text, ann in zip(ego4d[domain]['descriptions'], ego4d[domain]['annotations']):
            for ann_text in text['annotator_texts'].values():
                # Newlines and ';' inside the text would break the line format.
                cleaned = ann_text.replace("\n",". ").replace(";"," ")
                rows.append("{};{};{};{}\n".format(text['segment_id'], cleaned, ann['class_name'], ann['category']))
        write_str = "".join(rows)
        fh.write(write_str)

In [23]:
# Number of annotation records in the source split.
len(ego4d['source']['annotations'])

2301

In [24]:
# Number of annotator texts attached to each source description.
lens = []
for description in ego4d['source']['descriptions']:
    lens.append(len(description['annotator_texts']))

In [25]:
# How many source descriptions have each number of annotator texts.
Counter(lens)

Counter({2: 1258, 1: 1043})

In [22]:
# Scratch arithmetic left over from interactive use.
# NOTE(review): the operands do not match the Counter output shown above
# ({2: 1258, 1: 1043}) and this cell's execution count precedes it, so the
# numbers presumably come from an earlier run — confirm or remove.
1107*2+793

3007