# Generating Synthetic Data

In [1]:
import pandas as pd
from os.path import join

In [2]:
# Folder containing the raw annotation CSV; the generated JSON files are written here too.
DATA_FOLDER = 'resource/perception_object_states'
# Names of the objects whose state is tracked, keyed by recipe name.
OBJECTS = {'pinwheels': ['tortilla', 'plate', 'knife'], 'oatmeal': ['bowl']}

In [3]:
# Load the raw annotations for every video and preview the first ten rows.
raw_annotations_path = join(DATA_FOLDER, 'raw_annotations.csv')
annotations = pd.read_csv(raw_annotations_path)
annotations.head(10)

Unnamed: 0,recipe,video_name,time,step,bowl,microwave,tortilla,plate,knife,Object exhaustive
0,pinwheels,pinwheels_2023.03.01-21.35.14,0:00,place tortilla,,,in-package,empty,,
1,pinwheels,pinwheels_2023.03.01-21.35.14,0:15,,,,plain,,,
2,pinwheels,pinwheels_2023.03.01-21.35.14,0:52,spread PB,,,,,,
3,pinwheels,pinwheels_2023.03.01-21.35.14,1:07,,,,peanut-butter[partial],,,
4,pinwheels,pinwheels_2023.03.01-21.35.14,1:27,,,,peanut-butter[full],,,
5,pinwheels,pinwheels_2023.03.01-21.35.14,1:42,clean knife,,,,,,
6,pinwheels,pinwheels_2023.03.01-21.35.14,3:43,spread J,,,,,,
7,pinwheels,pinwheels_2023.03.01-21.35.14,,,,,pb+jelly[partial],,,
8,pinwheels,pinwheels_2023.03.01-21.35.14,,,,,pb+jelly[full],,,
9,pinwheels,pinwheels_2023.03.01-21.35.14,4:13,clean knife,,,,,,


### Cleaning Annotated Data

In [4]:
import copy

def to_seconds(minutes_seconds):
    """Convert a colon-separated timestamp into a total number of seconds.

    Accepts ``'M:SS'`` (the format used in the annotation file) and, as a
    backward-compatible extension, ``'H:MM:SS'``: every field is worth 60 times
    the field that follows it.

    Args:
        minutes_seconds: Timestamp string such as ``'1:07'`` or ``'1:02:03'``.

    Returns:
        The timestamp expressed in whole seconds.

    Raises:
        ValueError: If any field is not an integer.
    """
    only_seconds = 0
    for field in minutes_seconds.split(':'):
        # Shift what has been read so far one base-60 position to the left.
        only_seconds = only_seconds * 60 + int(field)

    return only_seconds

def format_annotations(annotations, video_id):
    """Turn the sparse raw annotations of one video into a dense per-interval table.

    Each timed annotation row becomes one interval. The interval starts at the
    row's time (in seconds) and ends one second before the next row starts. The
    state of every tracked object is carried forward from the last row that set
    it, and the step counter is incremented on every row that names a step.

    Args:
        annotations: DataFrame with at least the columns ``video_name``,
            ``recipe``, ``time``, ``step`` and one column per tracked object.
        video_id: Value of ``video_name`` selecting the video to format.

    Returns:
        A DataFrame with the columns ``recipe``, ``video``, ``start_time``,
        ``end_time``, ``step`` and one column per object listed in
        ``OBJECTS`` for the video's recipe.

    Raises:
        ValueError: If the video has no annotation row with a time.
        KeyError: If the video's recipe is not a key of ``OBJECTS``.
    """
    video_annotations = annotations[annotations['video_name'] == video_id]
    # Rows without a time cannot be placed on the timeline, so they are dropped.
    # Copy after filtering so the column assignments below modify an owned frame.
    video_annotations = video_annotations[pd.notnull(video_annotations['time'])].copy()
    if video_annotations.empty:
        raise ValueError(f'No timed annotations found for video {video_id!r}')

    video_annotations['start_time'] = video_annotations['time'].apply(to_seconds)
    # An interval ends one second before the next one starts; the last row has no successor.
    video_annotations['end_time'] = video_annotations['start_time'].shift(-1).astype('Int64', errors='ignore').sub(1)
    recipe_id = video_annotations.iloc[0]['recipe']
    step_id = 0
    tracked_objects = OBJECTS[recipe_id]
    # Last known state of each object; stays None until a row sets it.
    current_states = {o: None for o in tracked_objects}
    formatted_annotations = {'recipe': [], 'video': [], 'start_time': [], 'end_time': [], 'step': []}
    formatted_annotations.update({o: [] for o in tracked_objects})

    for _, row in video_annotations.iterrows():
        current_step = row['step']
        if not pd.isna(current_step):
            step_id += 1

        for tracked_object in tracked_objects:
            current_state = row[tracked_object]
            if not pd.isna(current_state):
                current_states[tracked_object] = current_state
            formatted_annotations[tracked_object].append(current_states[tracked_object])
        formatted_annotations['recipe'].append(row['recipe'])
        formatted_annotations['video'].append(row['video_name'])
        formatted_annotations['start_time'].append(row['start_time'])
        formatted_annotations['end_time'].append(row['end_time'])
        formatted_annotations['step'].append(step_id)

    # The last element doesn't have an end time, so give it a duration of 3 seconds.
    formatted_annotations['end_time'][-1] = formatted_annotations['start_time'][-1] + 3
    formatted_annotations_df = pd.DataFrame.from_dict(formatted_annotations)

    return formatted_annotations_df

In [5]:
# Format the annotations of one pinwheels recording and display the result.
video_id = 'pinwheels_2023.04.04-18.33.59'
formatted_annotations_p = format_annotations(annotations, video_id)
formatted_annotations_p

Unnamed: 0,recipe,video,start_time,end_time,step,tortilla,plate,knife
0,pinwheels,pinwheels_2023.04.04-18.33.59,0,5,1,in-package,empty,clean
1,pinwheels,pinwheels_2023.04.04-18.33.59,6,17,1,plain,empty,clean
2,pinwheels,pinwheels_2023.04.04-18.33.59,18,23,2,plain,empty,clean
3,pinwheels,pinwheels_2023.04.04-18.33.59,24,26,2,plain,empty,dirty
4,pinwheels,pinwheels_2023.04.04-18.33.59,27,40,2,peanut-butter[partial],empty,dirty
5,pinwheels,pinwheels_2023.04.04-18.33.59,41,49,2,peanut-butter[full],empty,dirty
6,pinwheels,pinwheels_2023.04.04-18.33.59,50,60,3,peanut-butter[full],empty,dirty
7,pinwheels,pinwheels_2023.04.04-18.33.59,61,64,3,peanut-butter[full],empty,clean
8,pinwheels,pinwheels_2023.04.04-18.33.59,65,70,4,peanut-butter[full],empty,clean
9,pinwheels,pinwheels_2023.04.04-18.33.59,71,72,4,peanut-butter[full],empty,dirty


### Generating Perception Outputs

In [6]:
import json
from datetime import datetime

# Timestamp (seconds) taken when this cell runs; make_step_outputs adds each
# annotation second to it to build the 'last_seen' value.
CURRENT_TIME = int(datetime.now().timestamp())
# Skeleton of one perception record. make_step_outputs deep-copies it and
# overwrites 'id', 'label', 'last_seen' and 'state' (and adds 'session');
# the remaining fields are emitted with these constant values.
PERCEPTION_OUTPUT_TEMPLATE = {
        "pos": [-0.2149151724097291, -0.4343880843796524, -0.6208099189217009],
        "xyxyn": [0.1, 0.1, 0.2, 0.2],
        "label": "",
        "status": "tracked",
        "id": "1",
        "last_seen":"",
        "state": {},
        "hand_object_interaction": 0.27,
    }
# Integer written to the 'id' field for each object name.
MAPPING_IDS = {'tortilla': 0, 'knife': 1, 'plate': 2, 'bowl':3}


def select_tracked_objects(row, objects):
    """Collect the objects of a row that currently have a known state.

    An object is skipped when its state is missing (``None``/NaN, which is how
    an object that has not been annotated yet is represented) or an empty
    string, so that no record with a null state is generated for it.

    Args:
        row: Mapping (for example a DataFrame row) from object name to state.
        objects: Names of the objects to look up in ``row``.

    Returns:
        A dict mapping each object that has a state to that state.
    """
    tracked_objects = {}

    for obj in objects:
        state = row[obj]
        # Check for missing values first: comparing pd.NA with '' is not a plain bool.
        if pd.isna(state) or state == '':
            continue
        tracked_objects[obj] = state

    return tracked_objects


def get_unique_states(annotations, objects):
    """List the distinct states each object takes, in order of first appearance.

    Missing values (``None``/NaN) are excluded: they mark rows where the object
    has no state yet and must not show up as a state of their own.

    Args:
        annotations: DataFrame with one column per object name.
        objects: Names of the object columns to inspect.

    Returns:
        A dict mapping each object name to the list of its distinct states.
    """
    return {obj: list(annotations[obj].dropna().unique()) for obj in objects}


def read_annotated_video(video_annotations):
    """Group the formatted annotations of one video by step.

    The session id is read from the ``video`` column of the annotations rather
    than from a notebook-level variable, so the result does not depend on the
    order in which cells were executed.

    Args:
        video_annotations: DataFrame as returned by ``format_annotations``
            (columns ``recipe``, ``video``, ``start_time``, ``end_time``,
            ``step`` and one column per tracked object).

    Returns:
        A dict with the keys ``task_id`` (the recipe), ``session_id`` (the
        video name), ``unique_states`` (distinct states per object) and
        ``records``, which maps each step id to a list of
        ``{'start_time', 'end_time', 'objects'}`` dicts.
    """
    first_row = video_annotations.iloc[0]
    recipe_id = first_row['recipe']
    session_id = first_row['video']
    tracked_names = OBJECTS[recipe_id]
    unique_states = get_unique_states(video_annotations, tracked_names)
    annotated_video = {'task_id': recipe_id, 'session_id': session_id, 'records': {}, 'unique_states': unique_states}

    for _, row in video_annotations.iterrows():
        tracked_objects = select_tracked_objects(row, tracked_names)
        step_records = annotated_video['records'].setdefault(row['step'], [])
        step_records.append({'start_time': row['start_time'], 'end_time': row['end_time'], 'objects': tracked_objects})

    return annotated_video


def make_perception_outputs(annotated_video):
    """Expand an annotated video into a flat list of perception records.

    Args:
        annotated_video: Dict as returned by ``read_annotated_video``.

    Returns:
        The records produced by ``make_step_outputs`` for every interval of
        every step, concatenated in iteration order of ``records``.
    """
    all_outputs = []

    for step_id, intervals in annotated_video['records'].items():
        # One session dict per step, shared by every record of that step.
        step_session = {
            'session_id': annotated_video['session_id'],
            'task_id': annotated_video['task_id'],
            'step_id': step_id,
        }
        for interval in intervals:
            all_outputs.extend(
                make_step_outputs(step_session, interval, annotated_video['unique_states'], PERCEPTION_OUTPUT_TEMPLATE)
            )

    return all_outputs


def make_step_outputs(session_annotation, step_annotation, unique_states, output_template, target_state_probas=None, target_object=None):
    """Build one perception record per object for every second of an interval.

    Each record is a deep copy of ``output_template`` with ``id``, ``session``,
    ``label``, ``last_seen`` and ``state`` filled in. By default ``state``
    gives probability 1.0 to the annotated state and 0.0 to every other known
    state of the object. For ``target_object`` the distribution is replaced by
    ``target_state_probas``, but only when one is actually supplied, so a
    missing override never turns the state into ``None``.

    Args:
        session_annotation: Dict stored under ``session`` in every record.
        step_annotation: Dict with ``start_time``, ``end_time`` (inclusive,
            in seconds) and ``objects`` (object name -> state).
        unique_states: Dict mapping each object name to its known states.
        output_template: Record skeleton that is copied for every output.
        target_state_probas: Optional state -> probability dict used instead
            of the one-hot distribution for ``target_object``.
        target_object: Optional name of the object whose state is overridden.

    Returns:
        A list of record dicts, ordered by second and then by object.
    """
    objects = step_annotation['objects']
    start_time = step_annotation['start_time']
    end_time = step_annotation['end_time']
    step_outputs = []

    for time_secs in range(start_time, end_time + 1):
        time_stamp = CURRENT_TIME + time_secs
        for object_name, object_state in objects.items():
            object_output = copy.deepcopy(output_template)
            object_output['id'] = MAPPING_IDS[object_name]
            object_output['session'] = session_annotation
            object_output['label'] = object_name
            object_output['last_seen'] = time_stamp

            if object_name == target_object and target_state_probas is not None:
                # Copy so that every record owns its distribution.
                object_output['state'] = dict(target_state_probas)
            else:
                state_probas = {s: 0.0 for s in unique_states[object_name]}
                state_probas[object_state] = 1.0
                object_output['state'] = state_probas

            step_outputs.append(object_output)

    return step_outputs


def save_outputs(outputs, file_name):
    """Write ``outputs`` as indented JSON into ``DATA_FOLDER`` and print it.

    Args:
        outputs: JSON-serializable object to store.
        file_name: Name of the target file without the ``.json`` extension.
    """
    target_path = join(DATA_FOLDER, f'{file_name}.json')
    with open(target_path, 'w') as json_file:
        json.dump(outputs, json_file, indent=2)
    # Echo the same JSON so it is shown as the cell output.
    print(json.dumps(outputs, indent=2))


In [7]:
# Generate and save the perception records for the pinwheels recording.
pinwheels_annotations = read_annotated_video(formatted_annotations_p)
pinwheels_perception_outputs = make_perception_outputs(pinwheels_annotations)
save_outputs(pinwheels_perception_outputs, 'pinwheels_session')

[
  {
    "pos": [
      -0.2149151724097291,
      -0.4343880843796524,
      -0.6208099189217009
    ],
    "xyxyn": [
      0.1,
      0.1,
      0.2,
      0.2
    ],
    "label": "tortilla",
    "status": "tracked",
    "id": 0,
    "last_seen": 1697591895,
    "state": {
      "in-package": 1.0,
      "plain": 0.0,
      "peanut-butter[partial]": 0.0,
      "peanut-butter[full]": 0.0,
      "pb+jelly[partial]": 0.0,
      "pb+jelly[full]": 0.0,
      "rolling": 0.0,
      "rolled": 0.0,
      "rolled+toothpicks[partial]": 0.0,
      "rolled+toothpicks[full]": 0.0,
      "ends-cut[partial]": 0.0,
      "ends-cut[full]": 0.0,
      "floss-underneath": 0.0,
      "floss-crossed": 0.0,
      "sliced[partial]": 0.0,
      "sliced[full]": 0.0,
      "on-plate[partial]": 0.0,
      "on-plate[full]": 0.0
    },
    "hand_object_interaction": 0.27,
    "session": {
      "session_id": "pinwheels_2023.04.04-18.33.59",
      "task_id": "pinwheels",
      "step_id": 1
    }
  },
  {
    "pos

In [8]:
# Same pipeline for the oatmeal recording: format, group by step, generate, save.
video_id = 'oatmeal_mit-eval'
formatted_annotations_o = format_annotations(annotations, video_id)
oatmeal_annotations = read_annotated_video(formatted_annotations_o)
oatmeal_perception_outputs = make_perception_outputs(oatmeal_annotations)
save_outputs(oatmeal_perception_outputs, 'oatmeal_session')

[
  {
    "pos": [
      -0.2149151724097291,
      -0.4343880843796524,
      -0.6208099189217009
    ],
    "xyxyn": [
      0.1,
      0.1,
      0.2,
      0.2
    ],
    "label": "bowl",
    "status": "tracked",
    "id": 3,
    "last_seen": 1697591895,
    "state": {
      "plain": 1.0,
      "oatmeal": 0.0,
      "oatmeal+water": 0.0,
      "oatmeal[cooked]": 0.0,
      "oatmeal[cooked]+raisins": 0.0,
      "oatmeal+raisins[cooked]": 0.0,
      "oatmeal+raisins[cooked]+banana": 0.0,
      "oatmeal+raisins[cooked]+banana+cinnamon": 0.0,
      "oatmeal+raisins[cooked]+banana+cinnamon+honey": 0.0
    },
    "hand_object_interaction": 0.27,
    "session": {
      "session_id": "oatmeal_mit-eval",
      "task_id": "oatmeal",
      "step_id": 1
    }
  },
  {
    "pos": [
      -0.2149151724097291,
      -0.4343880843796524,
      -0.6208099189217009
    ],
    "xyxyn": [
      0.1,
      0.1,
      0.2,
      0.2
    ],
    "label": "bowl",
    "status": "tracked",
    "id": 3,
    "

### Merging and Modifying Data

In [9]:
def simulate_state_probas(state_probas, unique_states):
    """Complete a partial state distribution so that it covers every known state.

    The probabilities given for known states are kept; the remaining mass
    (``1 - sum of the given probabilities``) is split evenly among the states
    that were not given. Entries of ``state_probas`` naming a state that is not
    in ``unique_states`` are ignored. ``unique_states`` is not modified.

    Args:
        state_probas: Dict state -> probability, or ``None`` for no constraint.
        unique_states: List of all known states of the object.

    Returns:
        A dict with the given states first (in their original order) followed
        by the remaining states in the order of ``unique_states``.
    """
    if state_probas is None:
        state_probas = {}

    all_state_probas = {
        state_name: state_proba
        for state_name, state_proba in state_probas.items()
        if state_name in unique_states
    }
    remaining_states = [state for state in unique_states if state not in all_state_probas]

    # Nothing is left to distribute when every known state was given explicitly.
    if remaining_states:
        remaining_proba = (1 - sum(all_state_probas.values())) / len(remaining_states)
        for unique_state in remaining_states:
            all_state_probas[unique_state] = remaining_proba

    return all_state_probas
        

def make_errors(annotated_video, target_step, target_object=None, target_state_probas=None):
    """Generate perception records with a wrong state distribution in one step.

    Within ``target_step`` the state of ``target_object`` is replaced by the
    distribution built by ``simulate_state_probas`` from
    ``target_state_probas``. All other steps, and all other objects, keep
    their annotated state. When ``target_object`` is ``None`` no error is
    injected.

    Args:
        annotated_video: Dict as returned by ``read_annotated_video``.
        target_step: Step id in which the error is injected.
        target_object: Name of the object whose state is altered.
        target_state_probas: Partial state -> probability dict for that object.

    Returns:
        A flat list of perception records for the whole video.

    Raises:
        KeyError: If ``target_object`` is given but has no known states.
    """
    # Look up the states up front so an unknown object fails even if the step never matches.
    target_states = None if target_object is None else annotated_video['unique_states'][target_object]
    perception_outputs = []

    for step_id, step_annotations in annotated_video['records'].items():
        session_annotation = {'session_id': annotated_video['session_id'], 'task_id': annotated_video['task_id'], 'step_id': step_id}
        state_probas = None
        step_target = None

        if target_object is not None and target_step == step_id:
            # Pass a copy so the video's list of known states is never altered.
            state_probas = simulate_state_probas(target_state_probas, list(target_states))
            step_target = target_object

        for step_annotation in step_annotations:
            step_outputs = make_step_outputs(session_annotation, step_annotation, annotated_video['unique_states'], PERCEPTION_OUTPUT_TEMPLATE, state_probas, step_target)
            perception_outputs += step_outputs

    return perception_outputs


def group_by_step(session):
    """Split a flat list of records into one list per step.

    Args:
        session: Iterable of records, each with ``record['session']['step_id']``.

    Returns:
        A list of record lists, one per step id, in order of first appearance.
    """
    buckets = {}

    for record in session:
        buckets.setdefault(record['session']['step_id'], []).append(record)

    return list(buckets.values())


def merge_sessions(session1, session2, step_size=1):
    """Interleave two sessions, alternating ``step_size`` steps from each.

    Both sessions are first grouped by step. The result takes the next
    ``step_size`` step groups of ``session1``, then the next ``step_size``
    step groups of ``session2``, and repeats until both are exhausted.

    Args:
        session1: Flat list of perception records of the first session.
        session2: Flat list of perception records of the second session.
        step_size: Number of consecutive steps taken from a session per turn.

    Returns:
        A list of step groups (each a list of records) in interleaved order.

    Raises:
        ValueError: If ``step_size`` is smaller than 1.
    """
    if step_size < 1:
        # A non-positive stride would never advance through the steps.
        raise ValueError('step_size must be a positive integer')

    session1_by_step = group_by_step(session1)
    session2_by_step = group_by_step(session2)
    # Iterate over step groups, not over individual records.
    max_length = max(len(session1_by_step), len(session2_by_step))
    merged_sessions = []

    for current_index in range(0, max_length, step_size):
        merged_sessions += session1_by_step[current_index: current_index + step_size]
        merged_sessions += session2_by_step[current_index: current_index + step_size]

    return merged_sessions

In [10]:
# Interleave the pinwheels and oatmeal sessions two steps at a time and save the result.
merged_sessions = merge_sessions(pinwheels_perception_outputs, oatmeal_perception_outputs, 2)
save_outputs(merged_sessions, 'merged_sessions')

[
  [
    {
      "pos": [
        -0.2149151724097291,
        -0.4343880843796524,
        -0.6208099189217009
      ],
      "xyxyn": [
        0.1,
        0.1,
        0.2,
        0.2
      ],
      "label": "tortilla",
      "status": "tracked",
      "id": 0,
      "last_seen": 1697591895,
      "state": {
        "in-package": 1.0,
        "plain": 0.0,
        "peanut-butter[partial]": 0.0,
        "peanut-butter[full]": 0.0,
        "pb+jelly[partial]": 0.0,
        "pb+jelly[full]": 0.0,
        "rolling": 0.0,
        "rolled": 0.0,
        "rolled+toothpicks[partial]": 0.0,
        "rolled+toothpicks[full]": 0.0,
        "ends-cut[partial]": 0.0,
        "ends-cut[full]": 0.0,
        "floss-underneath": 0.0,
        "floss-crossed": 0.0,
        "sliced[partial]": 0.0,
        "sliced[full]": 0.0,
        "on-plate[partial]": 0.0,
        "on-plate[full]": 0.0
      },
      "hand_object_interaction": 0.27,
      "session": {
        "session_id": "pinwheels_2023.04.04-1

In [11]:
# In step 1, give the tortilla a distribution with 0.8 on 'rolled' and the rest
# spread evenly over its other states, then save the session.
errored_session = make_errors(pinwheels_annotations, 1, 'tortilla', {'rolled': 0.8})
save_outputs(errored_session, 'errored_session')

[
  {
    "pos": [
      -0.2149151724097291,
      -0.4343880843796524,
      -0.6208099189217009
    ],
    "xyxyn": [
      0.1,
      0.1,
      0.2,
      0.2
    ],
    "label": "tortilla",
    "status": "tracked",
    "id": 0,
    "last_seen": 1697591895,
    "state": {
      "rolled": 0.8,
      "in-package": 0.01176470588235294,
      "plain": 0.01176470588235294,
      "peanut-butter[partial]": 0.01176470588235294,
      "peanut-butter[full]": 0.01176470588235294,
      "pb+jelly[partial]": 0.01176470588235294,
      "pb+jelly[full]": 0.01176470588235294,
      "rolling": 0.01176470588235294,
      "rolled+toothpicks[partial]": 0.01176470588235294,
      "rolled+toothpicks[full]": 0.01176470588235294,
      "ends-cut[partial]": 0.01176470588235294,
      "ends-cut[full]": 0.01176470588235294,
      "floss-underneath": 0.01176470588235294,
      "floss-crossed": 0.01176470588235294,
      "sliced[partial]": 0.01176470588235294,
      "sliced[full]": 0.01176470588235294,
      