# Synset / object mapping validation

## Loading data

### Load synset tree

In [1]:
import json, csv
from nltk.corpus import wordnet as wn
import networkx as nx

# Build the legit-synset graph: one node per WordNet synset and one directed
# edge parent -> child for every direct hyponym link. A single pass over
# wn.all_synsets() is enough; add_node keeps synsets that have no hyponym
# links in the graph as isolated nodes.
G = nx.DiGraph()
for parent in wn.all_synsets():
    parent_name = parent.name()
    G.add_node(parent_name)
    for child in parent.hyponyms():
        G.add_edge(parent_name, child.name())

# Add the illegit-synset graph: each CSV row attaches a custom synset name
# under an existing WordNet synset taken from the "hypernyms" column.
# newline='' is what the csv module expects for files handed to its readers
# and matches how the category mapping CSV is opened later in this notebook.
with open(r"D:\ig_pipeline\metadata\custom_synsets.csv", newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        child = row["custom_synset"].strip()
        # Resolve through WordNet so the parent is stored under its canonical name.
        parent = wn.synset(row["hypernyms"].strip()).name()
        assert parent in G.nodes, "Could not find " + parent
        G.add_edge(parent, child)

# Every node currently in the graph (WordNet synsets plus the custom ones).
legit_synsets = set(G.nodes)

In [2]:
def is_leaf_synset(x):
    """Return True when node ``x`` has no outgoing edges in the graph ``G``."""
    outgoing_edge_count = G.out_degree[x]
    return not outgoing_edge_count

In [3]:
def canonicalize(s):
    """Return the canonical WordNet name for the synset identifier ``s``.

    The name is obtained from ``wn.synset(s).name()``. If that lookup raises
    an ordinary exception, ``s`` is returned unchanged so callers can keep
    working with identifiers WordNet cannot resolve.

    Args:
        s: Synset identifier to resolve, e.g. ``"smoke_alarm.n.01"``.

    Returns:
        The name reported by WordNet for the resolved synset, or ``s`` itself
        when the lookup fails.
    """
    try:
        return wn.synset(s).name()
    except Exception:
        # Deliberate best-effort fallback. Catching Exception rather than
        # using a bare ``except`` lets KeyboardInterrupt and SystemExit
        # propagate, so the cell can still be interrupted mid-lookup.
        return s

### Load list of task-required synsets

In [4]:
with open(r"D:\ig_pipeline\metadata\b200_objects.json", "r") as f:
    obj_mapping = json.load(f)
activities = set(obj_mapping.keys())
activities_list = sorted(activities)

# activity -> canonicalized synsets it requires, with the agent entry dropped.
# Iterating the sorted list (rather than the unordered set) keeps the dict
# order identical from one kernel run to the next.
task_required_synsets_by_activity = {
    k: [canonicalize(x) for x in obj_mapping[k] if x != "agent.n.01"]
    for k in activities_list
}

# Union of every synset required by at least one activity.
task_required_synsets = {x for objs in task_required_synsets_by_activity.values() for x in objs}

# synset -> activities that require it, built in one pass over the mapping
# instead of rescanning every activity for every synset. dict.fromkeys
# de-duplicates the per-activity list so each activity is listed at most
# once per synset.
synset_requiring_tasks = {}
for activity_name, required_synsets in task_required_synsets_by_activity.items():
    for required_synset in dict.fromkeys(required_synsets):
        synset_requiring_tasks.setdefault(required_synset, []).append(activity_name)

### Load list of categories matching each synset

In [5]:
import csv
from collections import defaultdict
# Read the category mapping CSV into two lookup tables:
#   pairs:         category -> canonical synset
#   synset_to_cat: canonical synset -> list of categories mapped to it
pairs = {}
synset_to_cat = defaultdict(list)
with open(r"D:\ig_pipeline\metadata\category_mapping.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        category, synset = row["category"].strip(), row["synset"].strip()
        # Rows lacking either field cannot be mapped; report and move on.
        if not (synset and category):
            print(f"Skipping problematic row: {row}")
            continue
        canonical_synset = canonicalize(synset)
        if canonical_synset != synset:
            print(f"Non-canonical synset {synset} in category mapping replaced with canonical synset {canonical_synset}")
        # From here on only the canonical name is stored.
        synset = canonical_synset
        synset_to_cat[synset].append(category)
        pairs[category] = synset
found_synsets = set(pairs.values())
found_categories = set(pairs.keys())

Non-canonical synset smoke_alarm.n.01 in category mapping replaced with canonical synset fire_alarm.n.02
Non-canonical synset skeletal_frame.n.01 in category mapping replaced with canonical synset skeleton.n.04
Non-canonical synset ottoman.n.04 in category mapping replaced with canonical synset footstool.n.01
Non-canonical synset popsicle.n.01 in category mapping replaced with canonical synset ice_lolly.n.01
Non-canonical synset freezer.n.01 in category mapping replaced with canonical synset deep-freeze.n.01
Non-canonical synset creamer.n.01 in category mapping replaced with canonical synset cream_pitcher.n.01
Non-canonical synset soymilk.n.01 in category mapping replaced with canonical synset soya_milk.n.01
Non-canonical synset free_weight.n.01 in category mapping replaced with canonical synset weight.n.02
Non-canonical synset checkout_counter.n.01 in category mapping replaced with canonical synset checkout.n.03
Non-canonical synset stem.n.02 in category mapping replaced with canonica

In [6]:
def is_synset_available(s):
    """Return True if ``s`` or any of its descendants in ``G`` is in ``found_synsets``."""
    # The synset itself counts, along with everything reachable below it.
    candidates = {s} | set(nx.descendants(G, s))
    return any(candidate in found_synsets for candidate in candidates)

## Analysis

### Problem 1: Task-required synsets that are not included in the WN hierarchy

In [7]:
# Split the task-required synsets by whether they exist as nodes in the graph.
illegal_task_required_synsets = task_required_synsets.difference(legit_synsets)
legit_task_required_synsets = task_required_synsets.intersection(legit_synsets)
missing_count = len(illegal_task_required_synsets)
required_count = len(task_required_synsets)
print(f"{missing_count}/{required_count} task required synsets missing in hierarchy.\n")
print(illegal_task_required_synsets)

185/1300 task required synsets missing in hierarchy.

{'vitamin_pill__container.n.01', 'electrical_refrigerator.n.01', 'brown_sugar__sack.n.01', 'half__potato.n.01', 'brown_rice__sack.n.01', 'cooked__white_rice.n.01', 'feta__box.n.01', 'diced__garlic.n.02', '?', 'granulated__sugar.n.01', 'pottable__marigold.n.01', 'whiskey__bottle.n.01', 'ink__bottle.n.01', 'shampoo__container.n.01', 'cleansing_agent__container.n.01', 'bleaching_agent__bottle.n.01', 'cooked__pasta.n.02', 'diced__chili.n.02', 'iced', 'liquid_soap.n.01_1', 'diced__beet.n.01', 'broken__glass.n.02', 'rubbing_alcohol__bottle.n.01', 'paprika__shaker.n.02', 'cooked__quinoa.n.01', 'quinoa__container.n.01', 'glaze__bottle.n.01', 'rosemary__shaker.n.01', 'half__log.n.01', 'diced__cheese.n.01', 'pasta__box.n.02', 'margarine__box.n.01', 'milk__carton.n.01', 'diced__tomato.n.01', 'water__sodium_carbonate.n.01', 'acetone__container.n.01', 'diced__bratwurst.n.01', 'chia_seed__bag.n.01', 'chicken_broth__carton.n.01', 'melted__white_ch

### Problem 2: Category-mapped synsets that are not included in the WN hierarchy

In [8]:
# A category-mapped synset is illegal when neither it nor its canonical form
# is a node of the graph. Synsets already in the graph are filtered out first,
# so canonicalize() only runs on the remainder.
found_invalid_synsets = {
    x
    for x in found_synsets.difference(legit_synsets)
    if canonicalize(x) not in legit_synsets
}
illegal_count, mapped_count = len(found_invalid_synsets), len(found_synsets)
print(f"{illegal_count}/{mapped_count} category-mapped synsets are illegal.")
print("\n".join(found_invalid_synsets))

0/1152 category-mapped synsets are illegal.



### Problem 3: Categories are mapped to non-leaf synsets

In [16]:
# A category is flagged when its synset is not a leaf AND at least one
# descendant of that synset is itself the target of some category mapping.
nonleaf_cats = {
    cat
    for cat, s in pairs.items()
    if not is_leaf_synset(s)
    and not found_synsets.isdisjoint(nx.descendants(G, s))
}
leaf_cats = found_categories.difference(nonleaf_cats)
print(f"{len(nonleaf_cats)} / {len(pairs)} categories mapped to non-leaf synsets.\n")

# One report entry per flagged category, listing the mapped descendants.
nonleaf_report = [
    f"{cat}: {pairs[cat]}. Descendants: {sorted(found_synsets.intersection(nx.descendants(G, pairs[cat])))}"
    for cat in sorted(nonleaf_cats)
]
print("\n\n".join(nonleaf_report))

272 / 1426 categories mapped to non-leaf synsets.

adhesive: adhesive_material.n.01. Descendants: ['glue.n.01']

alcohol: alcohol.n.01. Descendants: ['beer.n.01', 'brandy.n.01', 'champagne.n.01', 'fruit_punch.n.01', 'gin.n.01', 'martini.n.01', 'red_wine.n.01', 'rum.n.01', 'sake.n.02', 'tequila.n.01', 'vodka.n.01', 'whiskey.n.01', 'wine.n.01']

almond_milk: milk.n.01. Descendants: ['buttermilk.n.01', 'chocolate_milk.n.01']

baby_bottle: bottle.n.01. Descendants: ['carafe.n.01', 'carboy.n.01', 'cruet.n.01', 'erlenmeyer_flask.n.01', 'flask.n.01', 'lemon_juice__bottle.n.01', 'pill_bottle.n.01', 'pop_bottle.n.01', 'specimen_bottle.n.01', 'water_bottle.n.01', 'whiskey_bottle.n.01', 'wine_bottle.n.01']

baby_monitor: detector.n.01. Descendants: ['photoelectric_cell.n.01']

bag_of_bread: bread.n.01. Descendants: ['bagel.n.01', 'baguet.n.01', 'biscuit.n.01', 'bun.n.01', 'cracker.n.01', 'crescent_roll.n.01', 'crouton.n.01', 'danish.n.02', 'meat_loaf.n.01', 'muffin.n.01', 'pretzel.n.01', 'scone.n

### Problem 4: Task-required synsets that don't have any categories mapping to them?
Caveat: substances are included too

In [10]:
# How many of the required synsets exist:
from collections import defaultdict
# Keep only the sought-after synset itself (not its children) whenever it or
# one of its descendants has a category mapped to it.
found_task_required_synsets = set(filter(is_synset_available, legit_task_required_synsets))
not_found_task_required_synsets = legit_task_required_synsets.difference(found_task_required_synsets)

missing_total = len(not_found_task_required_synsets)
legit_total = len(legit_task_required_synsets)
print(f"{missing_total}/{legit_total} legitimate task-required synsets don't have corresponding category entries.\n")

# For every missing synset, list the activities (alphabetically) that need it.
missing_report = [
    f"{s}: [{', '.join(act for act in activities_list if s in task_required_synsets_by_activity[act])}]"
    for s in not_found_task_required_synsets
]
print("\n".join(missing_report))

282/1115 legitimate task-required synsets don't have corresponding category entries.

chalice.n.01: [cleaning_cups_in_living_room, cleaning_glasses_off_bar, pour_a_glass_of_wine, prepare_a_slow_dinner_party]
gravy.n.01: [putting_roast_in_oven, serving_food_at_a_homeless_shelter]
yam.n.03: [make_yams]
ramen.n.01: [cook_ramen_noodles]
stain.n.01: [bringing_laundry, clean_a_baking_stone, clean_a_broiler_pan, clean_a_chicken_coop, clean_a_coffee_maker, clean_a_dirty_tent, clean_a_faucet, clean_a_fence, clean_a_flat_panel_monitor, clean_a_garden_sprayer, clean_a_glass_pipe, clean_a_glass_windshield, clean_a_grill_pan, clean_a_hamper, clean_a_keyboard, clean_a_kitchen_sink, clean_a_knife, clean_a_loofah_or_natural_sponge, clean_a_mattress, clean_a_pickup_truck, clean_a_pizza_stone, clean_a_popcorn_machine, clean_a_purse, clean_a_quilt, clean_a_raincoat, clean_a_sauna, clean_a_sauna_suit, clean_a_shower, clean_a_sponge, clean_a_stainless_steel_dishwasher, clean_a_teddy_bear, clean_a_tie, clea

### Problem 5: Task-required synsets map to objects from their descendants too (might have unexpected examples)

In [11]:
# Decide once, per found synset, whether its subtree contributes categories
# beyond the ones mapped directly to it; remember the subtree for the report.
descendant_mappers = []
for s in found_task_required_synsets:
    subtree = set(nx.descendants(G, s)) | {s}
    own_cats = set(synset_to_cat[s])
    desc_cats = {cat for cs in subtree for cat in synset_to_cat[cs]}
    if own_cats != desc_cats:
        descendant_mappers.append((s, subtree))

descendant_mapper_count = len(descendant_mappers)
print(f"{descendant_mapper_count} / {len(found_task_required_synsets)} map to objects from descendants.")

# Per flagged synset, show which synset in its subtree supplies which categories.
for s, subtree in descendant_mappers:
    print(f"\nSynset {s} gets the below objects:")
    for cs in sorted(subtree):
        mapped_cats = synset_to_cat[cs]
        if mapped_cats:
            print(f"From {cs}:", ", ".join(mapped_cats))

233 / 833 map to objects from descendants.

Synset electric_lamp.n.01 gets the below objects:
From flashlight.n.01: flashlight
From light_bulb.n.01: light_bulb, bulb

Synset screw.n.04 gets the below objects:
From bolt.n.06: bolt
From screw.n.04: screw

Synset candy.n.01 gets the below objects:
From candy.n.01: candy_box, candy
From candy_cane.n.01: candy_cane
From easter_egg.n.01: easter_egg
From jelly_bean.n.01: jelly_bean
From lollipop.n.02: lollipop
From marshmallow.n.01: marshmallow

Synset cereal.n.03 gets the below objects:
From cereal.n.03: cereal
From corn_flake.n.01: box_of_corn_flake
From granola.n.01: granola_pile

Synset boot.n.01 gets the below objects:
From boot.n.01: boot
From buskin.n.01: hiking_boots
From rubber_boot.n.01: rainboot

Synset wood.n.01 gets the below objects:
From bamboo.n.01: bamboo
From log.n.01: tree_log, log
From wood.n.01: wood

Synset container.n.01 gets the below objects:
From ashcan.n.01: trash_can
From ashtray.n.01: ashtray
From atomizer.n.01: a