# Synset / object mapping validation

## Loading data

### Load synset tree

In [1]:
import json, csv
from nltk.corpus import wordnet as wn
import networkx as nx

# Build the graph of genuine WordNet synsets: one node per synset name and
# one edge from every synset to each of its direct hyponyms.
G = nx.DiGraph()
G.add_nodes_from(x.name() for x in wn.all_synsets())
for parent in wn.all_synsets():
    for child in parent.hyponyms():
        G.add_edge(parent.name(), child.name())

# Graft the custom (non-WordNet) synsets onto the graph, each one hanging
# below the WordNet synset named in its "hypernyms" column.
# newline="" matches how the csv module expects its input files to be opened
# (and how the category mapping CSV is opened elsewhere in this notebook).
with open(r"D:\ig_pipeline\metadata\custom_synsets.csv", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        child = row["custom_synset"].strip()
        # Resolve through WordNet so the parent is recorded under the same
        # name that was used when the graph nodes were created above.
        parent = wn.synset(row["hypernyms"].strip()).name()
        assert parent in G.nodes, "Could not find " + parent
        G.add_edge(parent, child)

# Every node currently in the graph: WordNet synsets plus the custom ones.
legit_synsets = set(G.nodes)

In [2]:
def is_leaf_synset(x):
    """Return True when synset ``x`` has no outgoing edges in the graph ``G``.

    Edges in ``G`` point from a synset to its hyponyms / custom children, so
    a node with out-degree zero has nothing below it.
    """
    child_count = G.out_degree[x]
    return child_count == 0

In [3]:
def canonicalize(s):
    """Return the WordNet name of the synset that ``s`` resolves to.

    If ``s`` cannot be resolved by WordNet (for example because it is a
    custom synset), it is returned unchanged.

    Args:
        s: A synset identifier string such as ``"mailbox.n.01"``.

    Returns:
        The name reported by WordNet for the resolved synset, or ``s`` itself
        when the lookup fails.
    """
    try:
        return wn.synset(s).name()
    except Exception:
        # Deliberate best-effort fallback for names WordNet does not know.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit from being silently swallowed.
        return s

### Load list of categories matching each synset

In [4]:
import csv
from collections import defaultdict
# Get the category - synset mapping.
#   pairs:         category name -> canonical synset name
#   synset_to_cat: canonical synset name -> list of category names
pairs = {}
synset_to_cat = defaultdict(list)
with open(r"D:\ig_pipeline\metadata\category_mapping.csv", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # DictReader fills the missing fields of a short row with None;
        # coerce to "" so such rows reach the "problematic row" report below
        # instead of crashing on None.strip().
        category = (row["category"] or "").strip()
        synset = (row["synset"] or "").strip()
        if not synset or not category:
            print(f"Skipping problematic row: {row}")
            continue
        # Store everything under the canonical synset name, reporting any
        # entry of the table that used a different name.
        canonical_synset = canonicalize(synset)
        if canonical_synset != synset:
            print(f"Non-canonical synset {synset} in category mapping replaced with canonical synset {canonical_synset}")
        synset = canonical_synset
        pairs[category] = synset
        synset_to_cat[synset].append(category)
found_synsets = set(pairs.values())
found_categories = set(pairs.keys())

Non-canonical synset smoke_alarm.n.01 in category mapping replaced with canonical synset fire_alarm.n.02
Non-canonical synset skeletal_frame.n.01 in category mapping replaced with canonical synset skeleton.n.04
Non-canonical synset ottoman.n.04 in category mapping replaced with canonical synset footstool.n.01
Non-canonical synset popsicle.n.01 in category mapping replaced with canonical synset ice_lolly.n.01
Non-canonical synset freezer.n.01 in category mapping replaced with canonical synset deep-freeze.n.01
Non-canonical synset creamer.n.01 in category mapping replaced with canonical synset cream_pitcher.n.01
Non-canonical synset soymilk.n.01 in category mapping replaced with canonical synset soya_milk.n.01
Non-canonical synset free_weight.n.01 in category mapping replaced with canonical synset weight.n.02
Non-canonical synset checkout_counter.n.01 in category mapping replaced with canonical synset checkout.n.03
Non-canonical synset stem.n.02 in category mapping replaced with canonica

In [6]:
def get_synset_categories(s):
    """Yield every category mapped to synset ``s`` or to one of its descendants.

    The synset itself is considered first, followed by all nodes reachable
    from it in ``G``. Synsets that have no entry in the category mapping are
    skipped.
    """
    candidates = [s]
    candidates.extend(nx.descendants(G, s))
    for candidate in candidates:
        if candidate not in found_synsets:
            continue
        for category in synset_to_cat[candidate]:
            yield category

### Load providers too

In [12]:
# Index every provided object by its provider directory and by its category, so that categories missing from the mapping table can be identified.
import pathlib, glob, json, os
from collections import defaultdict
# providers:           object name -> "<dir>/<subdir>" of the object list that provides it
# objects_by_category: category (text before the first "-" of the object name) -> object names
providers = {}
objects_by_category = defaultdict(list)
object_lists = glob.glob(r"D:\ig_pipeline\cad\*\*\artifacts\object_list.json")
for olf in object_lists:
    # The two path components just above "artifacts" identify the provider.
    olf_parts = pathlib.Path(olf).parts
    dirname = f"{olf_parts[-4]}/{olf_parts[-3]}"
    with open(olf, "r") as f:
        ol = json.load(f)
    for obj in ol["provided_objects"]:
        providers[obj] = dirname
        category_name, _, _ = obj.partition("-")
        objects_by_category[category_name].append(obj)

In [14]:
def query_synset(s):
    """Print the categories and objects that match synset ``s``.

    For every category returned by ``get_synset_categories(s)`` the category
    name is printed, followed by each object recorded for that category
    together with the directory that provides it.

    Args:
        s: Synset name to look up, e.g. ``"mailbox.n.01"``.
    """
    print(f"\nFor synset {s}:")
    for cat in get_synset_categories(s):
        print(f"  Matched category: {cat}")
        # Use .get() instead of indexing: objects_by_category is a
        # defaultdict, and indexing it would insert an empty list for every
        # category that has no objects, mutating shared state on a read.
        for obj in objects_by_category.get(cat, ()):
            print(f"  Matched object: {obj}, provided by {providers[obj]}")

## Now you can query

In [15]:
# Example: print the categories and provided objects matching mailbox.n.01 or any synset below it.
query_synset("mailbox.n.01")


For synset mailbox.n.01:
  Matched category: mailbox
  Matched object: mailbox-bktljr, provided by objects/mail-rt


In [16]:
# Example: print the categories and provided objects matching package.n.02 or any synset below it.
query_synset("package.n.02")


For synset package.n.02:
  Matched category: package
  Matched object: package-msfzpz, provided by objects/legacy_package-msfzpz
  Matched object: package-sxlklf, provided by objects/legacy_package-sxlklf
