In [1]:
from collections import Counter
import json

In [2]:
# Dataset identifiers; each one is used to build the path of the
# "<name>/<name>_semantics.json" file that is loaded below.
datasets = [
    "wsj",
    "risec",
    "japflow",
    "chemu",
    "mscorpus",
]

In [4]:
# Gather the raw semantic annotations of every dataset into `semantics_dict`.
#
# Each dataset file is read as a JSON list of sentence records. A record's
# "annotations" entry is either None (sentence skipped) or a mapping with a
# "props" list; every prop carries a "sense" and a list of "spans".
# Resulting layout:
#   semantics_dict[dataset] = {
#       "vn_classes":  [prop["sense"], ...],
#       "vn_roles":    [span["vn"], ...]      (argument spans only),
#       "labels":      [span["label"], ...]   (argument spans only),
#       "modifiers":   [span["label"], ...]   (modifier spans only),
#       "n_sentences": int,
#   }
semantics_dict = {}

for dataset in datasets:
    vn_classes = []
    vn_roles = []
    modifiers = []
    labels = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(
        f"/projects/flow_graphs/data/{dataset}/{dataset}_semantics.json",
        encoding="utf-8",
    ) as f:
        dataset_sentences = json.load(f)

    for sentence in dataset_sentences:
        # Sentences without annotations contribute nothing to the lists
        # (they are still included in "n_sentences" below).
        if sentence["annotations"] is None:
            continue
        for prop in sentence["annotations"]["props"]:
            vn_classes.append(prop["sense"])
            for span in prop["spans"]:
                if span["modifier"]:
                    # Modifier spans are tracked separately from arguments.
                    modifiers.append(span["label"])
                elif span["predicate"]:
                    # The predicate span itself is neither role nor modifier.
                    continue
                else:
                    # Argument span: record its "vn" role and its label.
                    # An empty "vn" value is kept as-is, not filtered out.
                    vn_roles.append(span["vn"])
                    labels.append(span["label"])

    semantics_dict[dataset] = {
        "vn_classes": vn_classes,
        "vn_roles": vn_roles,
        "labels": labels,
        # Previously collected but dropped; stored so it can be counted too.
        "modifiers": modifiers,
        # Total number of records in the file, annotated or not.
        "n_sentences": len(dataset_sentences),
    }

In [5]:
# Turn every collected list into a Counter of its values, per dataset.
# The integer "n_sentences" entry is not a list, so it is copied over
# unchanged and placed after the counters.
stats_dict = {}
for dataset, dataset_dict in semantics_dict.items():
    stats_dict[dataset] = {
        **{
            key: Counter(dataset_dict[key])
            for key in dataset_dict
            if key != "n_sentences"
        },
        "n_sentences": dataset_dict["n_sentences"],
    }

In [9]:
# Display the complete statistics for all datasets. This prints every
# Counter in full, so the output is very long.
stats_dict

{'wsj': {'vn_classes': Counter({'mix-22.1-2-1': 221,
           'seem-109-1-1': 11523,
           'dub-29.3.2': 868,
           'use-105.1': 764,
           'build-26.1-1': 846,
           'engender-27.1-1': 276,
           'disappearance-48.2': 110,
           'indicate-78': 173,
           'say-37.7-1-1': 8676,
           'escape-51.1-1-2': 91,
           'reflexive_appearance-48.1.2': 442,
           'indicate-78-1-1': 514,
           'stop-55.4-1-1': 621,
           'discover-84': 522,
           'seem-109-1-1-1': 508,
           'bring-11.3-1': 233,
           'talk-37.5': 322,
           'see-30.1-1-1-1': 93,
           'own-100.1': 2209,
           'learn-14-1': 289,
           'accompany-51.7': 261,
           'substitute-13.6.2-1': 103,
           'give-13.1-1': 1587,
           'cooperate-73.1-3': 299,
           'admit-64.3-1': 736,
           'admire-31.2': 658,
           'battle-36.4': 623,
           'get-13.5.1': 2339,
           'characterize-29.2': 328,
           'fu

In [10]:
# Sentence count of the "wsj" dataset together with its ten most frequent
# "vn_roles" values. Spans whose "vn" field is an empty string are counted
# as the role '' and can appear in this ranking.
stats_dict["wsj"]["n_sentences"], stats_dict["wsj"]["vn_roles"].most_common(10)

(37015,
 [('Theme', 59135),
  ('Agent', 48654),
  ('', 17984),
  ('Patient', 14213),
  ('Topic', 12440),
  ('Attribute', 12178),
  ('Experiencer', 5181),
  ('Stimulus', 5065),
  ('Pivot', 4277),
  ('Result', 3137)])

In [11]:
# Sentence count of the "risec" dataset together with its ten most frequent
# "vn_roles" values. Spans whose "vn" field is an empty string are counted
# as the role '' and can appear in this ranking.
stats_dict["risec"]["n_sentences"], stats_dict["risec"]["vn_roles"].most_common(10)

(1189,
 [('Theme', 424),
  ('Patient', 367),
  ('Destination', 216),
  ('', 106),
  ('Attribute', 46),
  ('Initial Location', 43),
  ('Experiencer', 31),
  ('Result', 28),
  ('Material', 28),
  ('Agent', 24)])