In [1]:
from collections import defaultdict
import json
from pprint import pprint

import dill
from tqdm.auto import tqdm

In [2]:
# Dataset identifiers. Each one is used as a directory name when loading the
# serialized data and as a key in the per-dataset result dictionaries.
datasets = [
    "risec",
    "japflow",
    "chemu",
    "mscorpus",
]

# Split names used to index into every loaded dataset.
splits = [
    "train",
    "dev",
    "test",
]

In [3]:
# Root directory that holds one sub-directory per dataset. Kept in a single
# constant so the notebook can be pointed at another location in one place.
DATA_ROOT = "/projects/flow_graphs/data"

# Maps dataset name -> deserialized contents of that dataset's data_amr.dill.
loaded_datasets = {}
for dataset in datasets:
    # SECURITY NOTE: dill (like pickle) can execute arbitrary code while
    # deserializing. Only load files that come from a trusted source.
    with open(f"{DATA_ROOT}/{dataset}/data_amr.dill", "rb") as f:
        data = dill.load(f)
    # Store after the `with` block so the file handle is already closed.
    loaded_datasets[dataset] = data

In [4]:
# Counters tracked for every dataset. Seeding them up front guarantees that
# each key is present in the report even when its count is zero, and that the
# keys appear in the same order for all datasets.
COUNT_KEYS = ("instances", "invalid", "n1_missing", "n2_missing", "both_missing")

# dataset name -> {counter name -> count}. Still a defaultdict so that any
# additional ad-hoc counter keeps working with `+= 1`.
amr_counts = {
    dataset: defaultdict(int, dict.fromkeys(COUNT_KEYS, 0)) for dataset in datasets
}

for dataset in tqdm(datasets, desc="datasets"):
    data = loaded_datasets[dataset]
    counts = amr_counts[dataset]
    for split in splits:
        for instance in tqdm(data[split]["rels"], desc=f"{dataset}/{split}"):
            amr_data = instance["amr_data"]
            # A mask whose entries sum to zero has nothing selected.
            # NOTE(review): presumably this means the relation argument could
            # not be matched to any AMR node -- confirm against the code that
            # builds n1_mask / n2_mask.
            n1_missing = amr_data.n1_mask.sum().item() == 0
            n2_missing = amr_data.n2_mask.sum().item() == 0

            counts["instances"] += 1
            # "invalid" = at least one of the two masks is empty.
            if n1_missing or n2_missing:
                counts["invalid"] += 1
            if n1_missing:
                counts["n1_missing"] += 1
            if n2_missing:
                counts["n2_missing"] += 1
            if n1_missing and n2_missing:
                counts["both_missing"] += 1

# Last expression of the cell: display the collected counts.
amr_counts

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3689 [00:00<?, ?it/s]

  0%|          | 0/1689 [00:00<?, ?it/s]

  0%|          | 0/2213 [00:00<?, ?it/s]

  0%|          | 0/13958 [00:00<?, ?it/s]

  0%|          | 0/1745 [00:00<?, ?it/s]

  0%|          | 0/1745 [00:00<?, ?it/s]

  0%|          | 0/11411 [00:00<?, ?it/s]

  0%|          | 0/2885 [00:00<?, ?it/s]

  0%|          | 0/3332 [00:00<?, ?it/s]

  0%|          | 0/12330 [00:00<?, ?it/s]

  0%|          | 0/2287 [00:00<?, ?it/s]

  0%|          | 0/3782 [00:00<?, ?it/s]

{'risec': defaultdict(<function __main__.<dictcomp>.<lambda>()>,
             {'instances': 7591,
              'invalid': 1751,
              'n2_missing': 1261,
              'n1_missing': 1207,
              'both_missing': 717}),
 'japflow': defaultdict(<function __main__.<dictcomp>.<lambda>()>,
             {'instances': 17448,
              'invalid': 1440,
              'n2_missing': 938,
              'n1_missing': 836,
              'both_missing': 334}),
 'chemu': defaultdict(<function __main__.<dictcomp>.<lambda>()>,
             {'instances': 17628,
              'invalid': 4884,
              'n2_missing': 4499,
              'n1_missing': 1383,
              'both_missing': 998}),
 'mscorpus': defaultdict(<function __main__.<dictcomp>.<lambda>()>,
             {'instances': 18399,
              'invalid': 8085,
              'n1_missing': 4754,
              'n2_missing': 6002,
              'both_missing': 2671})}

In [5]:
# Serialize the per-dataset counters to indented JSON, then print the text so
# it can be copied out of the notebook as plain JSON.
counts_json = json.dumps(amr_counts, indent=4)
print(counts_json)

{
    "risec": {
        "instances": 7591,
        "invalid": 1751,
        "n2_missing": 1261,
        "n1_missing": 1207,
        "both_missing": 717
    },
    "japflow": {
        "instances": 17448,
        "invalid": 1440,
        "n2_missing": 938,
        "n1_missing": 836,
        "both_missing": 334
    },
    "chemu": {
        "instances": 17628,
        "invalid": 4884,
        "n2_missing": 4499,
        "n1_missing": 1383,
        "both_missing": 998
    },
    "mscorpus": {
        "instances": 18399,
        "invalid": 8085,
        "n1_missing": 4754,
        "n2_missing": 6002,
        "both_missing": 2671
    }
}
