In [1]:
from pathlib import Path
from ud_boxer.helpers import PMB, pmb_generator
from ud_boxer.sbn import SBNGraph
from ud_boxer.sbn_spec import SBN_EDGE_TYPE, SBN_NODE_TYPE

# Collect the paths under en/gold/p01 whose SBN graph contains more than one
# BOX-typed node. pmb_generator is a project helper; presumably it yields the
# paths matching the glob pattern (the captured output shows a progress bar).
P01_GOLD_DIR = '../../data/pmb_dataset/pmb-extracted/pmb-4.0.0/data/en/gold/p01/'

multi_boxes = []
for path in pmb_generator(P01_GOLD_DIR, "**/*.sbn"):
    G = SBNGraph().from_path(path)
    # The first element of each node id is compared against the BOX node type.
    b_nodes = sum(1 for node_id in G.nodes if node_id[0] == SBN_NODE_TYPE.BOX)
    if b_nodes <= 1:
        continue
    multi_boxes.append(path)

0it [00:00, ?it/s]Initialized cyclic SBN graph, this will work for most tasks, but can cause problems later on when exporting to Penman for instance.
229it [00:00, 4442.77it/s]


In [1]:
# Language code handed to PMB(language=...) in the next cell.
# NOTE(review): the dataset path and the pickle file names used in later cells
# are hard-coded to English ('en') — confirm that 'de' is intended here.
lang = 'de'

In [2]:
from ud_boxer.sbn_spec import split_wn_sense
import logging
import pickle

# Only show log records of level ERROR and above.
logging.basicConfig(level=logging.ERROR)

# NOTE(review): `lang` comes from an earlier cell, while the dataset path below
# is hard-coded to the English gold split — confirm the two are meant to differ.
pmb = PMB(language=lang)

# lemma -> {sense: occurrence count}
lemma_to_sense = dict()
# "lemma.pos" -> {sense: occurrence count}
lemma_pos_to_sense = dict()

# Paths whose graph could not be built. Kept so that skipped files can be
# inspected afterwards instead of disappearing silently.
unparsable_paths = []

for path in pmb.generator('../../data/pmb_dataset/pmb-extracted/pmb-4.0.0/data/en/gold/', '**/*.sbn'):
    try:
        G = SBNGraph().from_path(path)
    except Exception:
        # Best-effort: skip files that fail to load. Catching `Exception`
        # rather than using a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate, so the loop can still be interrupted.
        unparsable_paths.append(path)
        continue

    for _, node_data in G.nodes.items():
        if node_data["type"] != SBN_NODE_TYPE.SENSE:
            continue

        sense = node_data['token']
        lemma, pos, num = split_wn_sense(sense)

        # Count this sense under its bare lemma.
        sense_counts = lemma_to_sense.setdefault(lemma, {})
        sense_counts[sense] = sense_counts.get(sense, 0) + 1

        # Count this sense under the "lemma.pos" key as well.
        lemma_pos = f'{lemma}.{pos}'
        sense_counts = lemma_pos_to_sense.setdefault(lemma_pos, {})
        sense_counts[sense] = sense_counts.get(sense, 0) + 1

10715it [00:02, 5124.75it/s]


In [3]:
def filter_most_common(mapping_dict):
    """Reduce each inner count table to its single most frequent key.

    Args:
        mapping_dict: Mapping of outer key -> {inner key: count}.

    Returns:
        A new dict mapping every outer key to the inner key with the highest
        count. When several inner keys share the highest count, the one that
        occurs last in the inner dict's iteration order is chosen.

    Raises:
        IndexError: If an inner count table is empty.
    """

    def _top_key(count_dict):
        # The sort is stable, so among equal counts the original order is
        # kept and the final element is the last-inserted maximum.
        ranked = list(count_dict.items())
        ranked.sort(key=lambda pair: pair[1])
        return ranked[-1][0]

    return {lem: _top_key(count_dict) for lem, count_dict in mapping_dict.items()}

In [4]:
# Collapse both count tables to "key -> most frequent sense" lookups.
most_common_lemma_to_sense, most_common_lemma_pos_to_sense = map(
    filter_most_common, (lemma_to_sense, lemma_pos_to_sense)
)

In [5]:
# Write each lookup table to its own pickle file; every file is closed before
# the next one is opened.
for pickle_path, lookup in (
    ('data/mappings/lemma_sense_lookup_en_gold_train.pickle', most_common_lemma_to_sense),
    ('data/mappings/lemma_pos_sense_lookup_en_gold_train.pickle', most_common_lemma_pos_to_sense),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(lookup, f)

In [8]:
# Display every lemma key of the lookup built above (the whole key view is shown).
most_common_lemma_to_sense.keys()

dict_keys(['female', 'time', 'young', 'male', 'telephone', 'person', 'joke', 'event', 'hate', 'roller_skate', 'park', 'snow', 'leave', 'measure', 'day', 'overreact', 'street', 'block', 'huge', 'truck', 'videotape', 'entity', 'own', 'private', 'jet', 'vote', 'speak', 'english', 'play', 'guitar', 'tackle', 'problem', 'immediately', 'plant', 'pansy', 'flower_bed', 'go', 'city', 'music', 'dedicate', 'in_love', 'face', 'serious', 'situation', 'finish_off', 'soup', 'cow', 'abound', 'farm', 'read', 'book', 'invite', 'old', 'friend', 'have', 'bow', 'violin', 'owe', 'dollar', 'start', 'relax', 'study', 'cultivate', 'rice', 'manner', 'surname', 'pronounce', 'buy', 'microscope', 'grab', 'purse', 'house', 'year', 'score', 'be', 'proud', 'son', 'call', 'laugh', 'throw', 'pillow', 'vegetable', 'fruit', 'ship', 'unload', 'box', 'fed_up', 'story', 'man', 'jump', 'water', 'fold', 'laundry', 'take', 'taxi', 'rain', 'expendable', 'repair', 'refrigerator', 'kind_of', 'excited', 'iron', 'handkerchief', 'mr