In [25]:
from pathlib import Path
import pandas as pd

df = pd.read_csv('data/edge_classifier/nl_edge_mappings_2022_05_24_20_22_21.csv')

In [26]:
df.head()

Unnamed: 0,from_node_token,from_node_Gender,from_node_Number,from_node_lemma,from_node_upos,from_node_xpos,to_node_token,deprel,label,doc_id,...,to_node_Person,to_node_PronType,from_node_Case,from_node_Person,from_node_PronType,to_node_Degree,to_node_Poss,to_node_Tense,to_node_VerbForm,from_node_Poss
0,female.n.02,Com,Sing,Mary,PROPN,N|eigen|ev|basis|zijd|stan,Mary,,Name,nl/p96/d2299,...,,,,,,,,,,
1,molken.v.01,,,molken,VERB,WW|vd|vrij|zonder,female.n.02,nsubj,Agent,nl/p96/d2299,...,,,,,,,,,,
2,molken.v.01,,,molken,VERB,WW|vd|vrij|zonder,time.n.08,,Time,nl/p96/d2299,...,,,,,,,,,,
3,time.n.08,,,,,,now,,EQU,nl/p96/d1684,...,,,,,,,,,,
4,gelukkig.a.01,,,gelukkig,ADJ,ADJ|vrij|basis|zonder,time.n.08,,Time,nl/p96/d1684,...,,,,,,,,,,


In [27]:
df[df.from_node_lemma.notna()]['from_node_lemma']

0             Mary
1           molken
2           molken
4         gelukkig
5      terug_slaan
          ...     
842         gitaar
844             ik
845      begrijpen
847           zijn
848           zijn
Name: from_node_lemma, Length: 650, dtype: object

In [28]:
from ud_boxer.sbn_spec import split_synset_id

lemma_to_sense = dict()
lemma_pos_to_sense = dict()

for _, row in df.iterrows():
    for n in ['from', 'to']:
        sense = row[f"{n}_node_token"]
        if not (components := split_synset_id(sense)):
            continue
        
        _, pos, num = components
        lemma = row[f'{n}_node_lemma']
        if lemma in lemma_to_sense:
            if sense in lemma_to_sense[lemma]:
                lemma_to_sense[lemma][sense] += 1
            else:
                lemma_to_sense[lemma][sense] = 1
        else:
            lemma_to_sense[lemma] = {sense: 1}

        lemma_pos = f'{lemma}.{pos}'
        if lemma_pos in lemma_pos_to_sense:
            if sense in lemma_pos_to_sense[lemma_pos]:
                lemma_pos_to_sense[lemma_pos][sense] += 1
            else:
                lemma_pos_to_sense[lemma_pos][sense] = 1
        else:
            lemma_pos_to_sense[lemma_pos] = {sense: 1}

In [29]:
def filter_most_common(mapping_dict):
    filtered_dict = dict()
    for lem, count_dict in mapping_dict.items():
        if len(count_dict) == 1:
            filtered_dict[lem] = list(count_dict.keys())[0]
        else:
            filtered_dict[lem] = sorted(list(count_dict.items()), key=lambda x: x[1])[-1][0]
    return filtered_dict

In [30]:
most_common_lemma_to_sense = filter_most_common(lemma_to_sense)
most_common_lemma_pos_to_sense = filter_most_common(lemma_pos_to_sense)

In [31]:
import json

with open('data/mappings/nl_lemma_sense_lookup_train.json', 'w') as f:
    json.dump(most_common_lemma_to_sense, f, indent=4)

with open('data/mappings/nl_lemma_pos_sense_lookup_train.json', 'w') as f:
    json.dump(most_common_lemma_pos_to_sense, f, indent=4)

In [11]:
lemma_pos_to_sense

{'starren.v': {'starren_an.v.01': 4},
 'Sie|sie.n': {'female.n.02': 52, 'person.n.01': 8},
 'nan.n': {'time.n.08': 1106},
 'ich.n': {'person.n.01': 131},
 'glücklich.a': {'glücklich.a.01': 6},
 'verloben.a': {'verloben.a.01': 3},
 'ohrfeigen.v': {'ohrfeigen.v.01': 2},
 'bellen.v': {'bellen.v.01': 2, 'bellen_an.v.01': 1},
 'dauern.v': {'dauern.v.01': 4},
 'Versammlung.n': {'Versammlung.n.01': 1},
 'unvoreingenommen.a': {'unvoreingenommen.a.01': 3},
 'er.n': {'male.n.02': 72},
 'äußerst.r': {'äußerst.r.01': 3},
 'süß.a': {'süß.a.01': 2},
 'Banan.n': {'Banan.n.01': 1},
 'setzen.v': {'setzen.v.01': 2},
 'Pferd.n': {'Pferd.n.01': 1},
 'wiegen.v': {'wiegen.v.01': 2},
 '140.n': {'140.n.01': 2},
 'etwa.r': {'etwa.r.01': 2},
 'Kilogramm.n': {'Kilogramm.n.01': 1},
 'lösen.v': {'lösen.v.01': 4},
 'Rätsel.n': {'Rätsel.n.01': 1},
 'Tom.n': {'female.n.02': 40},
 'bießen.v': {'bießen.v.01': 2},
 'Mary.n': {'female.n.02': 11},
 'wimmern.v': {'wimmern.v.01': 1},
 'Schmerz.n': {'Schmerz.n.01': 2},
 'Gia