In [1]:
from pathlib import Path
import pandas as pd

# Edge mappings are only based on isomorphic graphs from the en gold train dataset
# NOTE(review): the comment above says "en", but the file name below is
# prefixed with "de" — confirm which dataset these mappings were derived from.
PATH = Path('../data/mappings/de_edge_mappings_2022_05_24_20_22_10.csv')
# Load the raw edge table; the path is relative to the notebook's working directory.
df = pd.read_csv(PATH)

In [2]:
# Inspect the distinct dependency relations in the table; NaN entries mark
# rows that carry no deprel.
df.deprel.unique()

array(['nsubj', nan, 'advmod', 'obl', 'nummod', 'obj', 'nsubj:pass',
       'conj', 'amod', 'iobj', 'nmod', 'xcomp'], dtype=object)

In [3]:
# Keep only fully specified edges: rows missing either endpoint's UPOS tag
# or the dependency relation are dropped.
df = df.dropna(subset=['from_node_upos', 'to_node_upos', 'deprel'])

In [4]:
# Display the edge table as it stands after the null filter.
df

Unnamed: 0,from_node_token,from_node_Mood,from_node_Number,from_node_Person,from_node_Tense,from_node_VerbForm,from_node_lemma,from_node_upos,from_node_xpos,to_node_token,...,label,doc_id,from_node_Case,from_node_PronType,from_node_Reflex,from_node_NumType,from_node_Gender,to_node_NumType,to_node_Definite,to_node_Polarity
0,starren_an.v.01,Ind,Sing,3.0,Past,Fin,starren,VERB,VVFIN,female.n.02,...,Agent,de/p96/d1574,,,,,,,,
8,ohrfeigen.v.01,Ind,Sing,3.0,Past,Fin,ohrfeigen,VERB,VVFIN,female.n.02,...,Agent,de/p96/d0858,,,,,,,,
15,dauern.v.01,Ind,Sing,3.0,Past,Fin,dauern,VERB,VVFIN,Versammlung.n.01,...,Theme,de/p96/d0909,,,,,,,,
19,unvoreingenommen.a.01,,,,,,unvoreingenommen,ADJ,ADJD,male.n.02,...,AttributeOf,de/p96/d2656,,,,,,,,
21,unvoreingenommen.a.01,,,,,,unvoreingenommen,ADJ,ADJD,äußerst.r.01,...,Degree,de/p96/d2656,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973,blasen.v.01,Ind,Sing,3.0,Pres,Fin,blasen,VERB,VVFIN,Wind.n.01,...,Theme,de/p56/d2288,,,,,,,,
1977,kaufen.v.01,Ind,Plur,1.0,Pres,Fin,kaufen,VERB,VVFIN,CD.n.01,...,Theme,de/p56/d1914,,,,,,,,
1980,hereinlegen.v.01,,,,,Part,hereinlegen,VERB,VVPP,male.n.02,...,Agent,de/p56/d3290,,,,,,,,
1981,hereinlegen.v.01,,,,,Part,hereinlegen,VERB,VVPP,person.n.01,...,Time,de/p56/d3290,,,,,,,,


In [5]:
def create_key(from_upos, deprel, to_upos):
    """Build the lookup key for an edge.

    Args:
        from_upos: String describing the source side of the edge
            (called with ``from_node_upos`` in this notebook).
        deprel: Dependency relation string of the edge.
        to_upos: String describing the target side of the edge
            (called with ``to_node_upos`` in this notebook).

    Returns:
        The three parts joined with ``-``, e.g. ``'VERB-nsubj-NOUN'``.

    Raises:
        TypeError: If any part is not a string.
    """
    parts = (from_upos, deprel, to_upos)
    return '-'.join(parts)

# Tally how often each label occurs per edge key, producing
# {edge key: {label: count}}. Keys and labels are inserted in row order.
mapping = {}
for _, row in df.iterrows():
    label = row.label
    key = create_key(row.from_node_upos, row.deprel, row.to_node_upos)
    # Fetch (or create on first sight) the per-key label counter.
    label_counts = mapping.setdefault(key, {})
    label_counts[label] = label_counts.get(label, 0) + 1

In [6]:
def filter_most_common(mapping_dict):
    """Reduce each key's label counts to its single most frequent label.

    Args:
        mapping_dict: Dict of ``{key: {label: count}}``.

    Returns:
        A new dict ``{key: label}`` with the highest-count label per key.
        When several labels share the highest count, the one that appears
        last in the inner dict's iteration order is chosen (the sort is
        stable and the final element is taken).

    Raises:
        IndexError: If any inner count dict is empty.
    """
    def _top_label(label_counts):
        # Ascending by count; the winner is the last entry.
        ranked = sorted(label_counts.items(), key=lambda pair: pair[1])
        return ranked[-1][0]

    return {key: _top_label(counts) for key, counts in mapping_dict.items()}

In [7]:
# Collapse the per-key label counts into a single most frequent label per edge key.
most_common = filter_most_common(mapping)

In [8]:
import json

# Language code used as the prefix of the output file name.
lang = 'de'

# Write the {edge key: label} lookup as indented JSON into the current
# working directory.
with open(f'{lang}_edge_mappings_train.json', 'w') as f:
    f.write(json.dumps(most_common, indent=2))