In [1]:
import networkx as nx
import itertools
from tqdm import tqdm
import pickle

In [17]:
# Gold-standard taxonomy file. Every line must split into exactly three
# tab-separated fields (unpacked below as idx, hypo, hyper).
path = '../gs_taxo/EN/environment_eurovoc_en.taxo'
G = nx.DiGraph()

with open(path, 'r') as f:
    for line in f:
        # Drop the line terminator up front, then split into the three fields.
        idx, hypo, hyper = line.replace('\n', '').split('\t')
        # Register the nodes explicitly, hypo first: this fixes the node
        # insertion order of G (add_edge alone would insert hyper first).
        for term in (hypo, hyper):
            G.add_node(term)
        # Edges point from the hyper term to the hypo term.
        G.add_edge(hyper, hypo)

In [25]:
def clean_dict(pairs, use_lemma, reverse):
    """
    Normalise the keys of a {(term, target, ...): value} mapping.

    The first key element is optionally cut at the first "(" and stripped
    (use_lemma); the second key element is cut at the first ",". The new key
    is (target, term) when reverse is true, otherwise (term, target). If two
    original keys collapse to the same new key, the later value wins.
    Returns a new dict; the input mapping is not modified.
    """
    cleaned = {}
    for pair, score in pairs.items():
        source = pair[0]
        if use_lemma:
            # keep only the text before any parenthesised suffix
            source = source.split("(")[0].strip()
        head = pair[1].split(",")[0]

        if reverse:
            cleaned[(head, source)] = score
        else:
            cleaned[(source, head)] = score

    return cleaned


# Pickled mapping whose keys are pairs and whose values are numeric scores
# (presumably perplexities, judging by the names -- TODO confirm).
in_name = '../data/env/eurovoc/pred_hypernym/lemmas_ppl_clear.pickle'
# Options forwarded to clean_dict below.
lemma = True
reverse = True

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(in_name, "rb") as f:
    ppls = pickle.load(f)

# Normalise the keys (and swap their order, since reverse is True).
ppls_pairs = clean_dict(ppls, use_lemma=lemma, reverse=reverse)

# All nodes of G except the root; list.remove raises ValueError if the
# root label is not a node of G.
root = 'environment'
all_verteces = list(G.nodes)
all_verteces.remove(root)

In [26]:
def get_graph(ppl_pairs, thr):
    """
    Build a directed graph from the pairs whose value is strictly below thr.

    Each kept key contributes an edge key[0] -> key[1] carrying its value as
    the 'weight' attribute. Returns the new nx.DiGraph.
    """
    graph = nx.DiGraph()
    kept = ((pair, score) for pair, score in ppl_pairs.items() if score < thr)
    for pair, score in kept:
        graph.add_edge(pair[0], pair[1], weight=score)
    return graph
        

# Predicted graph: keep only the pairs scoring below the threshold of 10.
G_pred = get_graph(ppls_pairs, 10)

# Edge-level precision / recall / F-score of G_pred against G.
gold_edges = set(G.edges())
pred_edges = set(G_pred.edges())
n_common = len(gold_edges & pred_edges)

# The small epsilons guard against division by zero when G_pred has no
# edges (precision) or when both P and R are zero (F-score).
P = n_common / (len(pred_edges) + 1e-15)
R = n_common / len(gold_edges)
F = (2 * P * R) / (P + R + 1e-15)
print(F)


0.10354374307862665


In [27]:
def resolve_cycle(cur_G, cycle):
    """
    Break a cycle by removing its highest-weight edge from cur_G (in place).

    cycle is an iterable of (u, v) edge pairs; each edge's weight is read
    from cur_G[u][v]['weight']. When several edges tie for the highest
    weight, the one listed first in cycle is removed. Returns None.
    """
    weight_of = {(src, dst): cur_G[src][dst]['weight'] for src, dst in cycle}

    # Stable descending sort: among equal weights the earliest edge stays first.
    by_weight_desc = sorted(weight_of, key=weight_of.get, reverse=True)
    cur_G.remove_edge(*by_weight_desc[0])

# Break cycles in G_pred one at a time until the graph is acyclic.
# Cycles with more than two edges are recorded in `cycles` before being
# broken; shorter ones (self-loops, two-node cycles) are broken but not kept.
cycles = []
# The bar has no known total, so it simply counts broken cycles; the context
# manager closes it even if an error interrupts the loop.
with tqdm(desc='breaking cycles', unit='cycle') as pbar:
    while True:
        # Only find_cycle signals "no cycle left", so only it sits in the try.
        try:
            cycle = nx.find_cycle(G_pred)
        except nx.NetworkXNoCycle:
            break

        if len(cycle) > 2:
            cycles.append(cycle)

        resolve_cycle(G_pred, cycle)
        pbar.update(1)

# Edge-level precision / recall / F-score of the current G_pred against G.
gold_edges = set(G.edges())
pred_edges = set(G_pred.edges())
n_common = len(gold_edges & pred_edges)

# The small epsilons guard against division by zero when G_pred has no
# edges (precision) or when both P and R are zero (F-score).
P = n_common / (len(pred_edges) + 1e-15)
R = n_common / len(gold_edges)
F = (2 * P * R) / (P + R + 1e-15)
print(F)


0it [00:21, ?it/s]


0.1264157837047861


In [28]:
def simple_triplets_generator(G):
    """
    Generator function that yields (node, child, grandchild) triplets.

    For every edge node -> child of G, one triplet is produced whose third
    element is the first successor of child, in the order reported by
    G.successors(child). Children without any successor are skipped instead
    of raising IndexError.
    """
    # Sentinel distinguishing "no successor" from any real node label.
    no_successor = object()

    for node in G:
        for child in G.successors(node):
            # Peek at the first successor without materialising the full list.
            grandchild = next(iter(G.successors(child)), no_successor)
            if grandchild is no_successor:
                continue
            yield (node, child, grandchild)

all_triplets = []

for cycle in cycles:
    # Throw-away graph holding only this cycle's edges, so the triplets are
    # taken along the cycle rather than from the whole prediction graph.
    temp_G = nx.DiGraph()
    temp_G.add_edges_from(cycle)

    all_triplets.extend(simple_triplets_generator(temp_G))

In [29]:
# Number of distinct triplets; set() collapses triplets that appear in more than one cycle.
len(set(all_triplets))

1741

In [30]:
out_name = '../triplets_env/lemmas_c_triplets.pickle'
with open(out_name, 'wb') as f:
    # De-duplicate, then sort: iteration order of a set of string tuples
    # depends on the per-process hash seed, so list(set(...)) would write the
    # triplets in a different order on every kernel restart. Sorting makes the
    # saved file reproducible (assumes the triplet elements are mutually
    # orderable, e.g. all strings).
    pickle.dump(sorted(set(all_triplets)), f)