In [58]:
import pickle
import networkx as nx
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
import sys

sys.path.append('../')
from cycle_refinment import resolve_graph_cycles
from multiparent_refinment import resolve_multiple_parents
from conflict_refinment import refine_conflict

In [161]:
def clean_dict(pairs, use_lemma, reverse):
    """Return a copy of ``pairs`` with every key normalised.

    Args:
        pairs: mapping whose keys are indexable 2-element sequences of
            strings ``(term, target)``; values are carried over unchanged.
        use_lemma: when true, the term is cut at its first ``"("`` and
            stripped of surrounding whitespace; otherwise it is kept as is.
        reverse: when true, the new key is ``(target, term)`` instead of
            ``(term, target)``.

    Returns:
        A new dict. The target is always cut at its first ``","`` (and is
        not stripped). If several original keys normalise to the same new
        key, the value of the last one in iteration order is kept.
    """

    def _normalise(raw_key):
        # Build the cleaned (and possibly swapped) key for one entry.
        raw_term, raw_target = raw_key[0], raw_key[1]
        lemma = raw_term.split("(")[0].strip() if use_lemma else raw_term
        head = raw_target.split(",")[0]
        return (head, lemma) if reverse else (lemma, head)

    return {_normalise(raw_key): score for raw_key, score in pairs.items()}


# Dataset selector and the pickled pair -> score mapping to evaluate.
# NOTE(review): in_name is an absolute, machine-specific path, and
# pickle.load must only be used on trusted files.
data = 'science'
in_name = '/home/LLM_Taxonomy/TExEval-2_testdata_1.2/data/sci/pred_hypernym/unified_lemmas_ppl.pickle'

# Gold taxonomy file for the science / environment datasets (eurovoc).
path = f"../gs_taxo/EN/{data}_eurovoc_en.taxo"
# For the food dataset use the wordnet file instead:
#path = "../gs_taxo/EN/" + str(data) + "_wordnet_en.taxo"

# Gold graph: one tab-separated "<id>\t<hypo>\t<hyper>" record per line,
# stored as a directed edge hyper -> hypo.
G = nx.DiGraph()
with open(path, "r") as taxo_file:
    for line in taxo_file:
        _idx, hypo, hyper = line.replace("\n", "").split("\t")
        # Nodes are added explicitly (hypo first) so the node order of G
        # does not depend on edge direction.
        G.add_node(hypo)
        G.add_node(hyper)
        G.add_edge(hyper, hypo)

with open(in_name, "rb") as pickle_file:
    ppls = pickle.load(pickle_file)

# Normalised keys: lemma-only terms, stored as (target, term).
ppls_pairs = clean_dict(ppls, use_lemma=True, reverse=True)

In [162]:
def get_graph(ppl_pairs, thr):
    """Build a directed graph from the pairs scoring strictly below ``thr``.

    Args:
        ppl_pairs: mapping from ``(source, destination)`` keys to a numeric
            score.
        thr: exclusive upper bound; only pairs with ``score < thr`` become
            edges.

    Returns:
        A new ``nx.DiGraph`` with one edge ``key[0] -> key[1]`` per selected
        pair, the score stored in the edge attribute ``weight``.
    """
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(
        (pair[0], pair[1], score)
        for pair, score in ppl_pairs.items()
        if score < thr
    )
    return graph

In [169]:
# Sweep the score threshold and keep the one with the best edge-level F-score
# against the gold graph G, measured after cycle and multi-parent refinement.
# Thresholds come from np.arange with a float step, so they carry rounding
# noise.
thrs = np.arange(1, 6, 0.01)
best = 0
best_thr = 0
# NOTE(review): the same dict is handed to resolve_graph_cycles for every
# threshold; confirm it is not meant to be reset per iteration.
insertions = {}

# The gold edge set does not change during the sweep, so build it once.
gold_edges = set(G.edges())

for thr in tqdm(thrs):

    G_pred = get_graph(ppls_pairs, thr)

    # Return values are ignored, so both helpers are relied on to modify
    # G_pred in place (presumably; they are defined outside this notebook).
    resolve_graph_cycles(G_pred, insertions)
    resolve_multiple_parents(
        G_pred,
        enable_mixing=False,
        ppl_compare=1,
        helper=1,
        mix_thr=1,
        n=2,
    )

    # Edge-level precision / recall / F1; the 1e-15 terms guard against a
    # zero denominator when no edge is predicted or matched.
    pred_edges = set(G_pred.edges())
    n_correct = len(gold_edges & pred_edges)
    P = n_correct / (len(pred_edges) + 1e-15)
    R = n_correct / len(gold_edges)
    F = (2 * P * R) / (P + R + 1e-15)

    # Strict comparison: on ties the lowest threshold wins.
    if F > best:
        best = F
        best_thr = thr

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:03<00:00, 149.55it/s]


In [170]:
# Highest F-score found by the sweep and the threshold that produced it.
best, best_thr

(0.44334975369458085, 2.160000000000001)

In [98]:
# Rebuild the predicted graph at the selected threshold and score its edges
# against the gold graph.
# NOTE(review): unlike the sweep that selected best_thr, no cycle or
# multi-parent refinement is applied to G_pred here — confirm this is intended.
G_pred = get_graph(ppls_pairs, best_thr)

gold_edges = set(G.edges())
pred_edges = set(G_pred.edges())
n_correct = len(gold_edges & pred_edges)

# The 1e-15 terms guard against a zero denominator.
P = n_correct / (len(pred_edges) + 1e-15)
R = n_correct / len(gold_edges)
F = (2 * P * R) / (P + R + 1e-15)

In [99]:
# Edge-level precision, recall and F-score of G_pred against G.
P, R, F

(0.5459183673469388, 0.4885844748858447, 0.515662650602409)

In [100]:
 G.number_of_nodes(), G.number_of_edges()  # size of the gold graph

(1486, 1533)

In [101]:
G_pred.number_of_nodes(), G_pred.number_of_edges()  # size of the predicted graph

(1132, 1372)

In [102]:
# Number of weakly connected components in the predicted graph.
nx.number_weakly_connected_components(G_pred)

51

In [103]:
# For every node of the gold graph, record its in-degree (parents) and
# out-degree (children) in the predicted graph; nodes missing from G_pred
# count as 0 / 0.
pred_in_degree = dict(G_pred.in_degree())
pred_out_degree = dict(G_pred.out_degree())

num_parents_ls = [pred_in_degree.get(node, 0) for node in G.nodes()]
num_child_ls = [pred_out_degree.get(node, 0) for node in G.nodes()]

# Gold nodes that have no predicted edge at all.
num_no_conn = sum(
    1
    for n_parents, n_children in zip(num_parents_ls, num_child_ls)
    if n_parents == 0 and n_children == 0
)

In [104]:
# Count of gold nodes with neither an incoming nor an outgoing predicted edge.
num_no_conn

354

In [105]:
# Mean predicted in-degree and out-degree over the gold nodes.
parents_arr = np.array(num_parents_ls)
children_arr = np.array(num_child_ls)
parents_arr.mean(), children_arr.mean()

(0.9232839838492598, 0.9232839838492598)

In [106]:
# Display the gold graph object (shows only its repr).
G

<networkx.classes.digraph.DiGraph at 0x7f89700cee30>

In [107]:
# For each predicted node, measure how far it sits below its gold parent in
# the predicted graph. Only the first gold predecessor is considered.
paths = []      # shortest path length gold parent -> node in G_pred, -1 if none
no_parent = 0   # nodes whose gold parent does not appear in G_pred

for node in G_pred.nodes():
    real_parents = list(G.predecessors(node))
    if not real_parents:
        # No predecessor in the gold graph: nothing to compare against.
        continue

    parent = real_parents[0]
    if parent not in G_pred.nodes():
        no_parent += 1
        continue

    try:
        hops = nx.shortest_path_length(G_pred, parent, node)
    except nx.NetworkXNoPath:
        hops = -1
    paths.append(hops)

In [108]:
# Summary: unreachable count, total measured, reachable count, nodes whose
# gold parent is missing from G_pred, and mean path length over reachable ones.
path_lengths = np.array(paths)
unreachable_mask = path_lengths == -1
n_unreachable = sum(unreachable_mask)
(
    n_unreachable,
    len(paths),
    len(paths) - n_unreachable,
    no_parent,
    path_lengths[path_lengths != -1].mean(),
)

(308, 1092, 784, 39, 1.0612244897959184)

4