In [18]:
import networkx as nx
import pandas as pd
import os
from nltk.corpus import wordnet as wn
from tqdm import tqdm

In [19]:
isa_path = "IsA.csv"

# Build the tab-separated IsA edge list once; later runs reuse the cached file.
if not os.path.exists(isa_path):
    df = pd.read_csv("processed_en.csv", index_col=0)
    df1 = df.loc[df["Relation"] == "IsA", ["Relation", "To", "From"]]
    # De-duplicate on the (To, From) pair itself. Keying on the concatenated
    # string To + From would merge distinct pairs such as ("ab", "c") and
    # ("a", "bc") and silently drop real edges.
    df1 = df1.drop_duplicates(subset=["To", "From"])
    # Each output line is "<To>\t<From>", with no header and no index column.
    df1[["To", "From"]].to_csv(isa_path, sep="\t", header=False, index=False)

In [20]:
# Load the cached edge list as a directed graph; each line "<To>\t<From>" becomes
# the edge To -> From. Reuse isa_path so the file written above and the file read
# here cannot drift apart.
G = nx.read_edgelist(isa_path, delimiter="\t", create_using=nx.DiGraph)

In [60]:
def detect_bad_edge(cycle):
    """Choose the edge of ``cycle`` that WordNet supports least.

    Each edge is a ``(vertex, child)`` pair. For an edge, the synsets of
    ``vertex`` are scanned in the order WordNet returns them, looking for one
    whose direct hyponyms list ``child`` among their lemma names.

    Args:
        cycle: sequence of ``(vertex, child)`` pairs.

    Returns:
        The first edge for which no synset of ``vertex`` has ``child`` as a
        direct hyponym. If every edge is confirmed, the edge whose confirming
        synset appeared latest in the synset list; ties go to the edge that
        comes first in ``cycle``.

    Raises:
        IndexError: if ``cycle`` is empty.
    """
    rank_by_edge = {}
    for edge in cycle:
        parent, child = edge
        # 1-based position of the first sense of `parent` that has `child`
        # as a direct hyponym, or None when no sense does.
        confirming_rank = next(
            (
                rank
                for rank, sense in enumerate(wn.synsets(parent), start=1)
                if any(child in hypo.lemma_names() for hypo in sense.hyponyms())
            ),
            None,
        )
        if confirming_rank is None:
            # WordNet does not back this edge at all: report it immediately.
            return edge
        rank_by_edge[edge] = confirming_rank

    # Stable descending sort: highest rank first, cycle order kept among ties.
    ordered = sorted(rank_by_edge, key=rank_by_edge.get, reverse=True)
    return ordered[0]


def generator(graph=None):
    """Yield one edge to remove per remaining cycle until the graph is acyclic.

    The generator does not modify the graph itself. The consumer must remove
    each yielded edge before asking for the next one; otherwise the same cycle
    is found again and the iteration never ends.

    Args:
        graph: directed graph to inspect. Defaults to the notebook-level ``G``.

    Yields:
        The edge returned by ``detect_bad_edge`` for the cycle just found.
    """
    if graph is None:
        graph = G
    while True:
        # find_cycle already tells us whether a cycle exists, so a separate
        # is_directed_acyclic_graph pass per iteration would only repeat the
        # same graph traversal.
        try:
            cycle = nx.find_cycle(graph)
        except nx.NetworkXNoCycle:
            return
        yield detect_bad_edge(cycle)

In [62]:
# Break cycles one at a time: generator() proposes an edge for the current cycle
# and only continues once that edge has been removed from G.
pruning_progress = tqdm(generator())
for bad_edge in pruning_progress:
    source, target = bad_edge
    G.remove_edge(source, target)

[('ghost', 'spirit'), ('spirit', 'apparition'), ('apparition', 'ghost')]