In [1]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn


seed = 42
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
G = nx.DiGraph()
for synset in (wn.all_synsets('n')):
    name = synset.name()
    G.add_node(name)
    hyponyms = synset.hyponyms()

    for hypo in hyponyms:
        new_name = hypo.name()
        G.add_node(new_name)
        G.add_edge(name, new_name)

for synset in (wn.all_synsets('v')):
    name = synset.name()
    G.add_node(name)
    hyponyms = synset.hyponyms()

    for hypo in hyponyms:
        new_name = hypo.name()
        G.add_node(new_name)
        G.add_edge(name, new_name)


In [20]:
len(G.nodes())

95882

In [21]:
# noun_path = '../../TaxonomyEnrichment/data/noun/train.edgelist'
# verb_path = '../../TaxonomyEnrichment/data/verb/train.edgelist'

# noun = nx.read_edgelist(noun_path, create_using=nx.DiGraph, delimiter='\t')
# verb = nx.read_edgelist(verb_path, create_using=nx.DiGraph, delimiter='\t')

# G = nx.compose(noun, verb)

In [22]:
# exclude test

# from MAGS
cs_test_path = '../../TaxonomyEnrichment/data/MAG_CS/test_nodes.pickle'
psy_test_path = '../../TaxonomyEnrichment/data/psychology/test_nodes.pickle'
noun_test_path = '../../TaxonomyEnrichment/data/noun/test_nodes.pickle'
verb_test_path = '../../TaxonomyEnrichment/data/verb/test_nodes.pickle'

with open(cs_test_path, 'rb') as f:
    cs_test = pickle.load(f)

with open(psy_test_path, 'rb') as f:
    psy_test = pickle.load(f)

with open(noun_test_path, 'rb') as f:
    noun_test = pickle.load(f)

with open(verb_test_path, 'rb') as f:
    verb_test = pickle.load(f)

k = 0
for node, parents in cs_test:
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1
    
for node, parents in psy_test:
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

for node, parents in verb_test:
    if node in G.nodes():
        G.remove_node(node)
        k += 1

for node, parents in noun_test:
    if node in G.nodes():
        G.remove_node(node)
        k += 1

In [23]:
k

2096

In [24]:
# from Hypernym Discovery

In [25]:
main_path = '../../SemEval2018-Task9/custom_datasets/1A.english.pickle'
medical_path = '../../SemEval2018-Task9/custom_datasets/2A.medical.pickle'
music_path = '../../SemEval2018-Task9/custom_datasets/2B.music.pickle'

with open(main_path, 'rb') as f:
    main = pickle.load(f)

with open(medical_path, 'rb') as f:
    medical = pickle.load(f)

with open(music_path, 'rb') as f:
    music = pickle.load(f)


for elem in main:
    node = elem['children'].replace(' ', '_')
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

for elem in medical:
    node = elem['children'].replace(' ', '_')
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

for elem in music:
    node = elem['children'].replace(' ', '_')
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

In [26]:
k

4257

In [35]:
# From TEXEVAL
data = 'environment'
env_path = "../../TExEval-2_testdata_1.2/gs_taxo/EN/" + str(data) + "_eurovoc_en.taxo"
sci_path =  "../../TExEval-2_testdata_1.2/gs_taxo/EN/" + 'science' + "_eurovoc_en.taxo"
G_test = nx.DiGraph()

with open(env_path, "r") as f:
    for line in f:
        idx, hypo, hyper = line.split("\t")
        hyper = hyper.replace("\n", "")
        G_test.add_node(hypo)
        G_test.add_node(hyper)
        G_test.add_edge(hyper, hypo)

for node in G_test.nodes():
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

with open(sci_path, "r") as f:
    for line in f:
        idx, hypo, hyper = line.split("\t")
        hyper = hyper.replace("\n", "")
        G_test.add_node(hypo)
        G_test.add_node(hyper)
        G_test.add_edge(hyper, hypo)

for node in G_test.nodes():
    for i in range(10):
        true_name = f'{node}.n.0{i}'
        if true_name in G.nodes():
            G.remove_node(true_name)
            k += 1

In [39]:
while True:
    try:
        cycle = nx.find_cycle(G)
        print(cycle)
        G.remove_edge(*cycle[0])
    except:
        break

[('restrain.v.01', 'inhibit.v.04'), ('inhibit.v.04', 'restrain.v.01')]


In [40]:
l = Leafer(G)


train, test = l.split_train_test(
    generation_depth=0,  # до какого уровня в топ. сортировке идти
    p=0.001,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.00, 0.0, 0.0, 0.00, 0.00, 1.],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка, предсказание родителя
    #p_parent=1
)

predict_hypernym 36775 36775
predict_hypernym 31 31


In [42]:
train[:3], test[:3]

([{'children': 'window_seat.n.01',
   'parents': 'bench.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'pummel.v.01',
   'parents': 'hit.v.03',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'spanish_oak.n.01',
   'parents': 'oak.n.02',
   'grandparents': None,
   'case': 'predict_hypernym'}],
 [{'children': 'singular_matrix.n.01',
   'parents': 'square_matrix.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'powdered_sugar.n.01',
   'parents': 'granulated_sugar.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'general_anesthesia.n.01',
   'parents': 'anesthesia.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'}])

In [43]:
def add_definitions(elem):
    if elem['case'] == 'predict_hypernym':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
      #  elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'predict_multiple_hypernyms':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()

    elif elem['case'] == 'simple_triplet_grandparent':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
    elif elem['case'] == 'only_child_leaf':
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
        elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'simple_triplet_2parent':
        elem['1parent_def'] = wn.synset(elem['parents'][0]).definition()
        elem['2parent_def'] = wn.synset(elem['parents'][1]).definition()
    else:
        elem['parent_def'] = wn.synset(elem['parents']).definition()

In [44]:
for i, elem in enumerate(train):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        train.remove(elem)

counter = 0
for i, elem in enumerate(test):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        counter += 1
        test.remove(elem)

print(counter)

0


In [45]:
train_out = '/home/LLM_Taxonomy/wnet/unified_clean_wnet_noun_verb_def_train.pickle'
test_out = '/home/LLM_Taxonomy/wnet/unified_clean_wnet_noun_verb_def_test.pickle'

with open(train_out, 'wb') as f:
    pickle.dump(train, f)

with open(test_out, 'wb') as f:
    pickle.dump(test, f)

# NON SELECTIVE

In [3]:
noun_path = '../../TaxonomyEnrichment/data/noun/all.edgelist'
verb_path = '../../TaxonomyEnrichment/data/verb/all.edgelist'

noun = nx.read_edgelist(noun_path, create_using=nx.DiGraph, delimiter='\t')
verb = nx.read_edgelist(verb_path, create_using=nx.DiGraph, delimiter='\t')

G = nx.compose(noun, verb)

In [4]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [5]:
train, test = l.split_train_test(
    generation_depth=0,  # до какого уровня в топ. сортировке идти
    p=0.001,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.00, 0.0, 0.0, 0.00, 0.00, 1.],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка, предсказание родителя
    #p_parent=1
)

predict_hypernym 44772 44772
predict_hypernym 49 49


In [41]:
train[:3], test[:3]

([{'children': 'window_seat.n.01',
   'parents': 'bench.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'pummel.v.01',
   'parents': 'hit.v.03',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'spanish_oak.n.01',
   'parents': 'oak.n.02',
   'grandparents': None,
   'case': 'predict_hypernym'}],
 [{'children': 'singular_matrix.n.01',
   'parents': 'square_matrix.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'powdered_sugar.n.01',
   'parents': 'granulated_sugar.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'general_anesthesia.n.01',
   'parents': 'anesthesia.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'}])

In [7]:
def add_definitions(elem):
    if elem['case'] == 'predict_hypernym':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
      #  elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'predict_multiple_hypernyms':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()

    elif elem['case'] == 'simple_triplet_grandparent':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
    elif elem['case'] == 'only_child_leaf':
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
        elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'simple_triplet_2parent':
        elem['1parent_def'] = wn.synset(elem['parents'][0]).definition()
        elem['2parent_def'] = wn.synset(elem['parents'][1]).definition()
    else:
        elem['parent_def'] = wn.synset(elem['parents']).definition()

In [8]:
for i, elem in enumerate(train):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        train.remove(elem)

counter = 0
for i, elem in enumerate(test):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        counter += 1
        test.remove(elem)

print(counter)

0


In [10]:
train_out = '/home/LLM_Taxonomy/wnet/unified_wnet_noun_verb_def_train.pickle'
test_out = '/home/LLM_Taxonomy/wnet/unified_wnet_noun_verb_def_test.pickle'

with open(train_out, 'wb') as f:
    pickle.dump(train, f)

with open(test_out, 'wb') as f:
    pickle.dump(test, f)