In [2]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn


seed = 42
random.seed(seed)
np.random.seed(seed)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
noun_path = '../../TaxonomyEnrichment/data/noun/train.edgelist'
verb_path = '../../TaxonomyEnrichment/data/verb/train.edgelist'

noun = nx.read_edgelist(noun_path, create_using=nx.DiGraph, delimiter='\t')
verb = nx.read_edgelist(verb_path, create_using=nx.DiGraph, delimiter='\t')

G = nx.compose(noun, verb)

In [3]:
# exclude test

# from MAGS
cs_test_path = '../../TaxonomyEnrichment/data/MAG_CS/test_nodes.pickle'
psy_test_path = '../../TaxonomyEnrichment/data/psychology/test_nodes.pickle'

with open(cs_test_path, 'rb') as f:
    cs_test = pickle.load(f)

with open(psy_test_path, 'rb') as f:
    psy_test = pickle.load(f)

k = 0
for node, parents in cs_test:
    if node in G.nodes():
        G.remove_node(node + '.n.01')
        k += 1
    
for node, parents in psy_test:
    if node in G.nodes():
        G.remove_node(node + '.n.01')
        k += 1

In [14]:
# from 

In [15]:
k

0

In [10]:
node

'sertraline overdose'

In [13]:
list(G.nodes())[:5]

['entity.n.01',
 'abstraction.n.06',
 'physical_entity.n.01',
 'thing.n.08',
 'attribute.n.02']

# NON SELECTIVE

In [3]:
noun_path = '../../TaxonomyEnrichment/data/noun/all.edgelist'
verb_path = '../../TaxonomyEnrichment/data/verb/all.edgelist'

noun = nx.read_edgelist(noun_path, create_using=nx.DiGraph, delimiter='\t')
verb = nx.read_edgelist(verb_path, create_using=nx.DiGraph, delimiter='\t')

G = nx.compose(noun, verb)

In [4]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [5]:
train, test = l.split_train_test(
    generation_depth=0,  # до какого уровня в топ. сортировке идти
    p=0.001,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.00, 0.0, 0.0, 0.00, 0.00, 1.],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка, предсказание родителя
    #p_parent=1
)

predict_hypernym 44772 44772
predict_hypernym 49 49


In [6]:
train[:3], test[:3]

([{'children': 'bdellium.n.01',
   'parents': 'gum_resin.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'thinning_shears.n.01',
   'parents': 'shears.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'spotweld.v.01',
   'parents': 'weld.v.01',
   'grandparents': None,
   'case': 'predict_hypernym'}],
 [{'children': 'broad_beech_fern.n.01',
   'parents': 'beech_fern.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'impreciseness.n.01',
   'parents': 'inexactness.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'},
  {'children': 'black_guillemot.n.01',
   'parents': 'guillemot.n.01',
   'grandparents': None,
   'case': 'predict_hypernym'}])

In [7]:
def add_definitions(elem):
    if elem['case'] == 'predict_hypernym':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
      #  elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'predict_multiple_hypernyms':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()

    elif elem['case'] == 'simple_triplet_grandparent':
        elem['child_def'] = wn.synset(elem['children'].replace(' ', '_')).definition()
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
    elif elem['case'] == 'only_child_leaf':
        elem['grandparent_def'] = wn.synset(elem['grandparents']).definition()
        elem['parent_def'] = wn.synset(elem['parents']).definition()
    elif elem['case'] == 'simple_triplet_2parent':
        elem['1parent_def'] = wn.synset(elem['parents'][0]).definition()
        elem['2parent_def'] = wn.synset(elem['parents'][1]).definition()
    else:
        elem['parent_def'] = wn.synset(elem['parents']).definition()

In [8]:
for i, elem in enumerate(train):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        train.remove(elem)

counter = 0
for i, elem in enumerate(test):
    try:
        add_definitions(elem)
    except:
        print(i, elem)
        counter += 1
        test.remove(elem)

print(counter)

0


In [10]:
train_out = '/home/LLM_Taxonomy/wnet/unified_wnet_noun_verb_def_train.pickle'
test_out = '/home/LLM_Taxonomy/wnet/unified_wnet_noun_verb_def_test.pickle'

with open(train_out, 'wb') as f:
    pickle.dump(train, f)

with open(test_out, 'wb') as f:
    pickle.dump(test, f)