In [4]:
import sys
import numpy as np
import pickle
import os
import glob
import networkx as nx
from tqdm import tqdm
import random

sys.path.append('../../')
from DataConstructor.notebooks.leafer import Leafer



# Fix the seed of both random number generators used in this notebook
# (NumPy's legacy global RNG and the stdlib `random` module) so that
# re-running the notebook draws the same random numbers.
seed = 42
np.random.seed(seed)
random.seed(seed)

In [5]:
# Read the MAG_CS training edge list (one tab-separated edge per line)
# into a directed graph.
G = nx.read_edgelist(
    "../../TaxonomyEnrichment/data/MAG_CS/train.edgelist",
    delimiter="\t",
    create_using=nx.DiGraph,
)

In [6]:
# Make the graph acyclic: repeatedly look for a directed cycle, log it, and
# drop its first edge until no cycle is left. Self-loops are reported as
# one-edge cycles and are removed the same way.
# NOTE: this mutates G in place.
while True:
    try:
        cycle = nx.find_cycle(G)
    except nx.NetworkXNoCycle:
        # find_cycle signals "no cycle found" by raising; that is the only
        # expected way out of the loop. Any other exception (including
        # KeyboardInterrupt) now propagates instead of being swallowed.
        break
    print(cycle)
    # For a DiGraph each cycle edge is a (u, v) pair, so it unpacks directly
    # into remove_edge(u, v).
    G.remove_edge(*cycle[0])

# Rename every node by appending the '.n.1' suffix and hand the relabeled
# copy of the graph to Leafer.
# NOTE(review): the suffix looks like a WordNet-style sense id that Leafer
# presumably expects - confirm against the Leafer implementation.
new_labels = {name: name + '.n.1' for name in G.nodes()}
G_new = nx.relabel_nodes(G, new_labels)

l = Leafer(G_new)


[('lock', 'lock')]
[('memory management', 'flat memory model'), ('flat memory model', 'virtual memory'), ('virtual memory', 'memory management')]
[('memory management', 'memory map'), ('memory map', 'virtual memory'), ('virtual memory', 'memory management')]
[('obfuscation', 'obfuscation')]
[('tautology', 'tautology')]
[('aliasing', 'aliasing')]
[('change management', 'change management')]
[('intelligence cycle', 'intelligence cycle')]
[('fragmentation', 'fragmentation')]
[('kernel', 'kernel')]
[('kernel', 'kernel method'), ('kernel method', 'radial basis function kernel'), ('radial basis function kernel', 'kernel')]
[('kernel', 'radial basis function kernel'), ('radial basis function kernel', 'kernel')]
[('kernel', 'kernel embedding of distributions'), ('kernel embedding of distributions', 'kernel')]
[('polynomial kernel', 'kernel'), ('kernel', 'polynomial kernel')]
[('kernel', 'kernel principal component analysis'), ('kernel principal component analysis', 'kernel')]
[('critical mass'

In [7]:
# Generate the training examples from the relabeled graph.
# NOTE: the `test` value returned here is not what gets saved - the name
# `test` is rebound further down, where the test set is rebuilt from
# test_nodes.pickle.
train, test = l.split_train_test(
    generation_depth=0,  # how deep to go in the topological sort
    p=0.0,  # probability that an eligible case is sent to the test set
    p_divide_leafs=0.5,
    # probability that the leaves are split half-and-half between train and
    # test instead of putting the whole case into train or into test
    min_to_test_rate=0.5,
    # minimum fraction of nodes not yet present in train that is required
    # to split the case half-and-half between train and test;
    # e.g. if 6/10 nodes were already in train, all 10 go to train,
    # while if 5/10 were in train, the remaining ones may go to test
    weights=[0.00, 0.0, 0.0, 0.00, 0.00, 1.],
    # weights, in order, for the cases:
    # single child, only leaves, not only leaves,
    # triplets with 2 parents, triplets whose middle node has
    # 1 child, parent prediction
    #p_parent=1
)

predict_hypernym 24706 24706


In [12]:
# Load the pre-built MAG_CS test nodes from disk.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this project.
test_path = '../data/MAG_CS/test_nodes.pickle'
with open(test_path, mode='rb') as pickle_file:
    test_nodes = pickle.load(pickle_file)

In [13]:
# Peek at the first five entries; each is a (node, list of hypernyms) pair.
test_nodes[:5]

[('spanish verbs', ['verb']),
 ('effective transmission rate', ['wireless', 'channel']),
 ('achterbahn', ['stream cipher', 'cryptanalysis']),
 ('yukagir language', ['verb']),
 ('toroidal coordinates',
  ['elliptic coordinate system', 'parabolic coordinates'])]

In [15]:
# Convert every (node, hypernyms) pair into a record with the same keys as
# the training examples. A single hypernym is stored as a bare string under
# the "predict_hypernym" case; any other number of hypernyms is kept as a
# list under "predict_multiple_hypernyms".
# NOTE(review): names here are stored without the '.n.1' suffix that the
# training records carry - confirm downstream code handles both forms.
test = []

for child_name, hypernyms in test_nodes:
    has_single_parent = len(hypernyms) == 1
    test.append({
        "children": child_name,
        "grandparents": None,
        "parents": hypernyms[0] if has_single_parent else hypernyms,
        "case": (
            "predict_hypernym"
            if has_single_parent
            else "predict_multiple_hypernyms"
        ),
    })

In [16]:
# Preview the first five converted test records.
test[:5]

[{'children': 'spanish verbs',
  'grandparents': None,
  'parents': 'verb',
  'case': 'predict_hypernym'},
 {'children': 'effective transmission rate',
  'grandparents': None,
  'parents': ['wireless', 'channel'],
  'case': 'predict_multiple_hypernyms'},
 {'children': 'achterbahn',
  'grandparents': None,
  'parents': ['stream cipher', 'cryptanalysis'],
  'case': 'predict_multiple_hypernyms'},
 {'children': 'yukagir language',
  'grandparents': None,
  'parents': 'verb',
  'case': 'predict_hypernym'},
 {'children': 'toroidal coordinates',
  'grandparents': None,
  'parents': ['elliptic coordinate system', 'parabolic coordinates'],
  'case': 'predict_multiple_hypernyms'}]

In [17]:
# Preview the first five training records returned by split_train_test.
train[:5]

[{'children': 'cabinet card.n.1',
  'parents': 'studio.n.1',
  'grandparents': None,
  'case': 'predict_hypernym'},
 {'children': 'epoll.n.1',
  'parents': 'gnu linux.n.1',
  'grandparents': None,
  'case': 'predict_hypernym'},
 {'children': 'maximal information coefficient.n.1',
  'parents': 'mutual information.n.1',
  'grandparents': None,
  'case': 'predict_hypernym'},
 {'children': 'smicrideinae.n.1',
  'parents': 'hydropsychidae.n.1',
  'grandparents': None,
  'case': 'predict_hypernym'},
 {'children': 'point to point protocol over ethernet.n.1',
  'parents': 'ethernet.n.1',
  'grandparents': None,
  'case': 'predict_hypernym'}]

In [18]:
# Persist both splits as pickles, test first and then train.
for out_path, records in (
    ('../data/MAG_CS/test_hypernyms.pickle', test),
    ('../data/MAG_CS/train_hypernyms.pickle', train),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(records, out_file)