In [1]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob

In [2]:
# G = nx.read_edgelist("./IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)
G = nx.read_edgelist("only_en_wordnet.edgelist", delimiter="\t", create_using=nx.DiGraph)

In [3]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [15]:
train, test = l.split_train_test(
    generation_depth=5,  # до какого уровня в топ. сортировке идти
    p=0.05,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.3,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.01, 0.49, 0.4, 0.05, 0.05],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка
)

In [16]:
len(train), len(test)

(15538, 766)

In [17]:
ls = [elem for elem in test if elem["case"] == "only_leafs_divided"]
len(ls)

87

In [18]:
train_count = {}
for elem in train:
    if elem["case"] in train_count.keys():
        train_count[elem["case"]] += 1

    else:
        train_count[elem["case"]] = 1

test_count = {}
for elem in test:
    if elem["case"] in test_count.keys():
        test_count[elem["case"]] += 1

    else:
        test_count[elem["case"]] = 1

In [19]:
train_count, test_count

({'only_leafs_all': 2036,
  'only_leafs_divided': 1864,
  'leafs_and_no_leafs': 2646,
  'simple_triplet_2parent': 1366,
  'only_child_leaf': 4086,
  'simple_triplet_grandparent': 3540},
 {'leafs_and_no_leafs': 128,
  'only_leafs_all': 96,
  'only_leafs_divided': 87,
  'simple_triplet_grandparent': 186,
  'simple_triplet_2parent': 49,
  'only_child_leaf': 220})

In [36]:
test[:5]

[{'children': ['three-dimensional_radar.n.1', 'Doppler_radar.n.1'],
  'parents': 'radar.n.1',
  'grandparents': None,
  'case': 'only_leafs_divided',
 {'children': ['cabbage_palm.n.2',
   'cabbage_palm.n.3',
   'cabbage_palm.n.4',
   'coconut.n.3',
   'corozo.n.1',
   'fishtail_palm.n.1',
   'nipa_palm.n.1',
   'royal_palm.n.1'],
  'parents': 'palm.n.3',
  'grandparents': None,
  'case': 'leafs_and_no_leafs'},
 {'children': 'Western.n.1',
  'parents': 'feature.n.3',
  'grandparents': 'movie.n.1',
  'case': 'simple_triplet_grandparent'},
 {'children': 'farm_team.n.1',
  'parents': 'minor-league_team.n.1',
  'grandparents': 'team.n.1',
  'case': 'only_child_leaf'},
 {'children': 'movie.n.1',
  'parents': ['show.n.3', 'product.n.2'],
  'grandparents': None,
  'case': 'simple_triplet_2parent'}]

In [20]:
num_leaks = 0
for vertex in l.collector.test_verteces:
    if vertex in l.collector.train_verteces:
        num_leaks += 1

In [21]:
num_leaks

0

In [22]:
num_leaks = 0
for vertex in l.collector.train_verteces:
    if vertex in l.collector.test_verteces:
        num_leaks += 1

In [23]:
num_leaks

0

In [24]:
lang = 'en'
name_train = "../babel_datasets/reweighted_wnet_train_" + lang + "_babel.pickle"
name_test = "../babel_datasets/reweighted_wnet_test_" + lang + "_babel.pickle"

with open(name_train, "wb") as handle:
    pickle.dump(train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(test, handle)

In [41]:
all_data = glob.glob(os.path.join(".", "*isa.edgelist"))
all_data

['./fr_isa.edgelist',
 './it_isa.edgelist',
 './en_isa.edgelist',
 './de_isa.edgelist',
 './es_isa.edgelist',
 './ru_isa.edgelist']

In [21]:
global_train = []
global_test = []

all_data = glob.glob(os.path.join(".", "*isa.edgelist"))
for path in tqdm(all_data):
    lang = path.replace("./", "").replace("_isa.edgelist", "")

    G = nx.read_edgelist(path, delimiter="\t", create_using=nx.DiGraph)
    l = Leafer(G)
    train, test = l.split_train_test(
        generation_depth=1,
        p=0.05,
        p_divide_leafs=0.5,
        min_to_test_rate=0.5,
        weights=[0.25, 0.3, 0.25, 0.1, 0.1],
    )

    name_train = "../babel_datasets/train_" + lang + "_babel.pickle"
    name_test = "../babel_datasets/test_" + lang + "_babel.pickle"

    with open(name_train, "wb") as handle:
        pickle.dump(train, handle)

    with open(name_test, "wb") as handle:
        pickle.dump(test, handle)

    global_train.extend(train)
    global_test.extend(test)

100%|██████████| 6/6 [00:14<00:00,  2.40s/it]


In [23]:
len(global_test), len(global_train)

(4385, 102865)

In [31]:
max_len = 0
k = 0
for vert in global_train:
    cur_len = len(vert["children"])
    if cur_len > 50:
        k += 1
        max_len = cur_len
        max_i = vert

In [33]:
max_i["parents"]

['хоккей_на_траве.n.1', 'Летние_Олимпийские_игры_1972.n.1']

In [17]:
def predict_child_with_parent_and_grandparent(elem):
    """
    hyperhypenym: arthropod.n.01,
    hypernym: insect.n.01, hyponyms:
    (blackly)

    hyperhypenym: elem['grandparents'],
    hypernym: elem['parents'], hyponyms:
    elem['children']

    Fly is a hyponym for the word “insect".
    Predict hyponyms for the word “fly”. Answer:
    """

    # transformed_term = (
    #     "hyperhypenym: "
    #     + ", ".join(elem["grandparents"])
    #     + ", hypernym: "
    #     + elem["parents"]
    #     + ", hyponyms:"
    # )
    transformed_term = (
        ", ".join(elem["grandparents"])
        + " are hyponyms for the word '"
        + elem["parents"]
        + "'. Predict hyponyms for the word '"
        + elem["parents"]
        + "'. Answer:"
    )
    return transformed_term