In [3]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob

In [4]:
# G = nx.read_edgelist("./IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)
G = nx.read_edgelist("en_isa.edgelist", delimiter="\t", create_using=nx.DiGraph)

In [5]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [6]:
train, test = l.split_train_test(
    generation_depth=1,  # до какого уровня в топ. сортировке идти
    p=0.05,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.6,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.25, 0.3, 0.25, 0.1, 0.1],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка
)

In [7]:
len(train), len(test)

(24502, 1223)

In [8]:
ls = [elem for elem in test if elem["case"] == "only_leafs_divided"]
len(ls)

171

In [9]:
train_count = {}
for elem in train:
    if elem["case"] in train_count.keys():
        train_count[elem["case"]] += 1

    else:
        train_count[elem["case"]] = 1

test_count = {}
for elem in test:
    if elem["case"] in test_count.keys():
        test_count[elem["case"]] += 1

    else:
        test_count[elem["case"]] = 1

In [10]:
train_count, test_count

({'only_child_leaf': 5370,
  'simple_triplet_2parent': 3483,
  'leafs_and_no_leafs': 4307,
  'only_leafs_all': 2222,
  'simple_triplet_grandparent': 6074,
  'only_leafs_divided': 3046},
 {'only_child_leaf': 271,
  'leafs_and_no_leafs': 208,
  'only_leafs_divided': 171,
  'simple_triplet_grandparent': 312,
  'only_leafs_all': 122,
  'simple_triplet_2parent': 139})

In [11]:
train

[{'children': 'mate.n.6',
  'parents': 'singleton.n.1',
  'grandparents': 'one.n.1',
  'case': 'only_child_leaf'},
 {'children': 'ethyl_alcohol.n.1',
  'parents': ['alcohol.n.2', 'plant_product.n.1'],
  'grandparents': None,
  'case': 'simple_triplet_2parent'},
 {'children': 'daisy_cutter.n.2',
  'parents': 'ball.n.11',
  'grandparents': 'baseball.n.1',
  'case': 'only_child_leaf'},
 {'children': ['zero.n.2',
   'three.n.1',
   'four.n.1',
   'five.n.1',
   'six.n.1',
   'seven.n.1',
   'eight.n.1',
   'nine.n.1',
   'binary_digit.n.1',
   'decimal_digit.n.1',
   'duodecimal_digit.n.1',
   'hexadecimal_digit.n.1',
   'octal_digit.n.1',
   'significant_digit.n.1',
   'one.n.1',
   'two.n.1'],
  'parents': 'digit.n.1',
  'grandparents': None,
  'case': 'leafs_and_no_leafs'},
 {'children': ['craps.n.1', 'couple.n.4'],
  'parents': 'two.n.1',
  'grandparents': None,
  'case': 'leafs_and_no_leafs'},
 {'children': ['lauryl_alcohol.n.1',
   'allyl_alcohol.n.1',
   'amyl_alcohol.n.1',
   'buty

In [12]:
num_leaks = 0
for vertex in l.collector.test_verteces:
    if vertex in l.collector.train_verteces:
        num_leaks += 1

In [13]:
num_leaks

0

In [14]:
num_leaks = 0
for vertex in l.collector.train_verteces:
    if vertex in l.collector.test_verteces:
        num_leaks += 1

In [15]:
num_leaks

0

In [21]:
global_train = []
global_test = []

all_data = glob.glob(os.path.join(".", "*isa.edgelist"))
for path in tqdm(all_data):
    lang = path.replace("./", "").replace("_isa.edgelist", "")

    G = nx.read_edgelist(path, delimiter="\t", create_using=nx.DiGraph)
    l = Leafer(G)
    train, test = l.split_train_test(
        generation_depth=1,
        p=0.05,
        p_divide_leafs=0.5,
        min_to_test_rate=0.5,
        weights=[0.25, 0.3, 0.25, 0.1, 0.1],
    )

    name_train = "../babel_datasets/train_" + lang + "_babel.pickle"
    name_test = "../babel_datasets/test_" + lang + "_babel.pickle"

    with open(name_train, "wb") as handle:
        pickle.dump(train, handle)

    with open(name_test, "wb") as handle:
        pickle.dump(test, handle)

    global_train.extend(train)
    global_test.extend(test)

100%|██████████| 6/6 [00:14<00:00,  2.40s/it]


In [23]:
len(global_test), len(global_train)

(4385, 102865)

In [31]:
max_len = 0
k = 0
for vert in global_train:
    cur_len = len(vert["children"])
    if cur_len > 50:
        k += 1
        max_len = cur_len
        max_i = vert

In [33]:
max_i["parents"]

['хоккей_на_траве.n.1', 'Летние_Олимпийские_игры_1972.n.1']

In [17]:
def predict_child_with_parent_and_grandparent(elem):
    """
    hyperhypenym: arthropod.n.01,
    hypernym: insect.n.01, hyponyms:
    (blackly)

    hyperhypenym: elem['grandparents'],
    hypernym: elem['parents'], hyponyms:
    elem['children']

    Fly is a hyponym for the word “insect".
    Predict hyponyms for the word “fly”. Answer:
    """

    # transformed_term = (
    #     "hyperhypenym: "
    #     + ", ".join(elem["grandparents"])
    #     + ", hypernym: "
    #     + elem["parents"]
    #     + ", hyponyms:"
    # )
    transformed_term = (
        ", ".join(elem["grandparents"])
        + " are hyponyms for the word '"
        + elem["parents"]
        + "'. Predict hyponyms for the word '"
        + elem["parents"]
        + "'. Answer:"
    )
    return transformed_term