In [1]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob

In [2]:
# G = nx.read_edgelist("./IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)
G = nx.read_edgelist("../data/omw/ru_isa.edgelist", delimiter="\t", create_using=nx.DiGraph)

In [3]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [7]:
train, test = l.split_train_test(
    generation_depth=3,  # до какого уровня в топ. сортировке идти
    p=0.1,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.01, 0.49, 0.4, 0.05, 0.05],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка
)

In [8]:
len(train), len(test)

(25271, 1820)

In [9]:
ls = [elem for elem in test if elem["case"] == "only_leafs_divided"]
len(ls)

116

In [10]:
train_count = {}
for elem in train:
    if elem["case"] in train_count.keys():
        train_count[elem["case"]] += 1

    else:
        train_count[elem["case"]] = 1

test_count = {}
for elem in test:
    if elem["case"] in test_count.keys():
        test_count[elem["case"]] += 1

    else:
        test_count[elem["case"]] = 1

In [11]:
train_count, test_count

({'leafs_and_no_leafs': 2424,
  'only_leafs_divided': 814,
  'only_child_leaf': 4368,
  'simple_triplet_grandparent': 4476,
  'only_leafs_all': 1052,
  'simple_triplet_2parent': 12137},
 {'only_leafs_all': 77,
  'only_leafs_divided': 116,
  'leafs_and_no_leafs': 134,
  'simple_triplet_2parent': 678,
  'simple_triplet_grandparent': 396,
  'only_child_leaf': 419})

In [13]:
test[55:65]

[{'children': ['закрутить гайки.n.1', 'усугубить.n.1', 'форсировать.n.1'],
  'parents': 'усилить.n.2',
  'grandparents': None,
  'case': 'leafs_and_no_leafs'},
 {'children': ['засада.n.1', 'капкан.n.1'],
  'parents': 'ловушка.n.1',
  'grandparents': None,
  'case': 'only_leafs_all'},
 {'children': ['сетовать.n.1', 'ныть.n.1'],
  'parents': 'жаловаться.n.2',
  'grandparents': None,
  'case': 'only_leafs_all'},
 {'children': 'бушмены.n.1',
  'parents': 'негры.n.1',
  'grandparents': 'человек.n.1',
  'case': 'simple_triplet_grandparent'},
 {'children': ['биотехнолог.n.1'],
  'parents': 'микробиолог.n.1',
  'grandparents': None,
  'case': 'only_leafs_divided',
  'brothers': ['бактериолог.n.1']},
 {'children': ['пробка для бутылки.n.1', 'заглушка.n.1', 'тампон.n.1'],
  'parents': 'затычка.n.1',
  'grandparents': None,
  'case': 'leafs_and_no_leafs'},
 {'children': ['игровая приставка.n.1'],
  'parents': 'игровое устройство.n.1',
  'grandparents': None,
  'case': 'only_leafs_divided',
  'bro

In [34]:
num_leaks = 0
for vertex in l.collector.test_verteces:
    if vertex in l.collector.train_verteces:
        num_leaks += 1

In [23]:
num_leaks

0

In [24]:
num_leaks = 0
for vertex in l.collector.train_verteces:
    if vertex in l.collector.test_verteces:
        num_leaks += 1

In [25]:
num_leaks

0

In [24]:
lang = 'en'
name_train = "../babel_datasets/reweighted_wnet_train_" + lang + "_babel.pickle"
name_test = "../babel_datasets/reweighted_wnet_test_" + lang + "_babel.pickle"

with open(name_train, "wb") as handle:
    pickle.dump(train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(test, handle)

In [15]:
all_data = glob.glob(os.path.join("../data/omw/", "*isa.edgelist"))
all_data

['../data/omw/ita_isa.edgelist',
 '../data/omw/ru_isa.edgelist',
 '../data/omw/fra_isa.edgelist',
 '../data/omw/eng_isa.edgelist',
 '../data/omw/spa_isa.edgelist']

In [17]:
global_train = []
global_test = []

all_data = glob.glob(os.path.join("../data/omw/", "*isa.edgelist"))
for path in tqdm(all_data):
    lang = path.replace("../data/omw/", "").replace("_isa.edgelist", "")

    G = nx.read_edgelist(path, delimiter="\t", create_using=nx.DiGraph)
    l = Leafer(G)
    train, test = l.split_train_test(
        generation_depth=3,  # до какого уровня в топ. сортировке идти
        p=0.05,  # вероятность что подходящий случай уйдет в тест
        p_divide_leafs=0.5,
        min_to_test_rate=0.5,
        weights=[0.01, 0.49, 0.4, 0.05, 0.05],
    )

        
    print(lang, len(train), len(test))

    random.shuffle(train)
    random.shuffle(test)

    name_train = "../../omw_datasets/train_" + lang + ".pickle"
    name_test = "../../omw_datasets/test_" + lang + ".pickle"

    with open(name_train, "wb") as handle:
        pickle.dump(train, handle)

    with open(name_test, "wb") as handle:
        pickle.dump(test, handle)

    global_train.extend(train)
    global_test.extend(test)

name_train = "../../omw_datasets/global_train.pickle"
name_test = "../../omw_datasets/global_test.pickle"


random.shuffle(global_train)
with open(name_train, "wb") as handle:
    pickle.dump(global_train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(global_test, handle)

  0%|          | 0/5 [00:00<?, ?it/s]

ita 538158 3675


 40%|████      | 2/5 [02:52<03:37, 72.51s/it] 

ru 26006 965
fra 136390 3307


 60%|██████    | 3/5 [03:32<01:55, 57.54s/it]

eng 324610 11118


100%|██████████| 5/5 [05:23<00:00, 64.70s/it]

spa 55235 1242





In [52]:
name_train = "../babel_datasets/wnet_only/global_train_babel.pickle"
name_test = "../babel_datasets/wnet_only/global_test_babel.pickle"

with open(name_train, "wb") as handle:
    pickle.dump(global_train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(global_test, handle)

In [48]:
len(global_test), len(global_train)

(1980, 38808)

In [49]:
max_len = 0
k = 0
for vert in global_train:
    cur_len = len(vert["children"])
    if cur_len > 50:
        k += 1
        max_len = cur_len
        max_i = vert

In [51]:
max_i

{'children': 'conditions_normales_de_température_et_de_pression.n.1',
 'parents': ['atmosphère.n.1', 'température_standard.n.1'],
 'grandparents': None,
 'case': 'simple_triplet_2parent'}

In [17]:
def predict_child_with_parent_and_grandparent(elem):
    """
    hyperhypenym: arthropod.n.01,
    hypernym: insect.n.01, hyponyms:
    (blackly)

    hyperhypenym: elem['grandparents'],
    hypernym: elem['parents'], hyponyms:
    elem['children']

    Fly is a hyponym for the word “insect".
    Predict hyponyms for the word “fly”. Answer:
    """

    # transformed_term = (
    #     "hyperhypenym: "
    #     + ", ".join(elem["grandparents"])
    #     + ", hypernym: "
    #     + elem["parents"]
    #     + ", hyponyms:"
    # )
    transformed_term = (
        ", ".join(elem["grandparents"])
        + " are hyponyms for the word '"
        + elem["parents"]
        + "'. Predict hyponyms for the word '"
        + elem["parents"]
        + "'. Answer:"
    )
    return transformed_term