In [1]:
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob


seed = 42
random.seed(seed)
np.random.seed(seed)

In [2]:
# G = nx.read_edgelist("./IsA_Graph2.edgelist", delimiter=" ", create_using=nx.DiGraph)
G = nx.read_edgelist("../data/only_en_wordnet.edgelist", delimiter="\t", create_using=nx.DiGraph)

In [3]:
l = Leafer(G)
# iterator = l.leafs_generator()

In [4]:
train, test = l.split_train_test(
    generation_depth=3,  # до какого уровня в топ. сортировке идти
    p=0.05,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.01, 0.4, 0.49, 0.05, 0.05],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка,
    p_parent=0.8
)

Parent
Child
WIKI:EN:daylight_saving_time
WIKI:EN:capillary_hemangioma
only_leafs_all 2498 2498
leafs_and_no_leafs 2455 2455
only_leafs_divided 2406 2406
predict_hypernym 8607 1338
simple_triplet_2parent 1461 1461
simple_triplet_grandparent 5283 5154
only_child_leaf 5055 4924
only_leafs_divided 117 117
predict_hypernym 439 364
simple_triplet_grandparent 270 264
leafs_and_no_leafs 115 115
only_leafs_all 110 110
simple_triplet_2parent 47 47
only_child_leaf 261 257


In [5]:
len(train), len(test)

(20236, 1274)

In [6]:
train_count = {}
for elem in train:
    if elem["case"] in train_count.keys():
        train_count[elem["case"]] += 1

    else:
        train_count[elem["case"]] = 1

test_count = {}
for elem in test:
    if elem["case"] in test_count.keys():
        test_count[elem["case"]] += 1

    else:
        test_count[elem["case"]] = 1

In [7]:
train_count, test_count

({'only_child_leaf': 9430,
  'only_leafs_all': 2498,
  'simple_triplet_2parent': 1461,
  'only_leafs_divided': 2406,
  'leafs_and_no_leafs': 2455,
  'predict_hypernym': 1338,
  'simple_triplet_grandparent': 648},
 {'predict_hypernym': 364,
  'leafs_and_no_leafs': 115,
  'only_leafs_divided': 117,
  'only_leafs_all': 110,
  'only_child_leaf': 486,
  'simple_triplet_grandparent': 35,
  'simple_triplet_2parent': 47})

In [8]:
num_leaks = 0
for vertex in l.collector.test_verteces:
    if vertex in l.collector.train_verteces:
        num_leaks += 1

In [9]:
num_leaks

0

In [10]:
num_leaks = 0
for vertex in l.collector.train_verteces:
    if vertex in l.collector.test_verteces:
        num_leaks += 1

In [11]:
num_leaks

0

In [14]:
lang = 'en'
name_train = "../../babel_datasets/v2_wnet_train.pickle"
name_test = "../../babel_datasets/v2_wnet_test.pickle"

with open(name_train, "wb") as handle:
    pickle.dump(train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(test, handle)

In [17]:
test[58]

{'children': ['narcoleptic.n.2', 'sleeping_pill.n.1'],
 'parents': 'soporific.n.1',
 'grandparents': None,
 'case': 'only_leafs_all'}

In [12]:
import sys
sys.path.append('../../pipeline_src')
from dataset.prompt_schemas import predict_parent_from_child

In [19]:
elem = dict(test[58])

In [13]:
for elem in train:
    if elem['case'] == 'predict_hypernym':
        print(elem)
        predict_parent_from_child(elem)

{'children': 'homo.n.2', 'parents': 'hominid.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'subordinate_clause.n.1', 'parents': 'clause.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'Sioux.n.1', 'parents': 'Plains_Indian.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'cathode.n.1', 'parents': 'electrode.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'contamination.n.3', 'parents': 'soiling.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'gruel.n.1', 'parents': 'porridge.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'spelling.n.1', 'parents': 'orthography.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'van.n.5', 'parents': 'truck.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'avitaminosis.n.1', 'parents': 'malnutrition.n.1', 'grandparents': None, 'case': 'predict_hypernym'}
{'children': 'megapode.n.1', 'parents

In [34]:
def clean_elem(elem, keys_to_remove_digits=["children"]):
    removes = set(keys_to_remove_digits)
    print(removes)
    if not "changed" in elem.keys():
        for field in ["children", "parents", "grandparents", "brothers"]:
            if field in elem.keys():
                print(elem[field], field)
                elem[field] = delete_techniqal(elem[field], remove=(field in removes))
                elem["changed"] = True
    return elem


def delete_techniqal(elem, remove):
    if isinstance(elem, str):
        if ".n." in elem and remove:
            return elem.split(".")[0].replace("_", " ")
        else:
            return elem.replace("_", " ")

    elif isinstance(elem, list):
        new_words = []
        for word in elem:
            new_words.append(delete_techniqal(word, remove))
        return new_words

def predict_parent_from_child(elem):
    """
    Predict the hypernym for the word “spaniel” which is hyponyms for the
    word “hunting dog” at the same time. Answer: (sporting dog)
    """
    clean = clean_elem(elem, keys_to_remove_digits=["parents"])
    # transformed_term = (
    #     "Predict the hypernym for the word '"
    #     + clean["children"]
    #     + "' which is hyponyms for the word '"
    #     + clean["grandparents"]
    #     + "' at the same time. Answer:"
    # )
    transformed_term = (
        "hyponym: "
        + clean["children"]
        + " | hypernym:"
    )
    #print(clean)
    return transformed_term, clean["parents"]



In [39]:
elem = dict(test[58])
predict_parent_from_child(elem)

('hyponym: Acadian.n.1 | hypernym:', 'French Canadian')

In [40]:
elem

{'children': 'exile.n.1',
 'parents': 'absentee',
 'grandparents': None,
 'case': 'predict_hypernym',
 'changed': True}

In [15]:
all_data = glob.glob(os.path.join("../data/omw/", "*isa.edgelist"))
all_data

['../data/omw/ita_isa.edgelist',
 '../data/omw/ru_isa.edgelist',
 '../data/omw/fra_isa.edgelist',
 '../data/omw/eng_isa.edgelist',
 '../data/omw/spa_isa.edgelist']

In [17]:
global_train = []
global_test = []

all_data = glob.glob(os.path.join("../data/omw/", "*isa.edgelist"))
for path in tqdm(all_data):
    lang = path.replace("../data/omw/", "").replace("_isa.edgelist", "")

    G = nx.read_edgelist(path, delimiter="\t", create_using=nx.DiGraph)
    l = Leafer(G)
    train, test = l.split_train_test(
        generation_depth=3,  # до какого уровня в топ. сортировке идти
        p=0.05,  # вероятность что подходящий случай уйдет в тест
        p_divide_leafs=0.5,
        min_to_test_rate=0.5,
        weights=[0.01, 0.49, 0.4, 0.05, 0.05],
    )

        
    print(lang, len(train), len(test))

    random.shuffle(train)
    random.shuffle(test)

    name_train = "../../omw_datasets/train_" + lang + ".pickle"
    name_test = "../../omw_datasets/test_" + lang + ".pickle"

    with open(name_train, "wb") as handle:
        pickle.dump(train, handle)

    with open(name_test, "wb") as handle:
        pickle.dump(test, handle)

    global_train.extend(train)
    global_test.extend(test)

name_train = "../../omw_datasets/global_train.pickle"
name_test = "../../omw_datasets/global_test.pickle"


random.shuffle(global_train)
with open(name_train, "wb") as handle:
    pickle.dump(global_train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(global_test, handle)

  0%|          | 0/5 [00:00<?, ?it/s]

ita 538158 3675


 40%|████      | 2/5 [02:52<03:37, 72.51s/it] 

ru 26006 965
fra 136390 3307


 60%|██████    | 3/5 [03:32<01:55, 57.54s/it]

eng 324610 11118


100%|██████████| 5/5 [05:23<00:00, 64.70s/it]

spa 55235 1242





In [52]:
name_train = "../babel_datasets/wnet_only/global_train_babel.pickle"
name_test = "../babel_datasets/wnet_only/global_test_babel.pickle"

with open(name_train, "wb") as handle:
    pickle.dump(global_train, handle)

with open(name_test, "wb") as handle:
    pickle.dump(global_test, handle)

In [48]:
len(global_test), len(global_train)

(1980, 38808)

In [49]:
max_len = 0
k = 0
for vert in global_train:
    cur_len = len(vert["children"])
    if cur_len > 50:
        k += 1
        max_len = cur_len
        max_i = vert

In [51]:
max_i

{'children': 'conditions_normales_de_température_et_de_pression.n.1',
 'parents': ['atmosphère.n.1', 'température_standard.n.1'],
 'grandparents': None,
 'case': 'simple_triplet_2parent'}

In [17]:
def predict_child_with_parent_and_grandparent(elem):
    """
    hyperhypenym: arthropod.n.01,
    hypernym: insect.n.01, hyponyms:
    (blackly)

    hyperhypenym: elem['grandparents'],
    hypernym: elem['parents'], hyponyms:
    elem['children']

    Fly is a hyponym for the word “insect".
    Predict hyponyms for the word “fly”. Answer:
    """

    # transformed_term = (
    #     "hyperhypenym: "
    #     + ", ".join(elem["grandparents"])
    #     + ", hypernym: "
    #     + elem["parents"]
    #     + ", hyponyms:"
    # )
    transformed_term = (
        ", ".join(elem["grandparents"])
        + " are hyponyms for the word '"
        + elem["parents"]
        + "'. Predict hyponyms for the word '"
        + elem["parents"]
        + "'. Answer:"
    )
    return transformed_term