In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm
import random
from leafer import Leafer
import numpy as np
import pickle
import os
import glob


seed = 42
random.seed(seed)
np.random.seed(seed)

In [25]:

class EnrichMeanReciprocalRank:
    """
     Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """

    def __init__(self):
        pass

    def __call__(self, pred_hyps, gold_hyps, r, *args):

        mean_mrr = 0
        for gold in gold_hyps:
            if gold in pred_hyps:
                rank = pred_hyps.index(gold)
                lefter_positive = sum(r[:rank])
                mean_mrr += 1/(rank + 1 - lefter_positive)

        return mean_mrr / len(gold_hyps)

    def __str__(self):
        return "MRR"

In [27]:
limit = 15
golds = ['a', 'b']
preds = ['a', 'c', 'b', 'd']

r = [0 for i in range(limit)]

for j in range(min(len(preds), limit)):
    pred_hyp = preds[j]
    if pred_hyp in golds:
        r[j] = 1

metric = EnrichMeanReciprocalRank()
metric(preds, golds, r)

[] 0
a
[1, 0] 1
b


0.75

In [11]:
preds[:1]

['a']

In [34]:


total_test = []

subsets = ['1A.english', '2A.medical', '2B.music']
for subset in subsets:

    data_path = '../../SemEval2018-Task9/test/data/' + subset + '.test.data.txt'
    gold_path = '../../SemEval2018-Task9/test/gold/' + subset + '.test.gold.txt'
    train_data_en_data = pd.read_csv(
                    data_path, header=None, sep="\t", names=["term", "relation"]
                )
    train_gold_en_data = pd.read_csv(gold_path, header=None, names=["hypernym"])

    df = pd.concat([train_data_en_data, train_gold_en_data], axis=1)[
        ["term", "hypernym"]
    ]

    test = []

    for elem in df.iterrows():
        idx, row = elem

        elem = {}
        elem["children"] = row['term']
        elem["parents"] = row['hypernym'].split('\t')
        elem["grandparents"] = None
        elem["case"] = "predict_multiple_hypernyms"

        test.append(elem)

    total_test += test

    test_name = '../../SemEval2018-Task9/custom_datasets/' + subset + '.pickle'

    # with open(test_name, 'wb') as f:
    #     pickle.dump(test, f)


In [6]:
test_name = '../../SemEval2018-Task9/custom_datasets/test_it.pickle'

with open(test_name, 'wb') as f:
    pickle.dump(test, f)

In [57]:
# data_path = '../../SemEval2018-Task9/training/data/1A.english.training.data.txt'
# gold_path = '../../SemEval2018-Task9/training/gold/1A.english.training.gold.txt'

# train_data_en_data = pd.read_csv(
#                 data_path, header=None, sep="\t", names=["term", "relation"]
#             )
# train_gold_en_data = pd.read_csv(gold_path, header=None, names=["hypernym"])

# df = pd.concat([train_data_en_data, train_gold_en_data], axis=1)[
#     ["term", "hypernym"]
# ]

total_train = []

subsets = ['1A.english', '2A.medical', '2B.music']
for subset in subsets:

    data_path = '../../SemEval2018-Task9/training/data/' + subset + '.training.data.txt'
    gold_path = '../../SemEval2018-Task9/training/gold/' + subset + '.training.gold.txt'
    print(data_path)
    train_data_en_data = pd.read_csv(
                    data_path, header=None, sep="\t", names=["term", "relation"]
                )
    train_gold_en_data = pd.read_csv(gold_path, header=None, names=["hypernym"])

    df = pd.concat([train_data_en_data, train_gold_en_data], axis=1)[
        ["term", "hypernym"]
    ]

    train = []

    for elem in df.iterrows():
        idx, row = elem

        elem = {}
        elem["children"] = row['term']
        elem["parents"] = row['hypernym'].split('\t')
        elem["grandparents"] = None
        elem["case"] = "predict_multiple_hypernyms"

        train.append(elem)

    total_train += train

    train_name = '../../SemEval2018-Task9/custom_datasets/' + subset + '_train.pickle'

    with open(train_name, 'wb') as f:
        pickle.dump(train, f)


../../SemEval2018-Task9/training/data/1A.english.training.data.txt
../../SemEval2018-Task9/training/data/2A.medical.training.data.txt
../../SemEval2018-Task9/training/data/2B.music.training.data.txt


In [58]:
with open('/home/LLM_Taxonomy/SemEval2018-Task9/custom_datasets/2A.medical_train.pickle', 'rb') as f:
    train = pickle.load(f)

In [59]:
import sys

sys.path.append('../../pipeline_src/')
from dataset.prompt_schemas import predict_multiple_parents_from_child

In [60]:
for obj in train:
    predict_multiple_parents_from_child(obj)

In [50]:
obj

{'children': nan,
 'parents': ['function', 'renal function', 'blood flow', 'flow'],
 'grandparents': None,
 'case': 'predict_multiple_hypernyms'}

In [55]:
subset = '2A.medical'
data_path = '../../SemEval2018-Task9/training/data/' + subset + '.training.data.txt'

train_data_en_data = pd.read_csv(
                data_path, header=None, sep="\t", names=["term", "relation"]
            )

In [56]:
train_data_en_data

Unnamed: 0,term,relation
0,bone spur,Concept
1,endodontics,Concept
2,recurrent cancer,Concept
3,neurohypophyseal diabetes insipidus,Concept
4,fixed orthodontic appliance,Concept
...,...,...
495,continuous blood sampling,Concept
496,acute respiratory infection,Concept
497,primary pulmonary lymphoma,Concept
498,psychoactive substance dependence,Concept


In [27]:
train_name = '../../SemEval2018-Task9/custom_datasets/train.pickle'

with open(train_name, 'wb') as f:
    pickle.dump(train, f)

In [4]:
G = nx.read_edgelist("../data/only_en_wordnet.edgelist", delimiter="\t", create_using=nx.DiGraph)

In [5]:
l = Leafer(G)


In [6]:
train_wnet, test_wnet = l.split_train_test(
    generation_depth=3,  # до какого уровня в топ. сортировке идти
    p=0.0,  # вероятность что подходящий случай уйдет в тест
    p_divide_leafs=0.5,
    # вероятность что листья поделим пополам трейн-тест
    # а не засунем целый случай в трейн или в тест
    min_to_test_rate=0.5,
    # минимальное количество доли вершин которых не было в
    # трейне чтобы поделить пополам на трейн-тест
    # то есть если 6\10 вершин были трейне то значит все 10 в трейн
    # если 5\10 были в трейне, то значит оставшиеся можем кинуть в тест
    weights=[0.00, 0.0, 0.0, 0.00, 0.00, 1.],
    # веса в соответствии
    # один ребенок, только листья, не только листья
    # триплеты с 2 родителями, триплеты такие что мидл нода имеет
    # 1 ребенка,
    #p_parent=1
)

Parent
Child
WIKI:EN:daylight_saving_time
WIKI:EN:capillary_hemangioma
predict_hypernym 40406 40406


In [8]:
test_path = '../../SemEval2018-Task9/custom_datasets/1A.english.pickle'
with open(test_path, 'rb') as f:
    test = pickle.load(f)

In [9]:
test_verteces = [elem['children'] for elem in test]

In [13]:
counter = 0
for elem in train_wnet:
    if elem['children'].split('.')[0] in test_verteces:
        counter += 1
        train_wnet.remove(elem)

In [14]:
counter

246

In [None]:
train_augmented_name = '../../SemEval2018-Task9/custom_datasets/only_wnet_train.pickle'

with open(train_augmented_name, 'wb') as f:
    pickle.dump(train_wnet, f)

In [31]:
train_augmented = train_wnet + train

In [32]:
random.shuffle(train_augmented)

In [33]:
train_augmented_name = '../../SemEval2018-Task9/custom_datasets/train_with_wnet_with_subsets.pickle'

with open(train_augmented_name, 'wb') as f:
    pickle.dump(train_augmented, f)