In [4]:
import os
import sys
import torch
import pandas as pd
from torch import nn
import numpy as np

sys.path.append("../NLP-DL-Project-hypo-to-hyper/pipeline_src/")

from config.config import TaskConfig
from dataset.dataset import init_data, HypernymDataset

from dataset.prompt_schemas import (
    hypo_term_hyper,
    predict_child_from_2_parents,
    predict_child_from_parent,
    predict_child_with_parent_and_grandparent,
    predict_children_with_parent_and_brothers,
    predict_parent_from_child_granparent,
)

# --- Notebook configuration -------------------------------------------------
# Fixed seed shared by the seeding calls below.
SEED = 0

# Device identifier string; this notebook is pinned to the CPU.
device = "cpu"

# Seed the default torch generator and the CUDA generator with the same value.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Report how many CUDA devices torch can see.
print(torch.cuda.device_count())

0


In [5]:
from transformers import AutoTokenizer, AutoConfig, LlamaTokenizer

In [6]:
# Location of the English test split (relative to the notebook's working dir).
test_path = "babel_datasets/test_en_babel.pickle"

# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
df = pd.read_pickle(test_path)

# Lookup from a sample's case name to one of the callables imported from
# dataset.prompt_schemas. Several cases intentionally share the same callable.
transforms = {
    "only_child_leaf": predict_parent_from_child_granparent,
    "only_leafs_all": predict_child_from_parent,
    "only_leafs_divided": predict_children_with_parent_and_brothers,
    "leafs_and_no_leafs": predict_child_from_parent,
    "simple_triplet_grandparent": predict_parent_from_child_granparent,
    "simple_triplet_2parent": predict_child_from_2_parents,
}

In [16]:
def get_stats(df):
    """Collect the distinct terms and per-case sample counts of a dataset.

    Args:
        df: Iterable of records. Each record is indexed by the keys
            'case', 'children', 'parents' and 'grandparents'.
            (Despite the name, iteration must yield records, not column
            labels -- presumably a list of dicts; verify against the pickle.)

    Returns:
        A ``(uniq, cases)`` tuple where ``uniq`` is the set of every value
        found under 'children', 'parents' and 'grandparents' (list values
        contribute each of their elements; falsy scalar values are skipped),
        and ``cases`` maps each of the six known case names to the number of
        records carrying it.

    Raises:
        KeyError: If a record's 'case' is not one of the six known names, or
            a dict record lacks one of the expected keys.
    """
    case_names = (
        "only_child_leaf",
        "only_leafs_all",
        "only_leafs_divided",
        "leafs_and_no_leafs",
        "simple_triplet_grandparent",
        "simple_triplet_2parent",
    )
    cases = dict.fromkeys(case_names, 0)
    uniq = set()

    for record in df:
        # Plain indexing on purpose: an unexpected case name must fail loudly.
        cases[record['case']] += 1

        for field in ('children', "parents", "grandparents"):
            value = record[field]
            if isinstance(value, list):
                # Every element of a list is kept, falsy ones included.
                uniq.update(value)
            elif value:
                # Scalars are kept only when truthy (skips None / "").
                uniq.add(value)

    return uniq, cases

In [20]:
# Print unique-term and per-case counts for the train/test split of each language.
for language in ("en", "de", "es", "fr", "it", "ru"):
    print("Language: ", language)

    # Both splits follow the same naming scheme, keyed by language code.
    train_path = f"babel_datasets/train_{language}_babel.pickle"
    test_path = f"babel_datasets/test_{language}_babel.pickle"

    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
    train_df = pd.read_pickle(train_path)
    test_df = pd.read_pickle(test_path)

    train_uniq, train_cases = get_stats(train_df)
    test_uniq, test_cases = get_stats(test_df)

    print("Train uniques: ", len(train_uniq))
    print("Train Stats: ", train_cases, sep="\n")

    print("Test uniques: ", len(test_uniq))
    print("Test Stats: ", test_cases, sep="\n")

    print('====================')

Language:  en
Train uniques:  61036
Train Stats: 
{'only_child_leaf': 5337, 'only_leafs_all': 2707, 'only_leafs_divided': 2534, 'leafs_and_no_leafs': 4307, 'simple_triplet_grandparent': 6055, 'simple_triplet_2parent': 3500}
Test uniques:  3264
Test Stats: 
{'only_child_leaf': 304, 'only_leafs_all': 129, 'only_leafs_divided': 135, 'leafs_and_no_leafs': 190, 'simple_triplet_grandparent': 331, 'simple_triplet_2parent': 122}
Language:  de
Train uniques:  25686
Train Stats: 
{'only_child_leaf': 2254, 'only_leafs_all': 1184, 'only_leafs_divided': 1137, 'leafs_and_no_leafs': 1427, 'simple_triplet_grandparent': 1895, 'simple_triplet_2parent': 3385}
Test uniques:  1427
Test Stats: 
{'only_child_leaf': 118, 'only_leafs_all': 52, 'only_leafs_divided': 58, 'leafs_and_no_leafs': 67, 'simple_triplet_grandparent': 93, 'simple_triplet_2parent': 134}
Language:  es
Train uniques:  32129
Train Stats: 
{'only_child_leaf': 2926, 'only_leafs_all': 1324, 'only_leafs_divided': 1190, 'leafs_and_no_leafs': 2112

In [21]:
# Same report as for the full datasets, but for the "wnet_"-prefixed pickles
# (only the English files are processed here).
for language in ("en",):
    print("Language: ", language)

    train_path = f"babel_datasets/wnet_train_{language}_babel.pickle"
    test_path = f"babel_datasets/wnet_test_{language}_babel.pickle"

    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
    train_df = pd.read_pickle(train_path)
    test_df = pd.read_pickle(test_path)

    train_uniq, train_cases = get_stats(train_df)
    test_uniq, test_cases = get_stats(test_df)

    print("Train uniques: ", len(train_uniq))
    print("Train Stats: ", train_cases, sep="\n")

    print("Test uniques: ", len(test_uniq))
    print("Test Stats: ", test_cases, sep="\n")

    print('====================')

Language:  en
Train uniques:  39925
Train Stats: 
{'only_child_leaf': 4093, 'only_leafs_all': 1616, 'only_leafs_divided': 2306, 'leafs_and_no_leafs': 2646, 'simple_triplet_grandparent': 3533, 'simple_triplet_2parent': 1343}
Test uniques:  2296
Test Stats: 
{'only_child_leaf': 213, 'only_leafs_all': 75, 'only_leafs_divided': 114, 'leafs_and_no_leafs': 129, 'simple_triplet_grandparent': 193, 'simple_triplet_2parent': 72}
