In [2]:
from scorer import GPTHypernymySuiteModel, HFLMScorer
import pandas as pd
import numpy as np
import os
from hypernymysuite.evaluation import all_evaluations
from tqdm import tqdm
import gensim
from leven import levenshtein
from nltk.corpus import wordnet as wn

In [3]:
def print_res_table(res, return_mean=False):
    """Format the headline benchmark scores as one space-separated row.

    Reads, in order, ``ap_test_inv`` (under ``'other'``) for the five
    ``siege_*`` datasets, ``acc_test_inv`` for the three ``dir_*`` datasets
    and ``rho_test_inv`` for ``cor_hyperlex``; the mean of those nine
    values is appended as a tenth column. Every value is rendered with two
    decimals and a decimal comma.

    Args:
        res: nested result dict indexed as described above.
        return_mean: when true, also return the unformatted mean.

    Returns:
        The formatted row, or ``(row, mean)`` if ``return_mean`` is set.
    """
    siege_sets = ('siege_bless', 'siege_eval', 'siege_leds', 'siege_shwartz', 'siege_weeds')
    direction_sets = ('dir_dbless', 'dir_wbless', 'dir_bibless')

    scores = [res[name]['other']['ap_test_inv'] for name in siege_sets]
    scores.extend(res[name]['acc_test_inv'] for name in direction_sets)
    scores.append(res['cor_hyperlex']['rho_test_inv'])

    mean = np.mean(scores)
    cells = [f'{val:.2f}'.replace('.', ',') for val in scores + [mean]]
    row = ' '.join(cells)
    return (row, mean) if return_mean else row


In [4]:
# Build the language-model scorer that the pattern-based models below are constructed with.
# NOTE(review): the device is hard-coded to 'cuda' — presumably this needs a GPU; confirm.
model_name = 'gpt2-xl'
device = 'cuda'
scorer = HFLMScorer(model_name, device)

Downloading:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [5]:
# Directory whose files are read (as tab-separated tables) to build the vocabulary.
eval_data_dir = 'data'

In [6]:
# Collect every word found in the `word1` / `word2` columns of the evaluation
# files, plus an '<OOV>' entry. Only the keys matter; every value is 1.
vocab = {'<OOV>': 1}
for file_name in os.listdir(eval_data_dir):
    pairs = pd.read_csv(os.path.join(eval_data_dir, file_name), sep='\t')
    for column in ('word1', 'word2'):
        for word in pairs[column]:
            vocab[word] = 1

## HYPERNYMY PATTERNS

In [7]:
# Prompt templates keyed by a short name. Each value is a one-element list
# holding a template string with `<hypo>` and `<hyper>` placeholders
# (presumably substituted with the word pair by the model — confirm in `scorer`).
PROMPTS = {
    'gen': ["<hyper> is more general than <hypo>"],
    'spec': ["<hypo> is more specific than <hyper>"],
    'type': ["<hypo> is a type of <hyper>"],
    # 'hyper1' is written in alternation notation; 'hyper2'-'hyper5' spell out individual variants.
    'hyper1': ["<hypo> which is a (example|class|kind|. . . ) of <hyper>"],
    'hyper2': ["<hypo> which is a example of <hyper>"],
    'hyper3': ["<hypo> which is a class of <hyper>"],
    'hyper4': ["<hypo> which is a kind of <hyper>"],
    'hyper5': ["<hypo> which is a type of <hyper>"],
    # 'hyper6' is written in alternation notation; 'hyper7'-'hyper10' are its four expansions.
    'hyper6': ["<hypo> (and|or) (any|some) other <hyper>"],
    'hyper7': ["<hypo> and any other <hyper>"],
    'hyper8': ["<hypo> and some other <hyper>"],
    'hyper9': ["<hypo> or any other <hyper>"],
    'hyper10': ["<hypo> or some other <hyper>"],
    'hyper11': ["<hypo> which is called <hyper>"],
    'hyper12': ["<hypo> a special case of <hyper>"],
    'hyper13': ["<hypo> is an <hyper> that"],
    # 'hyper14' is written in alternation notation; 'hyper15'-'hyper22' are its eight (lower-cased) expansions.
    'hyper14': ["(Unlike|like) (most|all|any|other) <hyper>, <hypo>"],
    'hyper15': ["unlike most <hyper>, <hypo>"],
    'hyper16': ["unlike all <hyper>, <hypo>"],
    'hyper17': ["unlike any <hyper>, <hypo>"],
    'hyper18': ["unlike other <hyper>, <hypo>"],
    'hyper19': ["like most <hyper>, <hypo>"],
    'hyper20': ["like all <hyper>, <hypo>"],
    'hyper21': ["like any <hyper>, <hypo>"],
    'hyper22': ["like other <hyper>, <hypo>"],
    'hyper23': ["<hyper> including <hypo>"],
    'hyper24': ["such <hyper> as <hypo>"],
}

In [7]:
# Combine the 'hyper10' and 'hyper24' templates and build the evaluation model
# from the scorer, these patterns and the vocabulary.
patterns = PROMPTS['hyper10'] + PROMPTS['hyper24']
print(patterns)
hs_model = GPTHypernymySuiteModel(scorer, patterns, vocab)

['<hypo> or some other <hyper>', 'such <hyper> as <hypo>']


In [11]:
# Run the hypernymysuite evaluations on the model and show the formatted score row
# (nine benchmark scores followed by their mean).
res = all_evaluations(hs_model)
print_res_table(res)

100%|██████████| 53/53 [00:06<00:00,  7.61it/s]
100%|██████████| 53/53 [00:05<00:00,  9.19it/s]
100%|██████████| 53/53 [00:06<00:00,  7.79it/s]
100%|██████████| 53/53 [00:05<00:00,  9.36it/s]
100%|██████████| 53/53 [00:06<00:00,  7.65it/s]
100%|██████████| 53/53 [00:05<00:00,  9.36it/s]
100%|██████████| 42/42 [00:05<00:00,  7.49it/s]
100%|██████████| 42/42 [00:04<00:00,  8.91it/s]
100%|██████████| 42/42 [00:05<00:00,  7.38it/s]
100%|██████████| 42/42 [00:04<00:00,  8.91it/s]
100%|██████████| 68/68 [00:08<00:00,  7.90it/s]
100%|██████████| 68/68 [00:07<00:00,  9.63it/s]
100%|██████████| 452/452 [01:04<00:00,  7.06it/s]
100%|██████████| 452/452 [00:54<00:00,  8.28it/s]
100%|██████████| 87/87 [00:11<00:00,  7.26it/s]
100%|██████████| 87/87 [00:09<00:00,  8.70it/s]
100%|██████████| 421/421 [00:47<00:00,  8.79it/s]
100%|██████████| 421/421 [00:40<00:00, 10.29it/s]
100%|██████████| 53/53 [00:07<00:00,  7.51it/s]
100%|██████████| 53/53 [00:05<00:00,  8.91it/s]
100%|██████████| 1644/1644 [04:4

'0,53 0,37 0,86 0,47 0,89 0,96 0,75 0,71 0,62 0,68'

## COMBINED PATTERNS

In [8]:
# Location of the pretrained fastText binary. Set FASTTEXT_MODEL_PATH to point
# somewhere else; the default is the original machine-specific path.
ft_model_name = os.environ.get('FASTTEXT_MODEL_PATH', r'D:\WorkFolder\emb\cc.en.300.bin')
# `FastText.load_fasttext_format` is deprecated (the recorded output of this cell
# shows a warning for that call). `load_facebook_model` is gensim's replacement
# and still returns a full model whose vectors are reachable through `ft.wv`.
ft = gensim.models.fasttext.load_facebook_model(ft_model_name)

  ft = gensim.models.FastText.load_fasttext_format(ft_model_name)


In [37]:
# Reload the reduced vector file in word2vec text format.
# NOTE(review): this rebinds `ft` (previously the full fastText model) and reads
# 'ft_wn_eval_06_06_23.wv', which is written by the `save_vectors(...)` cell further
# down — on a fresh kernel that cell has to run first.
ft = gensim.models.KeyedVectors.load_word2vec_format('ft_wn_eval_06_06_23.wv')

In [39]:
# For every vocabulary word, keep its 100 nearest neighbours as returned by
# `ft.most_similar` (a list of (word, similarity) pairs).
w2cands = {word: ft.most_similar(word, topn=100) for word in tqdm(vocab)}

100%|██████████| 46973/46973 [06:13<00:00, 125.82it/s]


In [40]:
# Spot-check: neighbours stored for 'crocodile'.
w2cands['crocodile']

[('alligator', 0.7467529773712158),
 ('snake', 0.641427218914032),
 ('hippo', 0.6413436532020569),
 ('caiman', 0.6291424632072449),
 ('lizard', 0.6144946813583374),
 ('hippopotamus', 0.6114556789398193),
 ('reptile', 0.6088621020317078),
 ('crocodilian', 0.5975377559661865),
 ('gator', 0.5892565250396729),
 ('shark', 0.5836324095726013),
 ('hyena', 0.5558685660362244),
 ('tiger', 0.5501530766487122),
 ('giraffe', 0.5469062328338623),
 ('stingray', 0.5343039035797119),
 ('ostrich', 0.529415488243103),
 ('platypus', 0.5281539559364319),
 ('frog', 0.5281187295913696),
 ('turtle', 0.5272694230079651),
 ('baboon', 0.5234726071357727),
 ('kangaroo', 0.5205118656158447),
 ('piranha', 0.5126773118972778),
 ('iguana', 0.5126669406890869),
 ('scorpion', 0.5103745460510254),
 ('elephant', 0.5072616934776306),
 ('eel', 0.5052809715270996),
 ('sharks', 0.5039006471633911),
 ('gavial', 0.5017820596694946),
 ('panther', 0.5000379085540771),
 ('python', 0.4993168115615845),
 ('dolphin', 0.497542232275

In [41]:
# Spot-check: neighbours stored for 'alligator'.
w2cands['alligator']

[('gator', 0.7823737263679504),
 ('crocodile', 0.746752917766571),
 ('caiman', 0.5879793167114258),
 ('reptile', 0.5848178267478943),
 ('snake', 0.5767461657524109),
 ('lizard', 0.5667709708213806),
 ('crocodilian', 0.5635765790939331),
 ('iguana', 0.5620105266571045),
 ('hippo', 0.5540482401847839),
 ('turtle', 0.5400021076202393),
 ('shark', 0.5371209383010864),
 ('frog', 0.5365213751792908),
 ('rattlesnake', 0.5361799001693726),
 ('stingray', 0.5354768633842468),
 ('giraffe', 0.5335032939910889),
 ('catfish', 0.5294216275215149),
 ('panther', 0.5294082164764404),
 ('manatee', 0.5258815288543701),
 ('octopus', 0.5207601189613342),
 ('hippopotamus', 0.5203745365142822),
 ('nutria', 0.520204484462738),
 ('armadillo', 0.5197222232818604),
 ('cottonmouth', 0.5195431113243103),
 ('ostrich', 0.5182366371154785),
 ('tarantula', 0.5140571594238281),
 ('eel', 0.5093511939048767),
 ('blacksnake', 0.509261965751648),
 ('raccoon', 0.504118800163269),
 ('opossum', 0.5039319396018982),
 ('pelican'

In [43]:
import json
import codecs

# Cache the neighbour lists on disk. The context manager guarantees the file is
# flushed and closed (the handle was previously left open).
with open('w2cands.json', 'w', encoding='utf-8') as w2cands_file:
    json.dump(w2cands, w2cands_file)

In [31]:
# Seed the combined vocabulary with the evaluation words and show its size.
total_vocab = set(vocab.keys())
len(total_vocab)

46973

In [32]:
# Sanity check of `str.isdigit`, which the next cell uses to drop purely numeric lemmas.
'123'.isdigit()

True

In [33]:
# Add every WordNet lemma name except purely numeric ones and those containing
# a dot. Uses the public `all_lemma_names()` iterator instead of reaching into
# the corpus reader's private `_lemma_pos_offset_map`.
total_vocab.update(w for w in wn.all_lemma_names() if not w.isdigit() and '.' not in w)
len(total_vocab)

177501

In [35]:
# Look up a vector for every word in the combined vocabulary.
w2v = {word: ft.wv[word] for word in tqdm(total_vocab)}

100%|██████████| 177501/177501 [00:02<00:00, 83338.89it/s]


In [36]:
import codecs
def save_vectors(wv, wv_path):
    """Write a ``{word: vector}`` mapping as a word2vec-style text file.

    The file starts with a ``"<count> <dim>"`` header line, followed by one
    ``"<word> <v1> <v2> ..."`` line per kept entry; no trailing newline is
    written. The dimensionality is taken from the first entry of ``wv``;
    entries whose vector length differs are skipped and not counted in the
    header.

    Args:
        wv: mapping from word to an array-like exposing ``.shape`` and
            iterable over its components.
        wv_path: destination path; an existing file is overwritten.

    An empty ``wv`` yields a file containing only the header ``"0 0"``
    instead of failing after the file has already been truncated.
    """
    # Peek at the first vector without materialising the whole key list.
    first_vector = next(iter(wv.values()), None)
    wv_size = 0 if first_vector is None else first_vector.shape[0]

    # Count matching entries lazily rather than building a throwaway dict copy.
    nwords = sum(1 for vector in wv.values() if vector.shape[0] == wv_size)

    # newline='\n' turns off platform newline translation, so the bytes written
    # are the same on every OS.
    with open(wv_path, 'w', encoding='utf-8', newline='\n') as file_descr:
        file_descr.write(f'{nwords} {wv_size}')
        for word in tqdm(wv):
            vector = wv[word]
            if vector.shape[0] != wv_size:
                continue
            values = ' '.join(str(val) for val in vector)
            file_descr.write(f'\n{word} {values}')


# Persist the collected vectors; the `load_word2vec_format` cell earlier in the
# notebook reads this same file name.
save_vectors(w2v, 'ft_wn_eval_06_06_23.wv')


100%|██████████| 177501/177501 [00:23<00:00, 7449.51it/s]


### preprocessing candidates 1

In [56]:
# Filter 1: drop neighbours that are near-duplicates of the query word — either
# one string contains the other, or the edit distance is at most half the query
# length. Survivors are stored as [similarity, candidate], capped at 100.
w2cands_f = {}
for w in tqdm(w2cands):
    min_edits = len(w) / 2
    kept = []
    for cand, sim in w2cands[w]:
        if cand in w or w in cand:
            continue
        if levenshtein(w, cand) > min_edits:
            kept.append([sim, cand])
    w2cands_f[w] = kept[:100]

100%|██████████| 46973/46973 [00:06<00:00, 7033.99it/s] 


### preprocessing candidates 2

In [57]:
# Filter 2: drop candidates that contain a dot or start/end with a hyphen.
# `startswith` / `endswith` are used instead of indexing so an empty candidate
# string cannot raise IndexError; empty candidates are dropped as well.
for w in tqdm(w2cands_f):
    w2cands_f[w] = [
        d for d in w2cands_f[w]
        if d[1]
        and '.' not in d[1]
        and not d[1].startswith('-')
        and not d[1].endswith('-')
    ][:100]

100%|██████████| 46973/46973 [00:00<00:00, 49445.26it/s]


### preprocessing candidates 3

In [58]:
# Filter 3: keep only candidates for which WordNet returns at least one synset.
for w in tqdm(w2cands_f):
    in_wordnet = []
    for entry in w2cands_f[w]:
        if wn.synsets(entry[1]):
            in_wordnet.append(entry)
    w2cands_f[w] = in_wordnet[:100]

100%|██████████| 46973/46973 [00:37<00:00, 1251.19it/s]


In [55]:
# Inspect the filtered candidates.
# NOTE(review): the recorded output is `{}` and this cell's execution count (55)
# precedes the filter cells (56-58), so the saved output is stale.
w2cands_f

{}

### candidates cohypo ranking

In [51]:
# Model built from the single template '<hypo> or <hyper>'; the ranking cell
# below uses it to score each (word, candidate) pair.
# NOTE: this rebinds `patterns`, which earlier held the hypernymy templates.
patterns = ['<hypo> or <hyper>']
cohypo_model = GPTHypernymySuiteModel(scorer, patterns, vocab)

In [54]:
# Number of words with filtered candidates.
# NOTE(review): the recorded value 0 comes from an execution (54) that ran before
# the filter cells (56-58) populated `w2cands_f`.
len(w2cands_f)

0

In [53]:
# Re-rank each word's first `max_l` candidates by the co-hyponym model score,
# highest first; entries become [score, candidate].
max_l = 100
for w in tqdm(w2cands_f):
    top = w2cands_f[w][:max_l]
    scores = cohypo_model.predict_many([w] * len(top), [cand[1] for cand in top])
    scored = [[scores[i], cand[1]] for i, cand in enumerate(top)]
    scored.sort(key=lambda pair: -pair[0])
    w2cands_f[w] = scored

0it [00:00, ?it/s]


In [None]:
# Keep the `topk` best-ranked candidates per word as its co-hyponyms.
# The ranked lists live in `w2cands_f` (the ranking cell writes its results back
# into that dict); `w2cands_f2` is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
topk = 3
word2cohypos = {w: [d[1] for d in w2cands_f[w][:topk]] for w in w2cands_f}
word2cohypos