In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

def transform_mask(raw_mask):
    c_true_mask = raw_mask.copy()
    c_true_mask[0] = 0
    c_true_mask[c_true_mask.sum()] = 0
    c_true_mask = c_true_mask.astype(bool)
    
    return c_true_mask


class HFLMScorer():
    def __init__(self, model_name, device='cpu'):
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
    
    def score_batch(self, batch):
        input = self.tokenizer.batch_encode_plus([self.tokenizer.eos_token + s + self.tokenizer.eos_token for s in batch], padding=True, return_tensors='pt')
        ids_np = input['input_ids'].detach().numpy()
        ids = input['input_ids'].to(self.model.device)
        mask = input['attention_mask'].numpy()
        with torch.no_grad():
            r = self.model(ids)[0]
            r = torch.nn.LogSoftmax(dim=-1)(r).cpu().detach().numpy()

        scores = []
        for ci in range(r.shape[0]):
            c_true_mask = transform_mask(mask[ci])
            score = r[ci, range(c_true_mask.sum()), ids_np[ci][c_true_mask]].sum()
            scores.append(score)

        return scores

    def score_sentences(self, sentences, split_size=32):
        batch_count = len(sentences) // split_size + int(len(sentences) % split_size != 0)
        scores = []
        for i in tqdm(range(batch_count)):
            scores += self.score_batch(sentences[i * split_size: (i + 1) * split_size])
        return scores

In [2]:
from hypernymysuite.evaluation import all_evaluations
from hypernymysuite.base import HypernymySuiteModel
import os
import pandas as pd
import numpy as np


class GPTHypernymySuiteModel(HypernymySuiteModel):
    def __init__(self, model, patterns, eval_data_dir):
        #super(GPTHypernymySuiteModel, self).__init__()
        self.model = model
        self.patterns = patterns
        self.vocab['<OOV>'] = 1
        for file_name in os.listdir(eval_data_dir):
            file_path = os.path.join(eval_data_dir, file_name)
            df = pd.read_csv(file_path, sep='\t')
            for w in df['word1']:
                self.vocab[w] = 1
            for w in df['word2']:
                self.vocab[w] = 1

    def predict(self, hypo, hyper):
        all_res = []
        for pattern in self.patterns:
            res = self.model.score_sentences([self.generate_sentence(pattern, hypo, hyper)])
            all_res.append(res[0])
        return np.mean(all_res)

    def predict_many(self, hypos, hypers):
        all_res = []
        for pattern in self.patterns:
            sentences = []
            for x, y in zip(hypos, hypers):
                sentences.append(self.generate_sentence(pattern, x, y))
            res = np.array(self.model.score_sentences(sentences))
            all_res.append(res)
        #print(all_res)
        return np.mean(all_res, axis=0)

    def generate_sentence(self, pattern, hypo, hyper):
        return pattern.replace('<hypo>', hypo).replace('<hyper>', hyper)
    
def print_res_table(res, return_mean=False):
    metrics = []
    metrics.append(res['siege_bless']['other']['ap_test_inv'])
    metrics.append(res['siege_eval']['other']['ap_test_inv'])
    metrics.append(res['siege_leds']['other']['ap_test_inv'])
    metrics.append(res['siege_shwartz']['other']['ap_test_inv'])
    metrics.append(res['siege_weeds']['other']['ap_test_inv'])

    metrics.append(res['dir_dbless']['acc_test_inv'])
    metrics.append(res['dir_wbless']['acc_test_inv'])
    metrics.append(res['dir_bibless']['acc_test_inv'])

    metrics.append(res['cor_hyperlex']['rho_test_inv'])
    mean = np.mean(metrics)
    metrics.append(mean)
    metrics = [f'{val:.2f}'.replace('.', ',') for val in metrics]
    if return_mean:
        return ' '.join(metrics), mean
    return ' '.join(metrics)



In [3]:
PROMPTS = {
    'gen': "<hyper> is more general than <hypo>",
    'spec': "<hypo> is more specific than <hyper>",
    'type': "<hypo> is a type of <hyper>",
    'hyper1': "<hypo> which is a (example|class|kind|. . . ) of <hyper>",
    'hyper2': "<hypo> which is a example of <hyper>",
    'hyper3': "<hypo> which is a class of <hyper>",
    'hyper4': "<hypo> which is a kind of <hyper>",
    'hyper5': "<hypo> which is a type of <hyper>",
    'hyper6': "<hypo> (and|or) (any|some) other <hyper>",
    'hyper7': "<hypo> and any other <hyper>",
    'hyper8': "<hypo> and some other <hyper>",
    'hyper9': "<hypo> or any other <hyper>",
    'hyper10': "<hypo> or some other <hyper>",
    'hyper11': "<hypo> which is called <hyper>",
    'hyper12': "<hypo> a special case of <hyper>",
    'hyper13': "<hypo> is an <hyper> that",
    'hyper14': "(Unlike|like) (most|all|any|other) <hyper>, <hypo>",
    'hyper15': "unlike most <hyper>, <hypo>",
    'hyper16': "unlike all <hyper>, <hypo>",
    'hyper17': "unlike any <hyper>, <hypo>",
    'hyper18': "unlike other <hyper>, <hypo>",
    'hyper19': "like most <hyper>, <hypo>",
    'hyper20': "like all <hyper>, <hypo>",
    'hyper21': "like any <hyper>, <hypo>",
    'hyper22': "like other <hyper>, <hypo>",
    'hyper23': "<hyper> including <hypo>",
    'hyper24': "such <hyper> as <hypo>"
}

In [4]:
patterns = ['<hypo> or some other <hyper>', '<hypo> or any other <hyper>']
scorer = HFLMScorer('gpt2', 'cuda')
hs_model = GPTHypernymySuiteModel(scorer, patterns, 'data')


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [5]:
hs_model.predict_many(['cat', 'cat', 'cat'], ['animal', 'thing', 'dog'])

100%|██████████| 1/1 [00:00<00:00,  9.12it/s]
100%|██████████| 1/1 [00:00<00:00, 124.46it/s]

[array([-29.846237, -29.836548, -30.710709], dtype=float32), array([-26.224533, -28.336967, -26.962746], dtype=float32)]





array([-28.035385, -29.086758, -28.836727], dtype=float32)

In [7]:
0.5 * (-26.224533 + -29.846237)

-28.035384999999998

In [9]:
res = all_evaluations(hs_model)
print_res_table(res)

100%|██████████| 53/53 [00:01<00:00, 48.44it/s]
100%|██████████| 53/53 [00:01<00:00, 50.34it/s]
100%|██████████| 53/53 [00:01<00:00, 50.36it/s]
100%|██████████| 42/42 [00:00<00:00, 49.43it/s]
100%|██████████| 42/42 [00:00<00:00, 49.05it/s]
100%|██████████| 68/68 [00:01<00:00, 52.36it/s]
100%|██████████| 452/452 [00:09<00:00, 47.56it/s]
100%|██████████| 87/87 [00:01<00:00, 48.62it/s]
100%|██████████| 421/421 [00:07<00:00, 59.32it/s]
100%|██████████| 53/53 [00:01<00:00, 49.65it/s]
100%|██████████| 1644/1644 [00:41<00:00, 39.99it/s]


'0,38 0,35 0,81 0,47 0,83 0,91 0,72 0,64 0,35'

In [10]:
pattern = '<hypo> or some other <hyper>'
scorer = HFLMScorer('gpt2-medium', 'cuda')
hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [11]:
res = all_evaluations(hs_model)
print_res_table(res)

100%|██████████| 53/53 [00:02<00:00, 25.33it/s]
100%|██████████| 53/53 [00:02<00:00, 25.79it/s]
100%|██████████| 53/53 [00:02<00:00, 25.66it/s]
100%|██████████| 42/42 [00:01<00:00, 25.28it/s]
100%|██████████| 42/42 [00:01<00:00, 25.06it/s]
100%|██████████| 68/68 [00:02<00:00, 26.37it/s]
100%|██████████| 452/452 [00:18<00:00, 24.44it/s]
100%|██████████| 87/87 [00:03<00:00, 25.01it/s]
100%|██████████| 421/421 [00:13<00:00, 30.47it/s]
100%|██████████| 53/53 [00:02<00:00, 25.55it/s]
100%|██████████| 1644/1644 [01:18<00:00, 20.91it/s]


'0,42 0,34 0,82 0,44 0,85 0,92 0,71 0,64 0,46'

In [12]:
pattern = '<hypo> or some other <hyper>'
scorer = HFLMScorer('gpt2-large', 'cuda')
hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [13]:
res = all_evaluations(hs_model)
print_res_table(res)

100%|██████████| 53/53 [00:03<00:00, 14.90it/s]
100%|██████████| 53/53 [00:03<00:00, 15.07it/s]
100%|██████████| 53/53 [00:03<00:00, 14.92it/s]
100%|██████████| 42/42 [00:02<00:00, 14.68it/s]
100%|██████████| 42/42 [00:02<00:00, 14.52it/s]
100%|██████████| 68/68 [00:04<00:00, 15.21it/s]
100%|██████████| 452/452 [00:32<00:00, 14.05it/s]
100%|██████████| 87/87 [00:05<00:00, 14.53it/s]
100%|██████████| 421/421 [00:24<00:00, 16.99it/s]
100%|██████████| 53/53 [00:03<00:00, 14.95it/s]
100%|██████████| 1644/1644 [02:23<00:00, 11.44it/s]


'0,43 0,36 0,85 0,45 0,86 0,95 0,74 0,67 0,46'

In [5]:
pattern = '<hypo> or some other <hyper>'
scorer = HFLMScorer('gpt2-xl', 'cuda')
hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')


In [6]:
res = all_evaluations(hs_model)
print_res_table(res)

100%|██████████| 53/53 [00:06<00:00,  8.48it/s]
100%|██████████| 53/53 [00:06<00:00,  8.63it/s]
100%|██████████| 53/53 [00:06<00:00,  8.50it/s]
100%|██████████| 42/42 [00:05<00:00,  8.28it/s]
100%|██████████| 42/42 [00:05<00:00,  8.18it/s]
100%|██████████| 68/68 [00:07<00:00,  8.75it/s]
100%|██████████| 452/452 [00:57<00:00,  7.83it/s]
100%|██████████| 87/87 [00:10<00:00,  8.06it/s]
100%|██████████| 421/421 [00:43<00:00,  9.73it/s]
100%|██████████| 53/53 [00:06<00:00,  8.32it/s]
100%|██████████| 1644/1644 [04:19<00:00,  6.32it/s]


'0,46 0,34 0,87 0,46 0,86 0,94 0,72 0,67 0,54'

In [7]:
res

{'dir_wbless': {'acc_val_inv': 0.8098712173183651,
  'acc_test_inv': 0.7239376844430171},
 'dir_bibless': {'acc_val_inv': 0.7572009182959314,
  'acc_test_inv': 0.6664807387807571},
 'dir_dbless': {'acc_val': 0.9285714285714286,
  'acc_test': 0.9373433583959899,
  'acc_all': 0.9364248317127898,
  'acc_val_inv': 0.9285714285714286,
  'acc_test_inv': 0.9373433583959899,
  'acc_all_inv': 0.9364248317127898,
  'num_val': 140,
  'num_test': 1197,
  'num_oov_all': 0,
  'pct_oov_all': 0.0},
 'cor_hyperlex': {'rho_train': 0.5586631768738135,
  'rho_val': 0.5447425448058909,
  'rho_test': 0.5604587457824581,
  'rho_all': 0.5582632667721859,
  'rho_train_inv': 0.5586631768738135,
  'rho_val_inv': 0.5447425448058909,
  'rho_test_inv': 0.5447425448058909,
  'rho_all_inv': 0.5582632667721859,
  'num_all': 2163,
  'num_oov_all': 0,
  'pct_oov_all': 0.0},
 'siege_bless': {'other': {'ap_val': 0.44068169322138595,
   'ap_test': 0.45638594417953743,
   'ap100_val': 0.6696403942403542,
   'ap100_test': 0.

In [16]:
res

{'dir_wbless': {'acc_val_inv': 0.7994900147805672,
  'acc_test_inv': 0.7087304270697137},
 'dir_bibless': {'acc_val_inv': 0.7305384070006443,
  'acc_test_inv': 0.6403372970854118},
 'dir_dbless': {'acc_val': 0.9071428571428571,
  'acc_test': 0.9239766081871345,
  'acc_all': 0.9222139117427075,
  'acc_val_inv': 0.9071428571428571,
  'acc_test_inv': 0.9239766081871345,
  'acc_all_inv': 0.9222139117427075,
  'num_val': 140,
  'num_test': 1197,
  'num_oov_all': 0,
  'pct_oov_all': 0.0},
 'cor_hyperlex': {'rho_train': 0.4838805421075947,
  'rho_val': 0.45648896447855586,
  'rho_test': 0.4749770722254598,
  'rho_all': 0.4807383302613861,
  'rho_train_inv': 0.4838805421075947,
  'rho_val_inv': 0.45648896447855586,
  'rho_test_inv': 0.45648896447855586,
  'rho_all_inv': 0.4807383302613861,
  'num_all': 2163,
  'num_oov_all': 0,
  'pct_oov_all': 0.0},
 'siege_bless': {'other': {'ap_val': 0.38770808700372295,
   'ap_test': 0.4244664392510192,
   'ap100_val': 0.5558835333674718,
   'ap100_test': 

In [3]:
PROMPTS = {
    'gen': "<hyper> is more general than <hypo>",
    'spec': "<hypo> is more specific than <hyper>",
    'type': "<hypo> is a type of <hyper>",
    'hyper1': "<hypo> which is a (example|class|kind|. . . ) of <hyper>",
    'hyper2': "<hypo> which is a example of <hyper>",
    'hyper3': "<hypo> which is a class of <hyper>",
    'hyper4': "<hypo> which is a kind of <hyper>",
    'hyper5': "<hypo> which is a type of <hyper>",
    'hyper6': "<hypo> (and|or) (any|some) other <hyper>",
    'hyper7': "<hypo> and any other <hyper>",
    'hyper8': "<hypo> and some other <hyper>",
    'hyper9': "<hypo> or any other <hyper>",
    'hyper10': "<hypo> or some other <hyper>",
    'hyper11': "<hypo> which is called <hyper>",
    'hyper12': "<hypo> a special case of <hyper>",
    'hyper13': "<hypo> is an <hyper> that",
    'hyper14': "(Unlike|like) (most|all|any|other) <hyper>, <hypo>",
    'hyper15': "unlike most <hyper>, <hypo>",
    'hyper16': "unlike all <hyper>, <hypo>",
    'hyper17': "unlike any <hyper>, <hypo>",
    'hyper18': "unlike other <hyper>, <hypo>",
    'hyper19': "like most <hyper>, <hypo>",
    'hyper20': "like all <hyper>, <hypo>",
    'hyper21': "like any <hyper>, <hypo>",
    'hyper22': "like other <hyper>, <hypo>",
    'hyper23': "<hyper> including <hypo>",
}

In [10]:
model_name = 'gpt2-xl'
scorer = HFLMScorer(model_name, 'cuda')

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [10]:
import codecs
import json

for prompt_name in PROMPTS:
    pattern = PROMPTS[prompt_name]
    hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')
    res = all_evaluations(hs_model)
    print(model_name, ' ', pattern)
    print(print_res_table(res))

    res['pattern'] = pattern
    with codecs.open(f'output/{model_name}_{prompt_name}.json', 'w', 'utf-8') as file_descr:
        json.dump(res, file_descr)

100%|██████████| 53/53 [00:07<00:00,  6.80it/s]
100%|██████████| 53/53 [00:07<00:00,  7.04it/s]
100%|██████████| 53/53 [00:07<00:00,  7.00it/s]
100%|██████████| 42/42 [00:06<00:00,  6.76it/s]
100%|██████████| 42/42 [00:06<00:00,  6.82it/s]
100%|██████████| 68/68 [00:09<00:00,  7.24it/s]
100%|██████████| 452/452 [01:08<00:00,  6.60it/s]
100%|██████████| 87/87 [00:13<00:00,  6.53it/s]
100%|██████████| 421/421 [00:53<00:00,  7.83it/s]
100%|██████████| 53/53 [00:07<00:00,  6.72it/s]
100%|██████████| 1644/1644 [05:06<00:00,  5.36it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hyper> is more general than <hypo>
0,10 0,25 0,64 0,34 0,50 0,31 0,49 0,25 0,11 0,33


100%|██████████| 53/53 [00:07<00:00,  6.68it/s]
100%|██████████| 53/53 [00:07<00:00,  6.78it/s]
100%|██████████| 53/53 [00:07<00:00,  6.79it/s]
100%|██████████| 42/42 [00:06<00:00,  6.63it/s]
100%|██████████| 42/42 [00:06<00:00,  6.56it/s]
100%|██████████| 68/68 [00:09<00:00,  7.10it/s]
100%|██████████| 452/452 [01:11<00:00,  6.36it/s]
100%|██████████| 87/87 [00:13<00:00,  6.48it/s]
100%|██████████| 421/421 [00:54<00:00,  7.75it/s]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
100%|██████████| 53/53 [00:07<00:00,  6.68it/s]
100%|██████████| 1644/1644 [05:07<00:00,  5.35it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> is more specific than <hyper>
0,11 0,23 0,66 0,35 0,54 0,59 0,52 0,37 0,22 0,40


100%|██████████| 53/53 [00:07<00:00,  6.68it/s]
100%|██████████| 53/53 [00:07<00:00,  6.78it/s]
100%|██████████| 53/53 [00:07<00:00,  6.79it/s]
100%|██████████| 42/42 [00:06<00:00,  6.63it/s]
100%|██████████| 42/42 [00:06<00:00,  6.55it/s]
100%|██████████| 68/68 [00:09<00:00,  7.10it/s]
100%|██████████| 452/452 [01:11<00:00,  6.36it/s]
100%|██████████| 87/87 [00:13<00:00,  6.48it/s]
100%|██████████| 421/421 [00:54<00:00,  7.75it/s]
100%|██████████| 53/53 [00:07<00:00,  6.68it/s]
100%|██████████| 1644/1644 [05:07<00:00,  5.35it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> is a type of <hyper>
0,38 0,34 0,87 0,44 0,81 0,91 0,69 0,63 0,53 0,62


100%|██████████| 53/53 [00:14<00:00,  3.55it/s]
100%|██████████| 53/53 [00:14<00:00,  3.56it/s]
100%|██████████| 53/53 [00:14<00:00,  3.55it/s]
100%|██████████| 42/42 [00:12<00:00,  3.49it/s]
100%|██████████| 42/42 [00:12<00:00,  3.48it/s]
100%|██████████| 68/68 [00:19<00:00,  3.57it/s]
100%|██████████| 452/452 [02:12<00:00,  3.42it/s]
100%|██████████| 87/87 [00:25<00:00,  3.47it/s]
100%|██████████| 421/421 [01:51<00:00,  3.76it/s]
100%|██████████| 53/53 [00:14<00:00,  3.54it/s]
100%|██████████| 1644/1644 [08:50<00:00,  3.10it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> which is a (example|class|kind|. . . ) of <hyper>
0,19 0,28 0,66 0,40 0,67 0,63 0,60 0,46 0,38 0,47


100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 53/53 [00:08<00:00,  6.33it/s]
100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 42/42 [00:06<00:00,  6.13it/s]
100%|██████████| 42/42 [00:06<00:00,  6.08it/s]
100%|██████████| 68/68 [00:10<00:00,  6.44it/s]
100%|██████████| 452/452 [01:16<00:00,  5.92it/s]
100%|██████████| 87/87 [00:14<00:00,  6.09it/s]
100%|██████████| 421/421 [00:59<00:00,  7.07it/s]
100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 1644/1644 [05:27<00:00,  5.02it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> which is a example of <hyper>
0,19 0,25 0,69 0,39 0,70 0,80 0,62 0,54 0,33 0,50


100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 53/53 [00:08<00:00,  6.32it/s]
100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 42/42 [00:06<00:00,  6.12it/s]
100%|██████████| 42/42 [00:06<00:00,  6.08it/s]
100%|██████████| 68/68 [00:10<00:00,  6.44it/s]
100%|██████████| 452/452 [01:16<00:00,  5.91it/s]
100%|██████████| 87/87 [00:14<00:00,  6.08it/s]
100%|██████████| 421/421 [00:59<00:00,  7.06it/s]
100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 1644/1644 [05:27<00:00,  5.02it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> which is a class of <hyper>
0,34 0,28 0,80 0,42 0,79 0,84 0,68 0,60 0,51 0,59


100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 53/53 [00:08<00:00,  6.33it/s]
100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 42/42 [00:06<00:00,  6.13it/s]
100%|██████████| 42/42 [00:06<00:00,  6.08it/s]
100%|██████████| 68/68 [00:10<00:00,  6.44it/s]
100%|██████████| 452/452 [01:16<00:00,  5.92it/s]
100%|██████████| 87/87 [00:14<00:00,  6.08it/s]
100%|██████████| 421/421 [00:59<00:00,  7.06it/s]
100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 1644/1644 [05:27<00:00,  5.02it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> which is a kind of <hyper>
0,25 0,28 0,75 0,41 0,69 0,75 0,60 0,50 0,47 0,52


100%|██████████| 53/53 [00:08<00:00,  6.27it/s]
100%|██████████| 53/53 [00:08<00:00,  6.32it/s]
100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 42/42 [00:06<00:00,  6.12it/s]
100%|██████████| 42/42 [00:06<00:00,  6.07it/s]
100%|██████████| 68/68 [00:10<00:00,  6.44it/s]
100%|██████████| 452/452 [01:16<00:00,  5.91it/s]
100%|██████████| 87/87 [00:14<00:00,  6.09it/s]
100%|██████████| 421/421 [00:59<00:00,  7.05it/s]
100%|██████████| 53/53 [00:08<00:00,  6.26it/s]
100%|██████████| 1644/1644 [05:27<00:00,  5.02it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> which is a type of <hyper>
0,28 0,29 0,77 0,42 0,71 0,78 0,61 0,52 0,47 0,54


100%|██████████| 53/53 [00:12<00:00,  4.28it/s]
100%|██████████| 53/53 [00:12<00:00,  4.30it/s]
100%|██████████| 53/53 [00:12<00:00,  4.27it/s]
100%|██████████| 42/42 [00:10<00:00,  4.20it/s]
100%|██████████| 42/42 [00:10<00:00,  4.18it/s]
100%|██████████| 68/68 [00:15<00:00,  4.33it/s]
100%|██████████| 452/452 [01:50<00:00,  4.10it/s]
100%|██████████| 87/87 [00:20<00:00,  4.18it/s]
100%|██████████| 421/421 [01:31<00:00,  4.62it/s]
100%|██████████| 53/53 [00:12<00:00,  4.28it/s]
100%|██████████| 1644/1644 [07:32<00:00,  3.63it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> (and|or) (any|some) other <hyper>
0,35 0,36 0,75 0,47 0,79 0,84 0,67 0,60 0,42 0,58


100%|██████████| 53/53 [00:07<00:00,  7.48it/s]
100%|██████████| 53/53 [00:07<00:00,  7.54it/s]
100%|██████████| 53/53 [00:07<00:00,  7.44it/s]
100%|██████████| 42/42 [00:05<00:00,  7.28it/s]
100%|██████████| 42/42 [00:05<00:00,  7.20it/s]
100%|██████████| 68/68 [00:08<00:00,  7.73it/s]
100%|██████████| 452/452 [01:05<00:00,  6.95it/s]
100%|██████████| 87/87 [00:12<00:00,  7.20it/s]
100%|██████████| 421/421 [00:48<00:00,  8.73it/s]
100%|██████████| 53/53 [00:07<00:00,  7.47it/s]
100%|██████████| 1644/1644 [04:48<00:00,  5.70it/s]
  0%|          | 0/53 [00:00<?, ?it/s]

gpt2-xl   <hypo> and any other <hyper>
0,40 0,35 0,84 0,47 0,84 0,93 0,70 0,64 0,53 0,63


100%|██████████| 53/53 [00:07<00:00,  7.48it/s]
100%|██████████| 53/53 [00:07<00:00,  7.53it/s]
100%|██████████| 53/53 [00:07<00:00,  7.44it/s]
100%|██████████| 42/42 [00:05<00:00,  7.28it/s]
100%|██████████| 42/42 [00:05<00:00,  7.20it/s]
100%|██████████| 68/68 [00:08<00:00,  7.73it/s]
  9%|▉         | 41/452 [00:05<00:57,  7.20it/s]

In [5]:
import codecs
import json
model_name = 'gpt2-xl'
models = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
prompt_eval_data = {}
model_eval_data = {}
for model_name in models:
    for prompt_name in sorted(PROMPTS.keys()):
        pattern = PROMPTS[prompt_name]
        #hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')
        #res = all_evaluations(hs_model)
        print(model_name, ' ', pattern)
        

        #res['pattern'] = pattern
        with codecs.open(f'output/{model_name}_{prompt_name}.json', 'r', 'utf-8') as file_descr:
            res = json.load(file_descr)
            table_data, mean = print_res_table(res, True)
            if prompt_name not in prompt_eval_data:
                prompt_eval_data[prompt_name] = []
            prompt_eval_data[prompt_name].append(mean)

            if model_name not in model_eval_data:
                model_eval_data[model_name] = []
            model_eval_data[model_name].append(mean)





gpt2   <hyper> is more general than <hypo>
gpt2   <hypo> which is a (example|class|kind|. . . ) of <hyper>
gpt2   <hypo> or some other <hyper>
gpt2   <hypo> which is called <hyper>
gpt2   <hypo> a special case of <hyper>
gpt2   <hypo> is an <hyper> that
gpt2   (Unlike|like) (most|all|any|other) <hyper>, <hypo>
gpt2   unlike most <hyper>, <hypo>
gpt2   unlike all <hyper>, <hypo>
gpt2   unlike any <hyper>, <hypo>
gpt2   unlike other <hyper>, <hypo>
gpt2   like most <hyper>, <hypo>
gpt2   <hypo> which is a example of <hyper>
gpt2   like all <hyper>, <hypo>
gpt2   like any <hyper>, <hypo>
gpt2   like other <hyper>, <hypo>
gpt2   <hyper> including <hypo>
gpt2   <hypo> which is a class of <hyper>
gpt2   <hypo> which is a kind of <hyper>
gpt2   <hypo> which is a type of <hyper>
gpt2   <hypo> (and|or) (any|some) other <hyper>
gpt2   <hypo> and any other <hyper>
gpt2   <hypo> and some other <hyper>
gpt2   <hypo> or any other <hyper>
gpt2   <hypo> is more specific than <hyper>
gpt2   <hypo> is a

In [6]:
prompt_eval_data = sorted([[name, np.mean(data)] for name, data in prompt_eval_data.items()], key=lambda x: -x[1])
prompt_eval_data

[['hyper10', 0.6302596102485258],
 ['hyper13', 0.629544741055069],
 ['hyper9', 0.6276047237551061],
 ['hyper7', 0.6099226336488329],
 ['hyper21', 0.575173407484378],
 ['type', 0.5703122006602911],
 ['hyper6', 0.5578845267164985],
 ['hyper3', 0.5510195780118324],
 ['hyper17', 0.5473959547580849],
 ['hyper18', 0.5409707686995588],
 ['hyper8', 0.5400540871717074],
 ['hyper22', 0.5004303243717174],
 ['hyper19', 0.5002340698550253],
 ['hyper15', 0.49728501676016545],
 ['hyper5', 0.495150267138872],
 ['hyper20', 0.4894568597995863],
 ['hyper23', 0.4857014113642769],
 ['hyper2', 0.4856964757865996],
 ['hyper4', 0.4725073595139392],
 ['hyper16', 0.4696370338777007],
 ['hyper1', 0.4578795891479095],
 ['hyper14', 0.3886651891915355],
 ['spec', 0.37414205057870553],
 ['hyper12', 0.37338586050341005],
 ['gen', 0.36884028291939164],
 ['hyper11', 0.3370938709792304]]

In [7]:
model_eval_data = sorted([[name, np.mean(data)] for name, data in model_eval_data.items()], key=lambda x: -x[1])
model_eval_data

[['gpt2-xl', 0.5216751845144054],
 ['gpt2-large', 0.5112516347244956],
 ['gpt2-medium', 0.4948383896321819],
 ['gpt2', 0.48396523635937094]]

In [8]:
top_prompts = prompt_eval_data[:6]
top_prompts

[['hyper10', 0.6302596102485258],
 ['hyper13', 0.629544741055069],
 ['hyper9', 0.6276047237551061],
 ['hyper7', 0.6099226336488329],
 ['hyper21', 0.575173407484378],
 ['type', 0.5703122006602911]]

In [None]:
PROMPTS = {
    'gen': "<hyper> is more general than <hypo>",
    'spec': "<hypo> is more specific than <hyper>",
    'type': "<hypo> is a type of <hyper>",
    'hyper1': "<hypo> which is a (example|class|kind|. . . ) of <hyper>",
    'hyper2': "<hypo> which is a example of <hyper>",
    'hyper3': "<hypo> which is a class of <hyper>",
    'hyper4': "<hypo> which is a kind of <hyper>",
    'hyper5': "<hypo> which is a type of <hyper>",
    'hyper6': "<hypo> (and|or) (any|some) other <hyper>",
    'hyper7': "<hypo> and any other <hyper>",
    'hyper8': "<hypo> and some other <hyper>",
    'hyper9': "<hypo> or any other <hyper>",
    'hyper10': "<hypo> or some other <hyper>",
    'hyper11': "<hypo> which is called <hyper>",
    'hyper12': "<hypo> a special case of <hyper>",
    'hyper13': "<hypo> is an <hyper> that",
    'hyper14': "(Unlike|like) (most|all|any|other) <hyper>, <hypo>",
    'hyper15': "unlike most <hyper>, <hypo>",
    'hyper16': "unlike all <hyper>, <hypo>",
    'hyper17': "unlike any <hyper>, <hypo>",
    'hyper18': "unlike other <hyper>, <hypo>",
    'hyper19': "like most <hyper>, <hypo>",
    'hyper20': "like all <hyper>, <hypo>",
    'hyper21': "like any <hyper>, <hypo>",
    'hyper22': "like other <hyper>, <hypo>",
    'hyper23': "<hyper> including <hypo>",
}

In [4]:
model_name = 'EleutherAI/gpt-neo-1.3B'
scorer = HFLMScorer(model_name, 'cuda')

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


HBox(children=(IntProgress(value=0, description='Downloading', max=200, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=798156, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456356, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=90, style=ProgressStyle(description_width='…




In [4]:
top_prompts = ['hyper10', 'hyper13', 'hyper21', 'type']
top_prompts

['hyper10', 'hyper13', 'hyper21', 'type']

In [5]:
top_prompts = ['hyper10', 'hyper13']
top_prompts

['hyper10', 'hyper13']

In [6]:

for i in range(2, len(top_prompts) + 1):
    prompt_names = [p for p in top_prompts[:i]]
    patterns = [PROMPTS[p] for p in prompt_names]
    hs_model = GPTHypernymySuiteModel(scorer, patterns, 'data')
    res = all_evaluations(hs_model)
    print(model_name, ' ', '|'.join(patterns))
    print(print_res_table(res))

    res['pattern'] = prompt_names
    prompt_name = '|'.join(prompt_names)
    mn = model_name.replace('/', '-')
    with codecs.open(f'output/{mn}_{prompt_name}.json', 'w', 'utf-8') as file_descr:
        json.dump(res, file_descr)


100%|██████████| 53/53 [00:06<00:00,  8.23it/s]
100%|██████████| 53/53 [00:06<00:00,  8.28it/s]
100%|██████████| 53/53 [00:06<00:00,  8.34it/s]
100%|██████████| 53/53 [00:06<00:00,  8.32it/s]
100%|██████████| 53/53 [00:06<00:00,  8.23it/s]
100%|██████████| 53/53 [00:06<00:00,  8.22it/s]
100%|██████████| 42/42 [00:05<00:00,  8.06it/s]
100%|██████████| 42/42 [00:05<00:00,  8.05it/s]
100%|██████████| 42/42 [00:05<00:00,  7.96it/s]
100%|██████████| 42/42 [00:05<00:00,  7.95it/s]
100%|██████████| 68/68 [00:08<00:00,  8.48it/s]
100%|██████████| 68/68 [00:08<00:00,  8.45it/s]
100%|██████████| 452/452 [00:58<00:00,  7.70it/s]
100%|██████████| 452/452 [00:59<00:00,  7.66it/s]
100%|██████████| 87/87 [00:11<00:00,  7.84it/s]
100%|██████████| 87/87 [00:11<00:00,  7.84it/s]
100%|██████████| 421/421 [00:41<00:00, 10.07it/s]
100%|██████████| 421/421 [00:41<00:00, 10.06it/s]
100%|██████████| 53/53 [00:06<00:00,  8.05it/s]
100%|██████████| 53/53 [00:06<00:00,  8.04it/s]
100%|██████████| 1644/1644 [04:1

EleutherAI/gpt-neo-1.3B   <hypo> or some other <hyper>|<hypo> is an <hyper> that
0,47 0,41 0,83 0,51 0,86 0,91 0,74 0,68 0,40 0,65


NameError: name 'codecs' is not defined

In [15]:
prompt_names

['h', 'h', 'h']

In [5]:
import codecs
import json

model_name = 'gpt2-xl'
for i in range(3, len(top_prompts) + 1):
    prompt_names = [p for p in top_prompts[:i]]
    patterns = [PROMPTS[p] for p in prompt_names]
    #hs_model = GPTHypernymySuiteModel(scorer, patterns, 'data')
    #res = all_evaluations(hs_model)
    print(model_name, ' ', '|'.join(patterns))
    #print(print_res_table(res))

    #res['pattern'] = prompt_names
    prompt_name = '|'.join(prompt_names)
    #with codecs.open(f'output/{model_name}_{prompt_name}.json', 'w', 'utf-8') as file_descr:
    #    json.dump(res, file_descr)
    with codecs.open(f'output/{model_name}_{prompt_name}.json', 'r', 'utf-8') as file_descr:
        res = json.load(file_descr)
        table_data, mean = print_res_table(res, True)
    print(table_data)

gpt2-xl   <hypo> or some other <hyper>|<hypo> is an <hyper> that|like any <hyper>, <hypo>
0,52 0,43 0,86 0,53 0,87 0,92 0,74 0,70 0,54 0,68
gpt2-xl   <hypo> or some other <hyper>|<hypo> is an <hyper> that|like any <hyper>, <hypo>|<hypo> is a type of <hyper>
0,51 0,41 0,87 0,51 0,86 0,93 0,73 0,69 0,56 0,67


In [None]:
import codecs
import json

for prompt_name in PROMPTS:
    pattern = PROMPTS[prompt_name]
    hs_model = GPTHypernymySuiteModel(scorer, pattern, 'data')
    res = all_evaluations(hs_model)
    print(model_name, ' ', pattern)
    print(print_res_table(res))

    res['pattern'] = pattern
    with codecs.open(f'output/{model_name}_{prompt_name}.json', 'w', 'utf-8') as file_descr:
        json.dump(res, file_descr)