In [1]:
import json
import re
import pathlib
import csv

from minicons import scorer
from tqdm import tqdm

from collections import defaultdict

In [2]:
# read jsonl
def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file (anything ``open`` accepts).

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, so non-ASCII text loads the same way on every machine.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra empty line at the end of the
            # file), which would otherwise make json.loads raise.
            if line.strip():
                records.append(json.loads(line))
    return records

def find_and_split(sentence, target, condition):
    """Split ``sentence`` just before an occurrence of ``target``.

    Args:
        sentence: Full sentence text.
        target: Literal word or punctuation mark to locate. It is matched
            verbatim (regex metacharacters are escaped). A word boundary is
            required on each side where the target begins/ends with a word
            character, so "at" does not match inside "that".
        condition: Condition name. If it contains ``"pipp"`` or equals
            ``"no_filler_gap"``, the first occurrence of ``target`` is used;
            otherwise the last occurrence is used.

    Returns:
        A ``(prefix, target)`` tuple where ``prefix`` is the text preceding
        the chosen occurrence, with surrounding whitespace stripped.

    Raises:
        IndexError: If ``target`` does not occur in ``sentence``.
    """
    # Escape the target so characters such as "." or "$" are matched literally
    # rather than being interpreted as regex syntax.
    search_query = re.escape(target)
    # \b is only meaningful next to a word character; punctuation targets such
    # as "." or "," are therefore matched without boundary anchors.
    if re.fullmatch(r"\w", target[:1]):
        search_query = r"\b" + search_query
    if re.fullmatch(r"\w", target[-1:]):
        search_query = search_query + r"\b"

    use_first_match = "pipp" in condition or condition == "no_filler_gap"

    matches = list(re.finditer(search_query, sentence))
    if not matches:
        # Same exception type as indexing an empty list, but with a message
        # that identifies the offending stimulus.
        raise IndexError(f"target {target!r} not found in sentence {sentence!r}")

    split_at = matches[0 if use_first_match else -1].start()
    return sentence[:split_at].strip(), target

# Load the PiPP stimulus items (one JSON object per line).
materials_path = '../../data/pipps/materials.jsonl'
pipps = read_jsonl(materials_path)

In [3]:
# Group the items by preposition, keeping items without an embedding clause
# separate from items that have one.
pipps_organized = defaultdict(list)
pipps_embedding_organized = defaultdict(list)

for entry in pipps:
    has_embedding = entry['embedding'] != ""
    bucket = pipps_embedding_organized if has_embedding else pipps_organized
    bucket[entry['preposition']].append(entry)

In [6]:
# Spot-check: entry at position 32 of the 'though' items that have no embedding clause.
pipps_organized['though'][32]

{'idx': 32,
 'item_num': 33,
 'preposition': 'though',
 'embedding': '',
 'pipp_filler_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, frantic though it may have seemed.',
  'target': '.'},
 'pp_no_filler_no_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, though it may have seemed frantic.',
  'target': 'frantic'},
 'filler_no_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, frantic though it may have seemed frantic.',
  'target': 'frantic'},
 'no_filler_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, though it may have seemed.',
  'target': '.'}}

In [4]:
# Quick check of find_and_split with a sentence-final "." target. The condition
# string 'pipps' contains "pipp", so the first match of the target is used.
find_and_split("The winner was chosen quickly, as dubious as the decision was.", ".", 'pipps')

('The winner was chosen quickly, as dubious as the decision was', '.')

In [4]:
# Identifier of the model to evaluate (another option tried here: "gpt2").
model_name = "kanishka/smolm-autoreg-bpe-babylm-1e-3"

# Load the incremental LM scorer on device "cuda:1".
lm = scorer.IncrementalLMScorer(model_name, "cuda:1")

# Turn the identifier into a file-name stem for the results CSV: drop the
# local-models prefix and the "kanishka/" namespace, then replace any
# remaining slashes with underscores.
for old, new in (("../smolm/models/", ""), ("kanishka/", ""), ("/", "_")):
    model_name = model_name.replace(old, new)

model_name

OSError: kanishaka/smolm-autoreg-bpe-babylm-1e-3 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [10]:
# Score the target of every condition sentence in every item and store the
# result in place under that condition's 'score' key.
metadata_keys = {'idx', 'item_num', 'embedding', 'preposition'}


def _negated_mean(values):
    """Return the negated mean of ``values`` along axis 0 as a Python float."""
    # Presumably `values` holds per-token log2 probabilities of the target, making
    # this a mean surprisal in bits -- TODO confirm against the minicons docs.
    return -values.mean(0).item()


for pipp in tqdm(pipps):
    for condition, stimulus in pipp.items():
        # Only the condition entries carry a sentence/target pair.
        if condition in metadata_keys:
            continue
        target = stimulus['target']
        prefix, query = find_and_split(stimulus['sentence'], target, condition=condition)
        # Punctuation attaches directly to the prefix; words are joined by a space.
        sep = "" if target in (".", ",") else " "
        scores = lm.conditional_score(
            prefix,
            query,
            separator=sep,
            reduction=_negated_mean,
            base_two=True,
        )
        stimulus['score'] = scores[0]

100%|██████████| 198/198 [00:05<00:00, 33.08it/s]


In [1]:
# Display the full list of items.
# NOTE(review): the saved output is a NameError, i.e. this cell was last run
# in a kernel where `pipps` had not been loaded yet; re-run it after the loading cell.
pipps

NameError: name 'pipps' is not defined

In [15]:
# Inspect the scored item at index 34 (per the saved output, an item with an embedding clause).
pipps[34]

{'idx': 34,
 'item_num': 2,
 'preposition': 'though',
 'embedding': 'they said that we knew that',
 'pipp_filler_gap': {'sentence': 'Honorable though they said that we knew that their intentions were at the time, they were excluded.',
  'target': 'at',
  'score': 10.229843139648438},
 'pp_no_filler_no_gap': {'sentence': 'Though they said that we knew that their intentions were honorable at the time, they were excluded.',
  'target': 'honorable',
  'score': 6.029125690460205},
 'filler_no_gap': {'sentence': 'Honorable though they said that we knew that their intentions were honorable at the time, they were excluded.',
  'target': 'honorable',
  'score': 3.8941893577575684},
 'no_filler_gap': {'sentence': 'Though they said that we knew that their intentions were at the time, they were excluded.',
  'target': 'at',
  'score': 10.350200653076172}}

In [25]:
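# NOTE: disabled scratch cell. The output kept below is left over from an
# earlier execution of this cell; it is not produced by these comments.
# pipps[34]
# find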

{'idx': 34,
 'preposition': 'though',
 'embedding': 'they said that we knew that',
 'pipp_filler_gap': {'sentence': 'Honorable though they said that we knew that their intentions were at the time, they were excluded.',
  'target': 'at',
  'score': 6.969225883483887},
 'pp_no_filler_no_gap': {'sentence': 'Though they said that we knew that their intentions were honorable at the time, they were excluded.',
  'target': 'honorable',
  'score': 6.199956893920898},
 'filler_no_gap': {'sentence': 'Honorable though they said that we knew that their intentions were honorable at the time, they were excluded.',
  'target': 'honorable',
  'score': 6.201324462890625},
 'no_filler_gap': {'sentence': 'Though they said that we knew that their intentions were at the time, they were excluded.',
  'target': 'at',
  'score': 6.3501458168029785}}

In [16]:
# Cross-check of the scoring loop for item 34 ('pipp_filler_gap'): the value
# returned here is the negation of the 'score' stored for that condition,
# because the loop's reduction negates the mean.
lm.conditional_score("Honorable though they said that we knew that their intentions were", "at", base_two=True)

[-10.229843139648438]

In [17]:
# Check that the split lands on the standalone word "at" (first match, since the
# condition contains "pipp") and not on the "at" inside "that".
find_and_split("Honorable though they said that we knew that their intentions were at the time, they were excluded.", "at", "pipp")

('Honorable though they said that we knew that their intentions were', 'at')

In [11]:
def write_to_csv(results, filename):
    """Write one CSV row of per-condition scores for each item.

    Args:
        results: Iterable of item dicts. Each must have the keys ``idx``,
            ``item_num``, ``preposition`` and ``embedding``, plus a dict with
            a ``score`` entry under each of the four condition keys.
        filename: Destination path; an existing file is overwritten.

    Raises:
        KeyError: If an item lacks a metadata key, a condition, or a score.
        OSError: If the file cannot be opened for writing.
    """
    metadata_columns = ['idx', 'item_num', 'preposition', 'embedding']
    condition_columns = ['pipp_filler_gap', 'pp_no_filler_no_gap', 'filler_no_gap', 'no_filler_gap']

    # newline='' is required by the csv module: the writer emits its own
    # "\r\n" terminators, and without it text mode on Windows would turn them
    # into "\r\r\n" (blank rows). The encoding is pinned so the file does not
    # depend on the platform default.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(metadata_columns + condition_columns)
        for result in results:
            writer.writerow(
                [result[column] for column in metadata_columns]
                + [result[condition]['score'] for condition in condition_columns]
            )

In [12]:
# Make sure the results directory exists, then write this model's scores.
results_dir = pathlib.Path('../../data/results/pipps/')
results_dir.mkdir(parents=True, exist_ok=True)

write_to_csv(pipps, f'../../data/results/pipps/{model_name}.csv')

In [8]:
# Per-token scores (base_two=True) for an example sentence, for manual inspection.
lm.token_score("Happy though we were with the idea, we decided to move on.", base_two=True)

[[('Happy', 0.0),
  ('though', -14.23239517211914),
  ('we', -5.657886505126953),
  ('were', -3.171238899230957),
  ('with', -6.144097805023193),
  ('the', -2.1990251541137695),
  ('idea', -7.152507781982422),
  (',', -2.537522792816162),
  ('we', -1.4758175611495972),
  ('decided', -5.810355186462402),
  ('to', -0.955513596534729),
  ('move', -6.110415458679199),
  ('on', -2.3879406452178955),
  ('.', -2.4258055686950684)]]

In [37]:
# Scratch arithmetic: average of two negative values whose origin is not shown
# in this notebook -- TODO(review): document where they come from or remove.
(-12.507351875305176 -0.00291132228448987)/2

-6.255131598794833

In [13]:
# Inspect the scored item at index 32.
pipps[32]

{'idx': 32,
 'preposition': 'though',
 'embedding': '',
 'pipp_filler_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, frantic though it may have seemed.',
  'target': '.',
  'score': 1.3217829465866089},
 'pp_no_filler_no_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, though it may have seemed frantic.',
  'target': 'frantic',
  'score': 15.600930213928223},
 'filler_no_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, frantic though it may have seemed frantic.',
  'target': 'frantic',
  'score': 23.60569190979004},
 'no_filler_gap': {'sentence': 'The vacationers emphasized that the vacation was fun, though it may have seemed.',
  'target': '.',
  'score': 12.073512077331543}}

In [18]:
# Per-token scores for the 'filler_no_gap' sentence of item 32. In the saved
# output, the score of the final 'frantic' token is the negation of the 'score'
# stored for that condition.
lm.token_score("The vacationers emphasized that the vacation was fun, frantic though it may have seemed frantic.", base_two=True)

[[('The', 0.0),
  ('vacation', -16.906583786010742),
  ('ers', -5.41076135635376),
  ('emphasized', -15.571462631225586),
  ('that', -1.1680978536605835),
  ('the', -2.063772439956665),
  ('vacation', -9.088308334350586),
  ('was', -3.983067274093628),
  ('fun', -8.611307144165039),
  (',', -1.6558159589767456),
  ('frantic', -18.40604591369629),
  ('though', -12.963072776794434),
  ('it', -0.2623327076435089),
  ('may', -2.8368868827819824),
  ('have', -0.9901316165924072),
  ('seemed', -5.349593162536621),
  ('frantic', -23.60569190979004),
  ('.', -1.239820122718811)]]