In [80]:
import json
import re
import sys

import jsonlines
import pandas

# Input/output locations used throughout the notebook. Every `...` is a
# placeholder that has to be replaced with a real path before running.

# CSV with the BLiMP vocabulary; read below with pandas.read_csv and expected
# to provide the 'expression' and 'category_2' columns.
vocab_path = ...
tinystories_path = ... # Insert path to one of the JSON files with prompts and info.

# JSON-lines file with the BLiMP agreement cases; read below with jsonlines.open.
blimp_agreement_1 = ...
# Output file for the cases selected by the first attempt.
blimp_agreement_1_new = ...
# Output file for the cases selected by the second attempt.
blimp_agreement_1_newer = ...

## First Attempt

We want to generate a new test set for verb agreement. Steps:

* Get the verbs from TinyStories
* Get the nouns from TinyStories
* Get the adjectives from TinyStories
* Get the vocab from BLiMP
* Find the intersection of the TinyStories and BLiMP vocabularies

In [2]:
# Load the TinyStories JSON.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open(tinystories_path, 'r', encoding='utf-8') as f:
    d = json.load(f)

# Get the list of word lists (one list of prompt words per record)
wordlist = [record.get('instruction').get('words') for record in d]

# Separate into verbs, nouns, adjectives.
# NOTE(review): assumes every word list is ordered (verb, noun, adjective)
# and has at least three entries -- confirm against the dataset.
verbset = {words[0] for words in wordlist}
nounset = {words[1] for words in wordlist}
adjset = {words[2] for words in wordlist}
verbs = list(verbset)
nouns = list(nounset)
adjs = list(adjset)

In [4]:
# Read the BLiMP vocabulary table
vocab = pandas.read_csv(vocab_path)

# Rows whose category_2 contains 'V' are kept as verbs; rows with a missing
# category count as non-matches.
is_verb = vocab['category_2'].str.contains('V', na=False)
verbs = vocab.loc[is_verb, ['expression']]
verblist = verbs['expression'].tolist()

# Same selection for nouns, keyed on 'N'.
is_noun = vocab['category_2'].str.contains('N', na=False)
nouns = vocab.loc[is_noun, ['expression']]
nounlist = nouns['expression'].tolist()

In [27]:
# Bare verb forms that occur both in the BLiMP vocab and in the TinyStories prompts
newverbs = {item for item in verblist if item in verbset}

# Cheap way to get inflected forms: keep every BLiMP verb whose first three
# characters equal the first three characters of one of the shared verbs
roots = {verb[:3] for verb in newverbs}
totalverbs = list({verb for verb in verblist if verb[:3] in roots})

In [16]:
# Read every BLiMP case from the JSON-lines file into a list
dictlist = []
with jsonlines.open(blimp_agreement_1) as blimp_reader:
    dictlist.extend(blimp_reader)

In [50]:
# Keep the BLiMP cases whose grammatical sentence contains one of the selected verbs.
# Verbs are matched on word boundaries instead of as raw substrings, so that a
# short verb such as "eat" no longer selects sentences that merely contain
# "great" or "theatre". Multi-word vocabulary entries still match as a phrase.
if totalverbs:
    verb_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(verb) for verb in totalverbs) + r')\b'
    )
    goodcases = [
        case for case in dictlist
        if verb_pattern.search(case['sentence_good'].lower())
    ]
else:
    # With no verbs selected nothing can match; an empty alternation would
    # otherwise match at every word boundary.
    goodcases = []

In [56]:
# Save the cases selected by the first attempt as JSON lines
with jsonlines.open(blimp_agreement_1_new, mode='w') as out_file:
    out_file.write_all(goodcases)

## Second Attempt

* Use a part-of-speech tagger to find the most frequent verbs in TinyStories
* Select the subset of BLiMP sentences that contain those verbs

In [4]:
# Load the TinyStories JSON.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open(tinystories_path, 'r', encoding='utf-8') as f:
    d = json.load(f)

# Get the list of story texts (one entry per record)
stories = [record.get('story') for record in d]

In [22]:
import spacy
from tqdm import tqdm

In [26]:
!pip install 'spacy[apple]'

Collecting thinc-apple-ops<2.0.0,>=1.0.0 (from spacy[apple])
  Downloading thinc_apple_ops-1.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.6 kB)
Downloading thinc_apple_ops-1.0.0-cp311-cp311-macosx_11_0_arm64.whl (156 kB)
Installing collected packages: thinc-apple-ops
Successfully installed thinc-apple-ops-1.0.0


In [27]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
# Load the "en_core_web_sm" spaCy pipeline fetched by the download cell above;
# it is applied to every story below to obtain part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [28]:
# Run the spaCy pipeline over every story.
# nlp.pipe processes the texts in batches, which is the recommended (and much
# faster) way to handle many texts compared with calling nlp() once per story.
# Appending inside the loop keeps the documents parsed so far available if the
# cell is interrupted.
doc = []
for parsed_story in tqdm(nlp.pipe(stories), total=len(stories)):
    doc.append(parsed_story)
        

 20%|██        | 20085/100000 [03:40<14:37, 91.07it/s] 


KeyboardInterrupt: 

In [41]:
# Peek at the verbatim text of the first token of the first parsed story
doc[0][0].text

'Once'

In [51]:
# Collect the verbatim text of every token that spaCy tagged as a verb
verbs = [
    token.text
    for parsed_story in doc
    for token in parsed_story
    if token.pos_ == 'VERB'
]

In [55]:
from collections import Counter

In [106]:
# Count how often each verb was tagged in the TinyStories stories.
# The counter has to be built from `verbs` (the spaCy-tagged tokens), not from
# `verblist`, which holds the BLiMP vocabulary and says nothing about frequency.
# Tokens are lowercased because the BLiMP sentences are lowercased before
# matching, so "Run" and "run" must count as the same verb.
verb_count = Counter(verb.lower() for verb in verbs)

# The 200 most frequent verbs, most frequent first (an empty tuple when no
# verbs were found, instead of failing on the unpack).
common_verbs = tuple(verb for verb, _count in verb_count.most_common(200))

In [119]:
# Sanity check: is any common verb one of the whitespace-separated tokens of this sample sentence?
not set('Every stimulus disgusts Janet.'.split()).isdisjoint(common_verbs)

False

In [110]:
# Read the BLiMP vocabulary table
vocab = pandas.read_csv(vocab_path)

# Rows whose category_2 contains 'V' are kept as verbs; rows with a missing
# category count as non-matches.
is_verb = vocab['category_2'].str.contains('V', na=False)
verbs = vocab.loc[is_verb, ['expression']]
verblist = verbs['expression'].tolist()

In [111]:
# Read every BLiMP case from the JSON-lines file into a list
dictlist = []
with jsonlines.open(blimp_agreement_1) as blimp_reader:
    dictlist.extend(blimp_reader)

In [120]:
# Keep the BLiMP cases whose grammatical sentence contains one of the common verbs.
# Matching on word boundaries (instead of comparing against str.split() tokens)
# means a verb directly followed by punctuation, e.g. "barks." at the end of a
# sentence, is still recognised. The pattern is compiled once rather than
# re-splitting the sentence for every verb.
if common_verbs:
    verb_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(verb) for verb in common_verbs) + r')\b'
    )
    goodcases = [
        case for case in dictlist
        if verb_pattern.search(case['sentence_good'].lower())
    ]
else:
    # With no verbs selected nothing can match; an empty alternation would
    # otherwise match at every word boundary.
    goodcases = []

In [122]:
# Save the cases selected by the second attempt as JSON lines
with jsonlines.open(blimp_agreement_1_newer, mode='w') as out_file:
    out_file.write_all(goodcases)