In [1]:
import spacy
import utils

from collections import defaultdict
from joblib import Parallel, delayed
from multiprocessing import Manager
from tqdm import tqdm

from spacy.tokens import Doc

In [2]:
# Read the BabyLM 100M training split. Later cells index and slice `corpus`,
# and the entries they display are plain strings.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
corpus = utils.read_file("/home/km55359/rawdata/babylm_data/babylm_100M/babylm_100M_train.txt")

In [3]:
# Hand-written probe sentences (plural noun + numeral, often with a singular verb).
# They live under their own name so the corpus sample below does not silently
# discard them; previously both assignments targeted `mini_corpus` and only the
# second one survived.
example_sentences = [
    "A few days should be enough for that task!",
    "A hundred dollars is a lot of money.",
    "A few hundred dollars is all we need for this mission.",
    "The hundred dollars I spent ended up being a lot of money.",
    "Five days is a long time to wait for a response.",
    "Seven weeks of thesis writing takes a toll on one's body.",
    "A year is a long time to wait.",
    "Five people is a not a huge number of attendees.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Five people is a not a huge number of attendees.",
    "A dozen eggs is too many.",
    "A dozen eggs was too many.",
    "All it took was seven days.",
    "He was going to the bathroom, and all he took was seven minutes.",
    "7 days is what it took for him to get over it.",
    "He takes 7 days to get over it."
]

# Working sample: the first 100,000 corpus lines, each paired with its corpus index
# so that hits can be traced back to `corpus[idx]`.
mini_corpus = list(enumerate(corpus[:100000]))

In [4]:
# Scratch check: the 100,000-line sample split 16 ways gives the per-batch size
# used by the batching cell (6250).
100000/16

6250.0

In [5]:
# Load the spaCy pipeline used throughout; later cells read `tag_`, `dep_`,
# `head` and `morph` from the tokens it produces.
model = spacy.load("en_core_web_sm")

In [6]:
def has_plural_noun(doc):
    """Return True if any token in `doc` is tagged as a plural noun.

    A token counts when its fine-grained tag (`tag_`) is "NNS" or "NNPS".
    An empty `doc` yields False.
    """
    plural_tags = ("NNS", "NNPS")
    return any(tok.tag_ in plural_tags for tok in doc)

def nummod(noun):
    """Return True if `noun` has a direct child attached as a numeric modifier.

    A child counts when its dependency label (`dep_`) is "nummod" or "quantmod".
    Only immediate children are inspected.
    """
    numeric_labels = ("nummod", "quantmod")
    return any(kid.dep_ in numeric_labels for kid in noun.children)

def singular_verb(noun):
    """Return True if the syntactic head of `noun` is morphologically singular.

    Reads the head's morphology as a dict and checks for `Number == "Sing"`.
    A head with no `Number` feature yields False. The head's part of speech is
    not inspected, so a singular non-verb head also returns True.
    """
    head_features = noun.head.morph.to_dict()
    return head_features.get("Number") == "Sing"

# Presumably the dependency labels of main interest for the plural noun.
# NOTE(review): not referenced by any visible cell.
noun_deps = ["nsubj", "attr"]

In [7]:
# Split `mini_corpus` into consecutive batches of `batch_size` entries
# (6250 = 100000 / 16); the final batch may be shorter.
# A negative `batch_size` disables grouping: every entry is appended on its own.
batch_size = 6250
batches, batch = [], []
for line in mini_corpus:
    if batch_size < 0:
        batches.append(line)
        continue
    batch.append(line)
    if len(batch) != batch_size:
        continue
    # batch is full: store it and start a fresh one
    batches.append(batch)
    batch = []
if len(batch) > 0:
    # keep the trailing partial batch
    batches.append(batch)

In [8]:
# Number of batches produced by the batching cell.
len(batches)

16

In [23]:
# Scan the whole corpus for plural nouns that (a) have a numeric modifier child
# and (b) hang off a morphologically singular head. For each line, only the
# first such noun is recorded, as (corpus index, noun text, head text), grouped
# by the noun's dependency label.
docs = model.pipe(corpus, batch_size = 2048)
sing_verb_deps = defaultdict(list)
# `model.pipe` yields docs in input order, so the enumerate index is the corpus index.
# Passing `total` lets tqdm show percentage/ETA for this multi-hour run.
for idx, doc in enumerate(tqdm(docs, total=len(corpus))):
    for token in doc:
        if token.tag_ not in ("NNS", "NNPS"):
            continue
        # both conditions must hold for the noun to be recorded
        if nummod(token) and singular_verb(token):
            sing_verb_deps[token.dep_].append((idx, token.text, token.head.text))
            break

10175732it [2:16:50, 1239.31it/s]


In [28]:
# Spot-check a single corpus line by index.
corpus[4112702]

"Let's study this seven limbs is good, so is eight limbs, nine limbs..."

In [32]:
# Hits whose plural noun carries the `nsubj` label; each entry is
# (corpus index, noun text, head text).
# NOTE(review): this displays the entire list; consider slicing for readability.
sing_verb_deps['nsubj']

[(40793, 'books', "'s"),
 (56611, 'balls', 'stays'),
 (63527, 'seconds', 'is'),
 (64292, 'seconds', 'is'),
 (99224, 'buckles', 'see'),
 (107525, 'ones', "'s"),
 (111495, 'legos', "'s"),
 (130378, 'elastics', "'s"),
 (157132, 'dogs', 'woof'),
 (204795, 'animals', 'is'),
 (227986, 'cookies', "'s"),
 (228006, 'bugs', "'s"),
 (228023, 'scoops', "'s"),
 (228062, 'hats', "'s"),
 (228066, 'dawns', "'s"),
 (228375, 'slices', 'is'),
 (234774, 'packets', "'m"),
 (236694, 'oops', "'s"),
 (238386, 'cups', 'is'),
 (253157, 'melts', 'takes'),
 (268688, 'lumps', 'is'),
 (281203, 'dollars', 'rings'),
 (295892, 'reds', 'goes'),
 (295922, 'blues', "'s"),
 (306305, 'weeks', 'is'),
 (313218, 'cars', "'s"),
 (316795, 'eggs', 'is'),
 (321251, 'cowboys', "'s"),
 (321253, 'nuggets', "'s"),
 (336673, 'feet', 'is'),
 (346233, 'ears', 'does'),
 (347528, 'reasons', 'is'),
 (352673, 'chairs', "'s"),
 (401979, 'pigs', "'s"),
 (417144, 'ducks', 'is'),
 (457521, 'chairs', "'s"),
 (463112, 'pennies', "'s"),
 (484790, 

In [34]:
# Export the singular-verb dependency hits as a CSV under data/babylm-analysis.
# Expects `sing_verb_deps` to be the dict-of-lists form (label -> [(idx, noun, verb), ...]).
import pandas as pd

# flatten to one (dep, idx, noun, verb) row per hit
sv_list = [
    (dep, idx, noun, verb)
    for dep, hits in sing_verb_deps.items()
    for idx, noun, verb in hits
]

# tabulate the rows
sv_df = pd.DataFrame(sv_list, columns=["dep", "idx", "noun", "verb"])

# write to disk without the row index
sv_df.to_csv("../data/babylm-analysis/singular-verb-dependencies.csv", index=False)

In [14]:
# NOTE(review): `sing_verb_deps_2` is not assigned in any visible cell, so this
# cell depends on leftover kernel state and will fail on a fresh run.
sing_verb_deps_2

[(6410, 'attr', 'minutes', "'s"),
 (8206, 'npadvmod', 'minutes', "'s"),
 (8273, 'attr', 'minutes', 'is'),
 (11703, 'attr', 'outfits', "'s"),
 (12424, 'dobj', 'feet', 'khkhkhkhkh'),
 (13229, 'attr', 'times', "'s"),
 (16015, 'attr', 'cups', 'was'),
 (22817, 'attr', 'months', "'s"),
 (24291, 'dobj', 'downstairs', 'needs'),
 (24957, 'attr', 'ones', "'s"),
 (27388, 'conj', 'minutes', "o'clock"),
 (35810, 'attr', 'birds', 'is'),
 (40793, 'nsubj', 'books', "'s"),
 (43817, 'attr', 'inches', "'m"),
 (44884, 'attr', 'minutes', "'s"),
 (47581, 'appos', 'blocks', 'block'),
 (53343, 'attr', 'shoes', "'s"),
 (53666, 'dobj', 'spoons', 'has'),
 (53690, 'dobj', 'spoons', 'spoons'),
 (53719, 'attr', 'spoons', "'s"),
 (56611, 'nsubj', 'balls', 'stays'),
 (57178, 'appos', 'kids', 'kittie'),
 (57190, 'appos', 'chicks', 'kittie'),
 (57206, 'appos', 'pups', 'kittie'),
 (59206, 'dobj', 'spiders', 'spots'),
 (59215, 'dobj', 'ducks', 'counts'),
 (63527, 'nsubj', 'seconds', 'is'),
 (64292, 'nsubj', 'seconds', 'i

In [9]:
# Shared list that the parallel workers append their hits to.
# NOTE(review): this rebinds `sing_verb_deps`, which other cells build as a
# defaultdict keyed by dependency label; the two shapes are not interchangeable.
sing_verb_deps = Manager().list()

def nouns_with_singular_verbs(batches, manager):
    """Parse one batch of lines and record plural nouns with a numeral and a singular head.

    Args:
        batches: iterable of ``(idx, text)`` pairs. Despite the plural name this
            is a single batch; each `text` is parsed with the global `model`.
        manager: list-like collector; hits are added with ``manager.append``.

    For every line, the first plural noun (tag "NNS"/"NNPS") for which both
    `nummod` and `singular_verb` hold is appended as
    ``(idx, dep label, noun text, head text)``; remaining tokens of that line
    are skipped. Returns None.
    """
    for idx, text in batches:
        for token in model(text):
            if token.tag_ not in ("NNS", "NNPS"):
                continue
            # both conditions must hold for the noun to be recorded
            if nummod(token) and singular_verb(token):
                manager.append((idx, token.dep_, token.text, token.head.text))
                break

# Fan the batches out over 8 worker threads; each call appends its hits to the
# shared `sing_verb_deps`. The return values are discarded.
pbar = tqdm(batches)
jobs = (delayed(nouns_with_singular_verbs)(entry, sing_verb_deps) for entry in pbar)
_ = Parallel(n_jobs=8, prefer="threads")(jobs)

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [02:57<00:00, 11.09s/it]


In [None]:
# Number of hits collected in the shared list.
len(sing_verb_deps)

In [6]:
# Fresh shared list for the per-line parallel run below; discards whatever
# `sing_verb_deps` held before.
sing_verb_deps = Manager().list()

def nouns_with_singular_verbs(inp, manager):
    """Parse one line and record its first plural noun with a numeral and a singular head.

    NOTE(review): this redefines a function of the same name that takes a whole
    batch instead of a single entry; whichever cell ran last wins.

    Args:
        inp: an ``(idx, text)`` pair; `text` is parsed with the global `model`.
        manager: list-like collector; a hit is added with ``manager.append``.

    The first token tagged "NNS"/"NNPS" for which both `nummod` and
    `singular_verb` hold is appended as ``(idx, dep label, noun text, head text)``
    and scanning stops. Returns None.
    """
    idx, text = inp
    for token in model(text):
        if token.tag_ not in ("NNS", "NNPS"):
            continue
        # both conditions must hold for the noun to be recorded
        if nummod(token) and singular_verb(token):
            manager.append((idx, token.dep_, token.text, token.head.text))
            break

# Submit one job per `mini_corpus` entry across 2 worker threads; each call
# appends its hit (if any) to the shared `sing_verb_deps`.
pbar = tqdm(mini_corpus)
jobs = (delayed(nouns_with_singular_verbs)(entry, sing_verb_deps) for entry in pbar)
_ = Parallel(n_jobs=2, prefer="threads")(jobs)

  2%|███▏                                                                                                                                                                                                        | 1557/100000 [00:04<04:59, 328.70it/s]
KeyboardInterrupt



In [151]:
# Wall-clock notes from earlier runs (presumably keyed by worker or batch count; TODO confirm):
#   16 -> 6:47
#   32 -> (not recorded)
# Slice the shared list to display its entries: (idx, dep label, noun text, head text).
sing_verb_deps[:]

[(6410, 'attr', 'minutes', "'s"),
 (8206, 'npadvmod', 'minutes', "'s"),
 (8273, 'attr', 'minutes', 'is'),
 (11703, 'attr', 'outfits', "'s"),
 (12424, 'dobj', 'feet', 'khkhkhkhkh'),
 (13229, 'attr', 'times', "'s"),
 (16015, 'attr', 'cups', 'was'),
 (22817, 'attr', 'months', "'s"),
 (24291, 'dobj', 'downstairs', 'needs'),
 (24957, 'attr', 'ones', "'s"),
 (27388, 'conj', 'minutes', "o'clock"),
 (35810, 'attr', 'birds', 'is'),
 (40793, 'nsubj', 'books', "'s"),
 (43817, 'attr', 'inches', "'m"),
 (44884, 'attr', 'minutes', "'s"),
 (47581, 'appos', 'blocks', 'block'),
 (53343, 'attr', 'shoes', "'s"),
 (53666, 'dobj', 'spoons', 'has'),
 (53690, 'dobj', 'spoons', 'spoons'),
 (53719, 'attr', 'spoons', "'s"),
 (56611, 'nsubj', 'balls', 'stays'),
 (57178, 'appos', 'kids', 'kittie'),
 (57190, 'appos', 'chicks', 'kittie'),
 (57206, 'appos', 'pups', 'kittie'),
 (59206, 'dobj', 'spiders', 'spots'),
 (59215, 'dobj', 'ducks', 'counts'),
 (63527, 'nsubj', 'seconds', 'is'),
 (64292, 'nsubj', 'seconds', 'i

In [152]:
# Look up the line behind one reported `nsubj` hit (index 99224, noun 'buckles').
corpus[99224]

'look honey that one buckles to this one see .'

In [139]:
# Print the corpus index of every line containing the literal phrase "five minutes".
needle = "five minutes"
for position, sentence in enumerate(corpus):
    if needle not in sentence:
        continue
    print(position)

11951
22512
27388
57552
59808
60047
65184
71096
73543
159255
166929
168701
168702
170883
171526
175866
196454
205004
226422
228639
283159
306946
310150
392517
397902
402968
406103
406104
408759
471797
472882
487308
487310
500328
500329
502784
502785
509788
557943
567339
575286
577709
590965
590966
593838
601899
619773
627484
627493
650214
650216
654188
654191
660437
662310
663542
714436
720739
725398
729504
729505
734730
736615
741687
741691
754489
767570
767572
768593
769093
776709
776721
787777
789027
792337
793836
796211
796922
796964
799972
799975
807269
814602
815289
818587
822225
822870
823018
824759
825146
825813
827691
828250
832870
833219
833754
838573
838691
838692
838693
839312
840572
840856
841424
841741
841766
842057
842061
846128
846946
848578
851630
855156
855342
857849
867444
872294
872893
875967
875973
875990
879570
879838
884099
885079
889306
889312
894474
894478
894480
902255
914692
916450
917556
920467
921703
922936
924570
924573
925482
930977
934043
934263
936088
9

In [None]:
# TODO: store all plural nouns (not yet implemented)

In [115]:
# Serial scan of `mini_corpus`: for each line, record the first plural noun that
# has a numeric modifier child and a morphologically singular head, grouped by
# the noun's dependency label as (idx, noun text, head text).
sing_verb_deps = defaultdict(list)

for idx, entry in enumerate(mini_corpus):
    # `mini_corpus` has been built both as raw strings and as (corpus index, text)
    # pairs; accept either so the cell also runs against the paired form.
    if isinstance(entry, tuple):
        idx, line = entry
    else:
        line = entry
    doc = model(line)
    for token in doc:
        if token.tag_ not in ("NNS", "NNPS"):
            continue
        # both conditions must hold for the noun to be recorded
        if nummod(token) and singular_verb(token):
            sing_verb_deps[token.dep_].append((idx, token.text, token.head.text))
            break

In [118]:
# Look up the line behind the `npadvmod` hit at index 8206.
# NOTE(review): the displayed value is a plain string, so this ran while
# `mini_corpus` held raw lines rather than (idx, line) pairs.
mini_corpus[8206]

"five more minutes and then it's bath time okay ?"

In [116]:
# Show all hits grouped by dependency label.
sing_verb_deps

defaultdict(list,
            {'attr': [(6410, 'minutes', "'s"), (8273, 'minutes', 'is')],
             'npadvmod': [(8206, 'minutes', "'s")]})

In [47]:
# List the direct dependents of the third token with their dependency labels.
# Uses whatever `doc` is currently bound in the kernel.
third_token = doc[2]
for child in third_token.children:
    print(child, child.dep_)

dozen nummod


In [39]:
# Morphological features of the third token of the currently bound `doc`, as a dict.
doc[2].morph.to_dict()

{'Number': 'Plur'}

In [72]:
# Morphological features of that token's syntactic head; `singular_verb` reads
# the `Number` entry of this dict.
doc[2].head.morph.to_dict()

{'Mood': 'Ind',
 'Number': 'Sing',
 'Person': '3',
 'Tense': 'Past',
 'VerbForm': 'Fin'}

In [74]:
# Check `has_morph()` on the fifth token of the currently bound `doc`
# (presumably whether it carries morphological annotation; confirm against spaCy docs).
doc[4].has_morph()

True