In [1]:
import glob
import re
import spacy

from collections import defaultdict
from utils import read_babylm
from tqdm import tqdm

from joblib import Parallel, delayed
from multiprocessing import Manager

import pickle


In [19]:
# Load spaCy's small English pipeline; the cells below use it for tagging and dependency parsing.
nlp = spacy.load("en_core_web_sm")

# Hand-written probe sentences, most of them built around a quantified plural noun
# phrase ("five days", "a dozen eggs", dollar amounts); parsed below for inspection.
# NOTE(review): "Five people is a not a huge number of attendees." appears twice —
# confirm whether the duplicate is intentional.
sentences = [
    "A few days should be enough for that task!",
    "A hundred dollars is a lot of money.",
    "A few hundred dollars is all we need for this mission.",
    "The hundred dollars I spent ended up being a lot of money.",
    "Five days is a long time to wait for a response.",
    "Seven weeks of thesis writing takes a toll on one's body.",
    "A year is a long time to wait.",
    "Five people is a not a huge number of attendees.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Five people is a not a huge number of attendees.",
    "A dozen eggs is too many.",
]

In [20]:
def is_plural_noun(token):
    """Return True when `token` has coarse POS "NOUN" and fine-grained tag "NNS" or "NNPS"."""
    if token.pos_ != "NOUN":
        return False
    return token.tag_ in ("NNS", "NNPS")

def get_number(token):
    '''Get the grammatical number of the token from its morph representation.

    Returns "sg" when the token's Number feature contains "Sing", "pl" when it
    contains "Plur", and False when neither is present.
    '''
    raw = token.morph.get("Number")
    # morph.get() hands back a list of values (e.g. ["Sing"]); comparing that
    # list to a bare string never matches, so normalise to a list before testing.
    # A plain string is still accepted for backward compatibility.
    if isinstance(raw, str):
        values = [raw]
    else:
        values = list(raw or [])
    if "Sing" in values:
        return "sg"
    if "Plur" in values:
        return "pl"
    return False

In [24]:
# Debug dump: parse each probe sentence and print, per token, its direct children
# and its ancestors together with their dependency labels.
# NOTE(review): the filtering logic below is commented out (and incomplete at
# `head.:`), so `relevant_sentences` stays empty and `found` is never used.
relevant_sentences = []
for sentence in sentences:
    doc = nlp(sentence)
    found = False
    for token in doc:
        print(token.text, [(child.text, child.dep_) for child in token.children], [(ancestor.text, ancestor.dep_) for ancestor in token.ancestors])
        # print(token.text, token.pos_, token.dep_, token.head.text)
    #     head = token.head
    #     if token.dep_ == "amod" or token.dep_ == "nummod":
    #         if is_plural_noun(head) and head.:
    #             found=True
    #             relevant_sentences.append(sentence)
    #             # break
    #         # print(head.text, token.text, token.dep_, token.head.text)
    # Blank line between the dumps of consecutive sentences.
    print("")

A [] [('days', 'nsubj'), ('be', 'ROOT')]
few [] [('days', 'nsubj'), ('be', 'ROOT')]
days [('A', 'det'), ('few', 'amod')] [('be', 'ROOT')]
should [] [('be', 'ROOT')]
be [('days', 'nsubj'), ('should', 'aux'), ('enough', 'acomp'), ('!', 'punct')] []
enough [('for', 'prep')] [('be', 'ROOT')]
for [('task', 'pobj')] [('enough', 'acomp'), ('be', 'ROOT')]
that [] [('task', 'pobj'), ('for', 'prep'), ('enough', 'acomp'), ('be', 'ROOT')]
task [('that', 'det')] [('for', 'prep'), ('enough', 'acomp'), ('be', 'ROOT')]
! [] [('be', 'ROOT')]

A [] [('hundred', 'nummod'), ('dollars', 'nsubj'), ('is', 'ROOT')]
hundred [('A', 'quantmod')] [('dollars', 'nsubj'), ('is', 'ROOT')]
dollars [('hundred', 'nummod')] [('is', 'ROOT')]
is [('dollars', 'nsubj'), ('lot', 'attr'), ('.', 'punct')] []
a [] [('lot', 'attr'), ('is', 'ROOT')]
lot [('a', 'det'), ('of', 'prep')] [('is', 'ROOT')]
of [('money', 'pobj')] [('lot', 'attr'), ('is', 'ROOT')]
money [] [('of', 'prep'), ('lot', 'attr'), ('is', 'ROOT')]
. [] [('is', 'RO

In [11]:
# NOTE(review): the recorded output lists eight sentences, but the cell that builds
# this list has its `append` commented out, so a fresh run would display [].
relevant_sentences

['A few days should be enough for that task!',
 'A hundred dollars is a lot of money.',
 'Five days is a long time to wait for a response.',
 "Seven weeks of thesis writing takes a toll on one's body.",
 'Five people is a not a huge number of attendees.',
 'Revenue exceeded twelve billion dollars, with a loss of $1b.',
 'Five people is a not a huge number of attendees.',
 'A dozen eggs is too many.']

In [88]:
# Parse a one-off example and stop at the token whose text is "days"; the loop
# variable `d` stays bound to that token after the `break` and is inspected by
# the following cells.
ddoc = nlp("five days of that is fun!")
for d in ddoc:
    if d.text == "days":
        break

In [97]:
list(d.children)[0].dep_

'nummod'

In [89]:
d.head.dep_, d.dep_, list(d.ancestors)

('ROOT', 'nsubj', [is])

In [None]:
'''
Plan:
1. Read all sentences and split them into batches --> store the batches.
2. Run spaCy over the batches --> for each sentence, store one entry holding its plural nouns together with the sentence; if a sentence has no plural nouns, store an empty entry.
'''

In [4]:
def read_sents(path, batch_size=-1):
    """Read a text file line by line, stripping surrounding whitespace.

    With a negative `batch_size` (the default) the result is a flat list of
    stripped lines. Otherwise the stripped lines are grouped, in order, into
    lists that are closed whenever their length equals `batch_size`; a trailing
    non-empty group is kept as the last batch.
    """
    with open(path) as handle:
        stripped = [raw.strip() for raw in handle]

    if batch_size < 0:
        return stripped

    grouped = []
    pending = []
    for text in stripped:
        pending.append(text)
        if len(pending) == batch_size:
            grouped.append(pending)
            pending = []
    if pending:
        grouped.append(pending)
    return grouped

In [5]:
# Load the sentence file as one flat list of stripped lines (no batch_size is
# passed, so read_sents uses its default of -1, i.e. no batching).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
babylm = read_sents("/home/km55359/rawdata/babylm_data/babylm_100M/sents/babylm_sents.txt")
len(babylm)

11632017

In [None]:
1208821

In [None]:
# def sentence_tokenize(lst):
#     for item in lst:
#         sentences.extend(sent_tokenize(item))
#     return


# def sentence_tokenize_parallel(lst):
#     pbar = tqdm(lst)
#     _ = Parallel(n_jobs=10)(delayed(sentence_tokenize)(f) for f in pbar)
#     return

In [6]:
def retrieve_plural_nouns(doc):
    """Collect, in document order, every token of `doc` accepted by `is_plural_noun`."""
    return [tok for tok in doc if is_plural_noun(tok)]

In [8]:
# Parse the whole corpus and keep, per parsed doc, the list of its plural-noun
# tokens (an empty list when there are none). Later cells index PL_NOUNS and
# `babylm` with the same integer, so entries are presumably aligned one-to-one
# with `babylm` — TODO confirm nlp.pipe preserves input order with n_process > 1.
# The lemmatizer and NER components are disabled for this pass.
# The recorded run over ~11.6M sentences took about 2h41m.
PL_NOUNS = []
for doc in tqdm(nlp.pipe(babylm, disable=["lemmatizer", "ner"], n_process=32)):
    pl_nouns = retrieve_plural_nouns(doc)
    PL_NOUNS.append(pl_nouns)

11632017it [2:41:35, 1199.73it/s]


In [17]:
# NOTE(review): `noun` is not bound by any cell above this one in the visible
# notebook (it is the loop variable of a later cell), so this only works with
# leftover kernel state.
list(noun.ancestors)

[BE]

In [87]:
# NOTE(review): `head` is not bound by any cell above this one in the visible
# notebook (it is assigned inside a later cell's loop), so this only works with
# leftover kernel state.
head.tag_

'VB'

In [98]:
# Pass 1: sentences containing a plural noun whose direct syntactic head is a
# VERB/AUX carrying Number=Sing. `relevant` rows are
# (sentence idx, noun text, head text, noun POS, head POS, head dep, noun dep).
indices = set()
relevant = []
for i, nouns in enumerate(tqdm(PL_NOUNS)):
    # check whether the noun's direct head (not any ancestor) is a singular verb/aux
    for noun in nouns:
        head = noun.head
        morph = head.morph
        if (head.pos_ == "VERB" or head.pos_ == "AUX") and morph.get("Number") == ["Sing"]:
            indices.add(i)
            relevant.append((i, noun.text, head.text, noun.pos_, head.pos_, head.dep_, noun.dep_))
            # Leaves the noun loop: at most one row per sentence.
            break

# Pass 2: sentences containing a plural noun with a child that is either labelled
# `nummod` or is literally "few", "dozen" or "couple" (case-sensitive text match).
# `relevant_nummods` rows are
# (sentence idx, noun text, child text, noun POS, child POS, child dep, noun dep).
nummods_indices = set()
relevant_nummods = []
for i, nouns in enumerate(tqdm(PL_NOUNS)):
    for noun in nouns:
        for child in noun.children:
            if child.dep_ == "nummod" or child.text == "few" or child.text == "dozen" or child.text == "couple":
                nummods_indices.add(i)
                relevant_nummods.append((i, noun.text, child.text, noun.pos_, child.pos_, child.dep_, noun.dep_))
                # NOTE(review): this only leaves the child loop, so a sentence can
                # contribute one row per qualifying noun — unlike pass 1, which stops
                # at the first matching noun. Confirm the asymmetry is intended.
                break

  0%|          | 0/11632017 [00:00<?, ?it/s]

100%|██████████| 11632017/11632017 [00:07<00:00, 1503995.23it/s]
100%|██████████| 11632017/11632017 [00:09<00:00, 1173938.12it/s]


In [103]:
# Sentence indices flagged by both passes. The two passes test nouns independently,
# so the singular-headed noun and the numeral-modified noun need not be the same token.
intersect = indices.intersection(nummods_indices)

In [108]:
babylm[5898245]

'The lesser roadrunner has zygodactyl feet, meaning it has two toes on the back of its feet and two toes on the front.'

In [109]:
# Save both result lists as CSV files.
# NOTE(review): pandas is imported here rather than in the import cell at the top.
import pandas as pd
# Column names follow the field order of the tuples appended to `relevant`.
df = pd.DataFrame(relevant, columns=["idx", "noun", "verb", "noun_pos", "verb_pos", "verb_dep", "noun_dep"])
df.to_csv("../data/babylm-analysis/plural_nouns_with_singular_heads.csv", index=False)

# `df` is rebound here; column names follow the tuples appended to `relevant_nummods`.
df = pd.DataFrame(relevant_nummods, columns=["idx", "noun", "numeral", "noun_pos", "numeral_pos", "numeral_dep", "noun_dep"])
df.to_csv("../data/babylm-analysis/plural_nouns_with_nummods.csv", index=False)

In [49]:
list(sorted(list(indices)))[:10]

[36, 52, 158, 237, 293, 297, 300, 301, 377, 396]

In [62]:
# NOTE(review): the recorded output shows a 5-field record, while the cell that
# builds `relevant` appends 7-field tuples — the output comes from an older run.
# The sentence index shown (537) also differs from the record's own index (406).
babylm[537], relevant[10]

('White has 2 options after this.', (406, 'kinds', "'s", 'relcl', 'ccomp'))

In [66]:
# Group the `relevant` records by the plural noun's own dependency label; each
# stored row is [sentence idx, noun text, head text, head dep].
# The noun's label is the last field of a record and the head's label the one
# before it. Indexing from the end works both for the 7-field records built
# earlier in this notebook and for the older 5-field layout; the positional
# `entry[4]` / `entry[:4]` would pick up POS tags from the 7-field records.
deps = defaultdict(list)
for entry in relevant:
    deps[entry[-1]].append([*entry[:3], entry[-2]])
deps = dict(deps)

In [77]:
babylm[7696]

'0n the open grasslands of central Asia, the steppes, lives an animal that issues invitations to prospective partners with a perfume.'

In [69]:
deps['nsubj']

[[158, 'messages', 'is', 'ROOT'],
 [559, 'defends', 'works', 'ROOT'],
 [1051, 'Euronews', 'has', 'ROOT'],
 [1068, 'injections', 'is', 'ROOT'],
 [1547, 'E', 'compares', 'ROOT'],
 [2484, 'chips', "'s", 'ROOT'],
 [2586, 'monkeys', "'s", 'ROOT'],
 [6657, 'Economics', 'tends', 'ROOT'],
 [7696, 'grasslands', 'lives', 'ROOT'],
 [9379, 'peoples', 'was', 'ROOT'],
 [10978, 'species', 'is', 'ROOT'],
 [11108, 'injuries', 'leaves', 'ROOT'],
 [12107, 'supervisors', 'does', 'advcl'],
 [12402, 'remains', 'is', 'ROOT'],
 [12616, 'Phascolarctos', 'is', 'ROOT'],
 [12734, 'variations', 'is', 'ROOT'],
 [12856, 'clothes', 'is', 'ROOT'],
 [12955, 'azygos', 'means', 'ROOT'],
 [13219, 'Contemplations', 'suggests', 'ROOT'],
 [13534, 'pounds', 's', 'ccomp'],
 [13916, 'Foulds', 'is', 'ROOT'],
 [13924, 'codice_1', 'is', 'ROOT'],
 [13948, 'codice_1', 'serves', 'ROOT'],
 [14029, '1980s', 'was', 'ROOT'],
 [14053, 'years', 'is', 'ccomp'],
 [14246, 'clerks', 'is', 'ROOT'],
 [14523, 'Bubbles', 'organizes', 'ROOT'],
 [15

In [68]:
len(deps), {k:len(v) for k, v in deps.items()}

(31,
 {'dobj': 60213,
  'attr': 37499,
  'nsubj': 23181,
  'ccomp': 870,
  'conj': 4057,
  'advcl': 956,
  'npadvmod': 5644,
  'dep': 1972,
  'oprd': 120,
  'acomp': 165,
  'compound': 113,
  'dative': 263,
  'nmod': 9,
  'intj': 136,
  'poss': 10,
  'appos': 130,
  'pobj': 7,
  'csubj': 31,
  'cc': 6,
  'punct': 173,
  'prep': 17,
  'xcomp': 28,
  'parataxis': 50,
  'relcl': 8,
  'advmod': 52,
  'neg': 1,
  'mark': 1,
  'nsubjpass': 5,
  'prt': 1,
  'amod': 1,
  'meta': 1})

In [42]:
# manager = Manager()
# sentences = manager.list()

'It may not look like much now but this is actually going to turn into one of the three student lounges which is a really big step up from our old campus.'

In [23]:
# For every plural noun in one probe sentence, print the noun together with each
# of its ancestors and that ancestor's Number feature.
# Uses `is_plural_noun`, the predicate defined in this notebook; the previously
# called `is_plural` is not defined anywhere in the visible notebook.
# NOTE(review): the recorded output ("people", "attendees") does not correspond to
# index 3 of the current `sentences` list — confirm which sentence is meant.
parsed = nlp(sentences[3])
for token in parsed:
    if is_plural_noun(token):
        print(token, [(a, a.morph.get("Number")) for a in token.ancestors])

people [(is, ['Sing'])]
attendees [(of, []), (number, ['Sing']), (is, ['Sing'])]


In [97]:
def singular_plural(parsed):
    """Find singular verbs that take a plural noun subject.

    Scans every token whose fine-grained tag contains "VB" and whose morphology
    has Number=Sing, then looks among its direct children for a token with POS
    "NOUN", dependency "nsubj" and Number=Plur.

    Returns:
        (found, materials): `materials` is a list of [verb_text, subject_text]
        pairs (at most one per verb) and `found` is True when it is non-empty.
    """
    materials = []
    for token in parsed:
        if "VB" not in token.tag_:
            continue
        if token.morph.to_dict().get("Number") != "Sing":
            continue
        for child in token.children:
            if child.pos_ != "NOUN" or child.dep_ != "nsubj":
                continue
            # .get(): a subject noun without a Number feature must not raise
            # KeyError (the previous `child_morph['Number']` lookup did).
            if child.morph.to_dict().get("Number") == "Plur":
                materials.append([token.text, child.text])
                # Only the first plural subject of each verb is recorded.
                break
    return bool(materials), materials

# Run the pipeline over the probe sentences (lemmatizer and NER disabled) and
# keep the text of every sentence in which singular_plural finds a match.
docs = []
targets = []
for doc in tqdm(nlp.pipe(sentences, disable=["lemmatizer", "ner"], n_process=32, batch_size=10000)):
    docs.append(doc.text)
    # singular_plural returns a (found, materials) tuple; testing the tuple itself
    # is always truthy, so unpack it and test the flag.
    found, materials = singular_plural(doc)
    if found:
        targets.append(doc.text)
    

Five days is a long time to wait for a response.
Seven weeks of thesis writing takes a toll on one's body.
A year is a long time to wait.
Five people is a not a huge number of attendees.
Revenue exceeded twelve billion dollars, with a loss of $1b.
Net income was $9.4 million compared to the prior year of $2.7 million.
Five people is a not a huge number of attendees.


In [26]:
# corpus = read_babylm(f"/home/km55359/rawdata/babylm_data/babylm_100M/aochildes.train")
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
DIR = "/home/km55359/rawdata/babylm_data/babylm_100M/"
corpus = {}

# print(list(glob.glob(DIR)))

# Build {corpus name: sentences} from every *.train file in DIR.
for file in glob.glob(f"{DIR}/*.train"):
    # Splitting on a capturing group keeps the separators, so the tail of the
    # result is [..., name, ".train", ""] and the name sits at index -3.
    # The dot is escaped so only a literal ".train" is treated as a separator
    # (an unescaped "." also matches e.g. the "strain" in "restrain.train").
    corpus_name = re.split(r"(/|\.train)", file)[-3]
    sents = read_babylm(f"{DIR}/{corpus_name}.train")
    corpus[corpus_name] = sents

In [27]:
{corpus_name: len(corpus[corpus_name]) for corpus_name in corpus.keys()}

{'open_subtitles': 5440000,
 'qed': 960000,
 'bnc_spoken': 848989,
 'wikipedia': 255000,
 'gutenberg': 1130000,
 'aochildes': 763989,
 'simple_wikipedia': 686554,
 'children_stories': 77760,
 'cbt': 263519,
 'switchboard': 161740}

In [98]:
# Collect, per corpus, the (text, materials) pairs for which singular_plural
# reports a singular verb with a plural noun subject.
target_sentences = defaultdict(list)
for corpus_name, sents in corpus.items():
    # Only the "children_stories" corpus is processed; every other corpus is skipped.
    if corpus_name == "children_stories":
        for doc in tqdm(
            nlp.pipe(sents, disable=["lemmatizer", "ner"], n_process=32, batch_size=10000)
        ):
            found, materials = singular_plural(doc)
            if found:
                target_sentences[corpus_name].append((doc.text, materials))

77760it [04:12, 307.87it/s] 


In [100]:
target_sentences["children_stories"][10:20]

[('The sheep was as yet all raw and bloody; but he liked it the better for that. He sniffed about to the right and left, saying:',
  [['was', 'sheep']]),
 ('After Prince Houssain had run through that division, street by street, his thoughts fully employed on the riches he had seen, he was very much tired, which a merchant perceiving, civilly invited him to sit down in his shop, and he accepted; but had not been sat down long before he saw a crier pass by with a piece of tapestry on his arm, about six feet square, and cried at thirty purses. The Prince called to the crier, and asked to see the tapestry, which seemed to him to be valued at an exorbitant price, not only for the size of it, but the meanness of the stuff; when he had examined it well, he told the crier that he could not comprehend how so small a piece of tapestry, and of so indifferent appearance, could be set at so high a price.',
  [['was', 'thoughts']]),
 ('“Sir,” said the crier, giving it into his hand, “if you look at 

In [84]:
# target_sentences = []``
# for sentence in corpus:
#     doc = nlp(sentence)
#     if singular_plural(doc):
#         target_sentences.append(sentence)
# Print text, coarse POS, fine-grained tag and direct children for every token of `doc`.
# NOTE(review): `doc` is not assigned in this cell; it is whatever an earlier
# loop last left bound, so the output depends on execution order.
for token in doc:
    print(token.text, token.pos_, token.tag_, [child.text for child in token.children])

Five  CD []
people  NNS ['Five']
is  VBZ ['people', 'number', '.']
a  DT []
not  RB []
a  DT []
huge  JJ []
number  NN ['a', 'not', 'a', 'huge', 'of']
of  IN ['attendees']
attendees  NNS []
.  . []


In [96]:
target_sentences['children_stories']

['“I am waited for in Egypt,” answered the Swallow.  “To-morrow my friends will fly up to the Second Cataract.  The river-horse couches there among the bulrushes, and on a great granite throne sits the God Memnon.  All night long he watches the stars, and when the morning star shines he utters one cry of joy, and then he is silent.  At noon the yellow lions come down to the water’s edge to drink.  They have eyes like green beryls, and their roar is louder than the roar of the cataract.”',
 'There was once a Prince who wished to marry a Princess; but then she must be a real Princess. He travelled all over the world in hopes of finding such a lady; but there was always something wrong. Princesses he found in plenty; but whether they were real Princesses it was impossible for him to decide, for now one thing, now another, seemed to him not quite right about the ladies. At last he returned to his palace quite cast down, because he wished so much to have a real Princess for his wife.',
 "“I

In [None]:
target_sentences

["and here's some apples .",
 'fun facts is a book .',
 "it's time those guys yikes they better be getting in the bathtub !",
 "theo where's your toes ?",
 'yes the pretend person is not going to find the blocks is he .',
 "theo where's the fishies .",
 'here comes more tummy kisses .',
 "carrots keeps warm and dry in kim's coat .",
 "avery's got dolls doesn't she .",
 "the bunny has big ears doesn't he .",
 "yay father whistles sometimes doesn't he !",
 "where's your teeth ?",
 "zachary likes police boats doesn't he .",
 "where's the fishies ?",
 'guys guys guys guys .',
 "where's his eyes ?",
 "where's mama's eyes ?",
 'bears bears bears .',
 "where's nicholas ?",
 "where's the cows ?",
 "where's your shoes ?",
 "where's roy's shoes ?",
 "where's the shoes ?",
 "where's the monkey's eyes ?",
 "where's the monkey's eyes ?",
 "oh here's some pictures of babies that we know .",
 "here's some apples and blueberries mm .",
 "where's the chickie's eyes ?",
 "where's bert's eyes ?",
 "where