In [23]:
import glob
import re
import spacy

from collections import defaultdict
from utils import read_babylm
from tqdm import tqdm

In [76]:
# Load the spaCy pipeline named "en_core_web_sm"; it is shared by every cell below.
nlp = spacy.load("en_core_web_sm")

# Hand-written probe sentences that are fed to the singular/plural detector
# defined further down in this notebook.
sentences = [
    "Five days is a long time to wait for a response.",
    "Seven weeks of thesis writing takes a toll on one's body.",
    "A year is a long time to wait.",
    # NOTE(review): "is a not a" looks like a typo for "is not a" -- confirm
    # whether the malformed sentence is intentional before changing it.
    "Five people is a not a huge number of attendees.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    # NOTE(review): exact duplicate of the fourth entry above -- presumably unintended.
    "Five people is a not a huge number of attendees.",
]

In [97]:
def singular_plural(parsed):
    """Find singular verbs that take a plural noun subject.

    Every token whose fine-grained tag contains "VB" and whose morphology
    reports ``Number=Sing`` is inspected; its direct dependency children are
    searched for a ``NOUN`` with dependency label ``nsubj`` and
    ``Number=Plur``. At most one subject is recorded per verb (the first
    matching child).

    Args:
        parsed: Iterable of token-like objects exposing ``tag_``, ``pos_``,
            ``dep_``, ``text``, ``children`` and ``morph.to_dict()`` --
            e.g. a parsed spaCy ``Doc``.

    Returns:
        A ``(found, materials)`` tuple. ``materials`` is a list of
        ``[verb_text, subject_text]`` pairs in token order and ``found`` is
        True exactly when that list is non-empty. Because a tuple is always
        truthy, callers must unpack ``found`` rather than test the return
        value directly.
    """
    materials = []
    for token in parsed:
        if "VB" not in token.tag_:
            continue
        # .get() instead of indexing: not every token carries a Number feature.
        if token.morph.to_dict().get("Number") != "Sing":
            continue
        for child in token.children:
            if child.pos_ != "NOUN" or child.dep_ != "nsubj":
                continue
            # A noun subject may lack a Number feature; treat that as
            # "not plural" instead of raising KeyError.
            if child.morph.to_dict().get("Number") == "Plur":
                materials.append([token.text, child.text])
                break  # one subject per verb is enough
    return bool(materials), materials

# Run the detector over the probe sentences. The detector only reads tags,
# morphology and the dependency tree, so the lemmatizer and NER components
# are switched off.
docs = []
targets = []
for doc in tqdm(nlp.pipe(sentences, disable=["lemmatizer", "ner"], n_process=32, batch_size=10000)):
    docs.append(doc.text)
    # singular_plural returns a (found, materials) tuple, which is always
    # truthy -- unpack the flag so only genuine hits are collected.
    found, _pairs = singular_plural(doc)
    if found:
        targets.append(doc.text)
    

Five days is a long time to wait for a response.
Seven weeks of thesis writing takes a toll on one's body.
A year is a long time to wait.
Five people is a not a huge number of attendees.
Revenue exceeded twelve billion dollars, with a loss of $1b.
Net income was $9.4 million compared to the prior year of $2.7 million.
Five people is a not a huge number of attendees.


In [26]:
# Load every "*.train" file under DIR into `corpus`, keyed by the file name
# without its ".train" suffix.
DIR = "/home/km55359/rawdata/babylm_data/babylm_100M/"
corpus = {}

# DIR already ends in "/", so strip it before joining to avoid a "//" in the pattern.
for file in glob.glob(f"{DIR.rstrip('/')}/*.train"):
    # Anchor on the last path component and escape the dot: the previous
    # pattern r"(/|.train)" treated *any* character followed by "train" as a
    # separator, so names containing "train" were split in the wrong place.
    corpus_name = re.search(r"([^/\\]+)\.train$", file).group(1)
    # Read the exact path glob returned rather than rebuilding it from the name.
    sents = read_babylm(file)
    corpus[corpus_name] = sents

In [27]:
# Number of entries loaded for each corpus.
{name: len(entries) for name, entries in corpus.items()}

{'open_subtitles': 5440000,
 'qed': 960000,
 'bnc_spoken': 848989,
 'wikipedia': 255000,
 'gutenberg': 1130000,
 'aochildes': 763989,
 'simple_wikipedia': 686554,
 'children_stories': 77760,
 'cbt': 263519,
 'switchboard': 161740}

In [98]:
# Collect detector hits per corpus as (sentence text, [[verb, subject], ...]).
# Only the "children_stories" corpus is scanned in this cell.
target_sentences = defaultdict(list)
for corpus_name, sents in corpus.items():
    if corpus_name != "children_stories":
        continue
    parsed_stream = nlp.pipe(sents, disable=["lemmatizer", "ner"], n_process=32, batch_size=10000)
    for doc in tqdm(parsed_stream):
        found, materials = singular_plural(doc)
        if not found:
            continue
        target_sentences[corpus_name].append((doc.text, materials))

77760it [04:12, 307.87it/s] 


In [100]:
# Show hits 10-19 for the children_stories corpus; each entry is a
# (text, [[verb, subject], ...]) tuple.
target_sentences["children_stories"][10:20]

[('The sheep was as yet all raw and bloody; but he liked it the better for that. He sniffed about to the right and left, saying:',
  [['was', 'sheep']]),
 ('After Prince Houssain had run through that division, street by street, his thoughts fully employed on the riches he had seen, he was very much tired, which a merchant perceiving, civilly invited him to sit down in his shop, and he accepted; but had not been sat down long before he saw a crier pass by with a piece of tapestry on his arm, about six feet square, and cried at thirty purses. The Prince called to the crier, and asked to see the tapestry, which seemed to him to be valued at an exorbitant price, not only for the size of it, but the meanness of the stuff; when he had examined it well, he told the crier that he could not comprehend how so small a piece of tapestry, and of so indifferent appearance, could be set at so high a price.',
  [['was', 'thoughts']]),
 ('“Sir,” said the crier, giving it into his hand, “if you look at 

In [84]:
# Debug view: one line per token with its text, coarse POS, fine-grained tag
# and the texts of its direct dependency children.
# NOTE: `doc` is whatever document an earlier cell's loop left behind.
for token in doc:
    child_texts = [child.text for child in token.children]
    print(token.text, token.pos_, token.tag_, child_texts)

Five  CD []
people  NNS ['Five']
is  VBZ ['people', 'number', '.']
a  DT []
not  RB []
a  DT []
huge  JJ []
number  NN ['a', 'not', 'a', 'huge', 'of']
of  IN ['attendees']
attendees  NNS []
.  . []


In [96]:
# Preview only the first few hits; displaying the whole list floods the
# notebook with multi-paragraph passages.
target_sentences['children_stories'][:10]

['“I am waited for in Egypt,” answered the Swallow.  “To-morrow my friends will fly up to the Second Cataract.  The river-horse couches there among the bulrushes, and on a great granite throne sits the God Memnon.  All night long he watches the stars, and when the morning star shines he utters one cry of joy, and then he is silent.  At noon the yellow lions come down to the water’s edge to drink.  They have eyes like green beryls, and their roar is louder than the roar of the cataract.”',
 'There was once a Prince who wished to marry a Princess; but then she must be a real Princess. He travelled all over the world in hopes of finding such a lady; but there was always something wrong. Princesses he found in plenty; but whether they were real Princesses it was impossible for him to decide, for now one thing, now another, seemed to him not quite right about the ladies. At last he returned to his palace quite cast down, because he wished so much to have a real Princess for his wife.',
 "“I

In [None]:
# NOTE(review): this displays the entire container. The output saved under
# this cell looks like a flat list of sentences, which does not match the
# defaultdict assigned to `target_sentences` earlier in the notebook --
# presumably stale output from an older version; confirm and re-run.
target_sentences

["and here's some apples .",
 'fun facts is a book .',
 "it's time those guys yikes they better be getting in the bathtub !",
 "theo where's your toes ?",
 'yes the pretend person is not going to find the blocks is he .',
 "theo where's the fishies .",
 'here comes more tummy kisses .',
 "carrots keeps warm and dry in kim's coat .",
 "avery's got dolls doesn't she .",
 "the bunny has big ears doesn't he .",
 "yay father whistles sometimes doesn't he !",
 "where's your teeth ?",
 "zachary likes police boats doesn't he .",
 "where's the fishies ?",
 'guys guys guys guys .',
 "where's his eyes ?",
 "where's mama's eyes ?",
 'bears bears bears .',
 "where's nicholas ?",
 "where's the cows ?",
 "where's your shoes ?",
 "where's roy's shoes ?",
 "where's the shoes ?",
 "where's the monkey's eyes ?",
 "where's the monkey's eyes ?",
 "oh here's some pictures of babies that we know .",
 "here's some apples and blueberries mm .",
 "where's the chickie's eyes ?",
 "where's bert's eyes ?",
 "where