In [1]:
import glob
import re
import spacy

import pandas as pd

from collections import defaultdict
from minicons.utils import find_pattern
from spacy.lang.en import English
from tqdm import tqdm

import utils
import config
import minicons.utils as mu

In [2]:
spacy_model = spacy.load("en_core_web_sm")

In [3]:
nlp = English()
tokenizer = nlp.tokenizer

In [4]:
sentence = "the family spent a beautiful dozen days for you to get it."
def text2parse(sentence):
    """Parse ``sentence`` with the module-level ``spacy_model`` and flatten its annotations.

    Returns:
        A 3-tuple of space-joined strings with one item per token, in token order:
        dependency labels (``dep_``), coarse POS tags (``pos_``) and fine-grained
        tags (``tag_``).
    """
    parsed_tokens = list(spacy_model(sentence))
    dep_labels = [tok.dep_ for tok in parsed_tokens]
    coarse_pos = [tok.pos_ for tok in parsed_tokens]
    fine_tags = [tok.tag_ for tok in parsed_tokens]
    return " ".join(dep_labels), " ".join(coarse_pos), " ".join(fine_tags)

In [8]:
text2parse('I struggled through them, I somehow felt I had a few dozen eggs than when I started.')

('nsubj ccomp prep pobj punct nsubj advmod ROOT nsubj ccomp quantmod amod nummod dobj prep advmod nsubj pcomp punct',
 'PRON VERB ADP PRON PUNCT PRON ADV VERB PRON VERB DET ADJ NOUN NOUN ADP SCONJ PRON VERB PUNCT',
 'PRP VBD IN PRP , PRP RB VBD PRP VBD DT JJ NN NNS IN WRB PRP VBD .')

In [9]:
# Regexes over space-separated POS-tag strings (e.g. 'DT JJ CD NNS').
# All three share the same prefix: DT, then one or more adjective groups
# (optional RBs, one of JJ/JJR/JJS, optional CCs), then a "numeral" slot, then nouns.
#
# ADVANCED_REGEX: the numeral slot also accepts an adjective tag or "CD CD"; the noun slot
# also accepts "NN NNS" and "(NN|NNS) IN NNS".
ADVANCED_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# ULTRA_REGEX: same as ADVANCED_REGEX, but the numeral slot additionally accepts NN.
ULTRA_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|NN|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# AANN_REGEX: strictest form; the numeral slot must be CD (optionally "CD (TO|CC) CD ...")
# and the noun slot only accepts NNS/NNPS.
AANN_REGEX = r"\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS))+"

In [10]:
# Construction slots that aann_meta records; tokens whose category is not listed here
# (i.e. 'CC' and 'TO' below) are dropped from the per-slot output.
cats = ['DT', 'ADJ', 'NUMERAL', 'NOUN', "ADV"]

# Maps each POS tag that the regexes can match to a construction slot.
# Note: 'IN' is mapped to 'NOUN', so the preposition inside a "(NN|NNS) IN NNS" noun
# phrase is stored together with the noun tokens.
pos2cat = {
    'DT': 'DT',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'CD': 'NUMERAL',
    'NNS': 'NOUN',
    'NNPS': 'NOUN',
    'NN': 'NOUN',
    'IN': 'NOUN',
    'RB': 'ADV',
    'CC': 'CC',
    'TO': 'TO'
}

def detect_aann_advanced(sequence):
    """Return the first ``ADVANCED_REGEX`` match in the tag string ``sequence``, else ``None``."""
    advanced_match = re.search(ADVANCED_REGEX, sequence)
    return advanced_match

def detect_aann_basic(sequence):
    """Return the first ``AANN_REGEX`` match in the tag string ``sequence``, else ``None``."""
    basic_match = re.search(AANN_REGEX, sequence)
    return basic_match

def detect_aann_ultra(sequence):
    """Return the first ``ULTRA_REGEX`` match in the tag string ``sequence``, else ``None``."""
    ultra_match = re.search(ULTRA_REGEX, sequence)
    return ultra_match

def tokenize(string):
    """Run the module-level ``tokenizer`` on ``string`` and return the token texts as a list."""
    token_texts = []
    for tok in tokenizer(string):
        token_texts.append(tok.text)
    return token_texts


def read_file(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    Args:
        path: path of the file to read.

    Returns:
        list[str]: one entry per line of the file (blank lines become ``""``).

    TODO (carried over from the original): make read all.
    """
    # The context manager closes the handle as soon as reading finishes; the original
    # left the file object open until garbage collection.
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle]

def aann_meta(token_seq, const_pattern):
    """Group the tokens of a construction by slot, based on its POS-tag pattern.

    Args:
        token_seq: tokens of the construction.
        const_pattern: space-separated POS tags, paired position-by-position with
            ``token_seq``.

    Returns:
        dict with keys 'DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV'; each value is the tokens
        of that slot joined with " & " (empty string when the slot has no tokens).
        Tokens whose category (via ``pos2cat``) is not in ``cats`` are left out.
    """
    slots = {name: [] for name in ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV')}

    for word, tag in zip(token_seq, const_pattern.split()):
        slot = pos2cat[tag]
        if slot not in cats:
            continue
        slots[slot].append(word)

    return {slot: " & ".join(words) for slot, words in slots.items()}

def store_aann_spans(pos_seq):
    """Find the first AANN-like match in each POS-tag string.

    Args:
        pos_seq: iterable of space-separated POS-tag strings, one per sentence.

    Returns:
        list of ``(index, tag_string, (start, end))`` tuples for the sequences that
        matched, where ``(start, end)`` is the character span of the match inside
        ``tag_string``. Sequences with no match are omitted.
    """
    # Priority order: basic, then advanced, then ultra. Detectors are tried lazily so
    # the later regexes are only run when the earlier ones found nothing (the original
    # ran all three on every sequence and then discarded the unused results).
    detectors = (detect_aann_basic, detect_aann_advanced, detect_aann_ultra)

    aanns = []
    for i, seq in enumerate(pos_seq):
        for detect in detectors:
            match = detect(seq)
            if match:
                aanns.append((i, seq, match.span()))
                break
    return aanns

def store_aanns(sents, pos_seq, indefinite_articles=True, corpus=None):
    """Extract AANN-like constructions from sentences using their POS-tag strings.

    Args:
        sents: sentences, indexable with the same indices as ``pos_seq``.
        pos_seq: iterable of space-separated POS-tag strings, one per sentence.
        indefinite_articles: when True (default), keep only constructions whose first
            token is "a", "an" or "another" (case-insensitive, after stripping a
            leading "-a" artefact); when False, keep every match.
        corpus: label stored under the "source" key of every returned record.

    Returns:
        list of dicts: the output of ``aann_meta`` extended with the keys "sentence",
        "sentence_idx", "pattern", "source" and "construction".

    Side effects:
        Appends to the notebook-level lists ``construction_forms``,
        ``construction_ids`` and ``all_aanns`` and adds to the set ``patterns``.
        These names must already exist in the namespace when this is called.
    """
    full_aann_data = []

    # (sentence index, tag string, character span of the match in the tag string)
    aanns = store_aann_spans(pos_seq)

    # Given the spans, extract the tokens and the information relevant for parsing.
    # The loop variable is named tag_seq so that it no longer shadows the pos_seq parameter.
    for idx, tag_seq, span in aanns:
        construction_pattern = tag_seq[span[0] : span[1]]
        # Convert the character-level match into positions in the tag list;
        # the result is used as slice bounds on the sentence tokens below.
        construction_pattern_span = find_pattern(
            construction_pattern.split(), tag_seq.split()
        )

        tokens = tokenize(sents[idx])
        if tokens == []:
            continue

        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        if not extracted_token_seq:
            # Tags and tokens are misaligned for this sentence (the slice is empty);
            # skip it instead of failing with IndexError on extracted_token_seq[0].
            continue

        if extracted_token_seq[0].lower().startswith("-a"):
            extracted_token_seq[0] = (
                extracted_token_seq[0].replace("-a", "a").replace("-A", "A")
            )

        if indefinite_articles:
            condition = extracted_token_seq[0].lower() in ["a", "an", "-a", "another"]
        else:
            condition = True
        if not condition:
            continue

        construction_form = " ".join(extracted_token_seq)

        # Notebook-level bookkeeping (see "Side effects" in the docstring).
        construction_forms.append(construction_form)
        construction_ids.append(idx)
        all_aanns.append(construction_form)
        patterns.add(construction_pattern)

        construction_elements = aann_meta(extracted_token_seq, construction_pattern)
        construction_elements["sentence"] = sents[idx]
        construction_elements["sentence_idx"] = idx
        construction_elements["pattern"] = construction_pattern
        construction_elements["source"] = corpus
        construction_elements["construction"] = construction_form

        full_aann_data.append(construction_elements)

    return full_aann_data

In [29]:
# Extract AANN-like constructions (basic regex first, advanced regex as fallback) from
# every *.train POS-tag file and its matching sentence file.
full_aann_data = []
all_aanns = []
patterns = set()

all_sents = []
pos_seqs = []

for file in glob.glob("/home/km55359/rawdata/babylm_data/postags_100M/*.train"):
    # Corpus name = file name without directory and ".train" suffix. The dot is escaped
    # so that only a literal ".train" (not any character followed by "train") splits.
    corpus = re.split(r"(/|\.train)", file)[-3]
    pos = read_file(file)
    sents = read_file(f"/home/km55359/rawdata/babylm_data/babylm_100M/{corpus}.train")

    pos_seqs.extend(pos)
    all_sents.extend(sents)

    aanns = []
    for i, seq in enumerate(pos):
        # if re.search(r'\bDT JJ CD (NNS|NNPS)', seq):
        # The original called `detect_adv_aann`, which is not defined anywhere in this
        # notebook (NameError on a fresh kernel); `detect_aann_advanced` is the
        # defined detector for the advanced pattern.
        searched = detect_aann_advanced(seq)
        searched_basic = detect_aann_basic(seq)
        if searched_basic:
            aanns.append((i, seq, searched_basic.span()))
        elif searched:
            aanns.append((i, seq, searched.span()))

    construction_forms = []
    construction_ids = []

    CONSTRUCTION_FORMS = []
    CONSTRUCTION_IDS = []

    for entry in aanns:
        idx, pos_seq, span = entry
        construction_pattern = pos_seq[span[0] : span[1]]
        # Convert the character-level match into positions in the tag list;
        # the result is used as slice bounds on the sentence tokens below.
        construction_pattern_span = find_pattern(
            construction_pattern.split(), pos_seq.split()
        )

        tokens = tokenize(sents[idx])
        if tokens == []:
            continue

        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        if not extracted_token_seq:
            # Tags and tokens are misaligned for this sentence (empty slice); skip it
            # instead of failing with IndexError on extracted_token_seq[0].
            continue

        if extracted_token_seq[0].lower().startswith("-a"):
            extracted_token_seq[0] = (
                extracted_token_seq[0].replace("-a", "a").replace("-A", "A")
            )
        if extracted_token_seq[0].lower() in ["a", "an", "-a", "another"]:
            construction_form = " ".join(extracted_token_seq)

            construction_forms.append(construction_form)
            construction_ids.append(idx)

            all_aanns.append(construction_form)

            patterns.add(construction_pattern)

            construction_elements = aann_meta(
                extracted_token_seq, construction_pattern
            )
            construction_elements["sentence"] = sents[idx]
            construction_elements["sentence_idx"] = idx
            construction_elements["pattern"] = construction_pattern
            construction_elements["source"] = corpus
            construction_elements["construction"] = construction_form

            full_aann_data.append(construction_elements)

In [30]:
extracted_token_seq

['these', 'little', 'teeny', 'weeny', 'silver', 'spatulas']

In [31]:
construction_pattern

'DT JJ JJ JJ NN NNS'

In [33]:
len(full_aann_data)

5136

In [34]:
# Sanity check: print the index of every record whose tag pattern and construction
# contain a different number of space-separated items.
for i, entry in enumerate(full_aann_data):
    if len(entry['pattern'].split(" ")) != len(entry['construction'].split(" ")):
        print(i)

In [35]:
# find position of target word from behind
# re.search("more", full_aann_data[7]['construction'])
# list(re.finditer("more", "a more more of things"))[-1].span
# for word in
full_aann_data[7]['construction'].split()

['a', 'few', 'more', 'things']

In [36]:
[i for i, w in enumerate(full_aann_data[7]['construction'].split()) if w == "more"][-1]

2

In [37]:
# Partition the extracted records by whether their NUMERAL slot is filled.
non_numerals, numerals = [], []
for entry in full_aann_data:
    bucket = non_numerals if entry['NUMERAL'] == '' else numerals
    bucket.append(entry)

In [38]:
# these are the original aanns + ones with multi-word expressions for nouns. Let's take a look at those?

len(numerals), len(non_numerals)

(1139, 3997)

In [39]:
# Load the previously saved regex extraction; each row is indexed by column name below,
# so utils.read_csv_dict presumably returns a list of dict rows (TODO confirm).
babylm_regex = utils.read_csv_dict("../data/baby_aann_data.csv")
# Construction strings of the saved extraction and of the current numeral records,
# used for the membership comparisons in the following cells.
babylm_constructions = [b['construction'] for b in babylm_regex]
numeral_constructions = [b['construction'] for b in numerals]

In [41]:
# Print every row of the saved regex extraction whose construction string is missing
# from the current numeral constructions.
for b in babylm_regex:
    if b['construction'] not in numeral_constructions:
        print(b)

In [42]:
# Partition the numeral records by whether the saved regex extraction already
# contains their construction string.
captured, not_captured = [], []
for n in numerals:
    target = captured if n['construction'] in babylm_constructions else not_captured
    target.append(n)

In [43]:
len(captured), len(not_captured)

(1038, 101)

In [44]:
# List the numeral constructions that contain a non-breaking space (U+00A0),
# together with their tag pattern.
for nc in numerals:
    if u"\xa0" in nc['construction']:
        print(nc['construction'], nc['pattern'])

a hulking 261   pounds DT JJ CD CD NNS
a few 37   mm weapons DT JJ CD CD NN NNS
a further 7   km DT JJ CD CD NNS
a further 20–24   cm DT JJ CD CD NNS


In [32]:
utils.parse_instance(not_captured[71])

AANN(article='a', adjective='further', numeral='km', noun='')

In [31]:
# Print each construction in not_captured with its list index, so individual
# entries can be looked up as not_captured[i].
for i, nc in enumerate(not_captured):
    print(i, nc['construction'])

0 an astonishing 26 million shells
1 an astonishing 340 million marks-
2 a mere 1 8 months
3 an official 202 1/4 pounds
4 A staggering 100 billion gallons
5 A staggering 100 billion gallons
6 A staggering 375 billion litres
7 Another few hundred million years
8 a big nine inch nails
9 a possible one hundred points
10 a full 1 5 seconds
11 a few hundred million dollars
12 a few hundred millions years
13 a few thousand NAND gates
14 a glorified seven hundred people
15 a resounding 83 % of people
16 a whopping six hundred and seventy tons
17 an extra two thousand years
18 a few hundred thousand years
19 a few five ten years
20 a couple hundred thousand dollars
21 a few hundred million years
22 a mere 15,000 year humans
23 an extra eleven million pounds
24 an extra twenty eight days
25 a further seventeen million road users
26 a further one hundred and eight people
27 a staggering fifty billion pounds
28 a further twenty thousand pounds
29 an additional twenty thousand pounds
30 an extra s

In [27]:
non_numeral_adjs = ['few', 'dozen', 'couple', 'several', 'many', 'more']
def verify_and_edit(entry):
    """Return a copy of ``entry`` with a trailing quantifier adjective moved to NUMERAL.

    When the record has an empty 'NUMERAL' slot and the last " & "-separated item of
    'ADJ' is one of ``non_numeral_adjs``, the copy gets that word as 'NUMERAL', loses it
    from 'ADJ', and has the tag at the word's last position in 'construction'
    replaced by "CD" in 'pattern'. Otherwise the copy is returned unchanged.
    The input dict is never modified.
    """
    edited = entry.copy()
    if edited['NUMERAL'] != '':
        return edited

    *leading_adjs, last_adj = edited['ADJ'].split(" & ")
    if last_adj not in non_numeral_adjs:
        return edited

    # Locate the last occurrence of the word in the construction.
    words = edited['construction'].split(" ")
    positions = [pos for pos, word in enumerate(words) if word == last_adj]
    target = positions[-1]

    # Retag that position as 'CD' so that the record is parsed as an AANN.
    tags = edited['pattern'].split(" ")
    tags[target] = "CD"

    edited.update({
        'ADJ': " & ".join(leading_adjs),
        'NUMERAL': last_adj,
        'pattern': " ".join(tags),
    })
    return edited

In [30]:
full_aann_data[7]

{'DT': 'a',
 'ADJ': 'few & more',
 'NUMERAL': '',
 'NOUN': 'things',
 'ADV': '',
 'sentence': "Well, maybe he's learned a few more things since then.",
 'sentence_idx': 31340,
 'pattern': 'DT JJ JJR NNS',
 'source': 'open_subtitles',
 'construction': 'a few more things'}

In [29]:
verify_and_edit(full_aann_data[7])

{'DT': 'a',
 'ADJ': 'few',
 'NUMERAL': 'more',
 'NOUN': 'things',
 'ADV': '',
 'sentence': "Well, maybe he's learned a few more things since then.",
 'sentence_idx': 31340,
 'pattern': 'DT JJ CD NNS',
 'source': 'open_subtitles',
 'construction': 'a few more things'}

In [86]:
# Print every extracted construction string containing the substring 'dozen'
# (substring test, so longer words that contain it are printed as well).
for i, aa in enumerate(all_aanns):
    if 'dozen' in aa:
        print(i, aa)

In [74]:
full_aann_data

[{'DT': 'a',
  'ADJ': 'few & other',
  'NUMERAL': '',
  'NOUN': 'places',
  'ADV': '',
  'sentence': 'Tuddy ran the cabstand and the Bella Vista Pizzeria and a few other places for his brother, Paul, who was the boss over everybody in the neighborhood.',
  'sentence_idx': 10589,
  'pattern': 'DT JJ JJ NNS',
  'source': 'open_subtitles',
  'construction': 'a few other places'},
 {'DT': 'Another',
  'ADJ': 'fucking & few',
  'NUMERAL': '',
  'NOUN': 'minutes',
  'ADV': '',
  'sentence': 'Another fucking few minutes, he could be a stool.',
  'sentence_idx': 11042,
  'pattern': 'DT JJ JJ NNS',
  'source': 'open_subtitles',
  'construction': 'Another fucking few minutes'},
 {'DT': 'a',
  'ADJ': "little & fuckin & '",
  'NUMERAL': '',
  'NOUN': 'manners',
  'ADV': '',
  'sentence': "Teach this kid a little fuckin' manners!",
  'sentence_idx': 11650,
  'pattern': 'DT JJ JJ JJ NNS',
  'source': 'open_subtitles',
  'construction': "a little fuckin ' manners"},
 {'DT': 'a',
  'ADJ': 'little & fu

In [85]:
aanns

[(463,
  'RB IN , UH , DT JJ JJ NNS PRP VBD PRP$ JJ NN CC , PRP VBP , VBD PRP$ JJ NNS RB . CC MD , MD RB VB CC VB RB RB .',
  (13, 25)),
 (856, 'PRP VBP DT , UH , NN VBZ , VBZ VBN DT JJ JJ NN NNS', (35, 50)),
 (1697, 'RB RB PRP VBP VBN PRP VBN RP IN DT JJ CD NNS .', (32, 44)),
 (2542, 'DT , DT JJ , DT JJ JJ NNS MD RB , MD RB VB RB .', (13, 25)),
 (3067, 'IN DT JJ JJ NN NNS CC CC NN NNS CC NNS IN DT .', (3, 18)),
 (3383, 'CC , UH , PRP VBD IN DT NN NN DT JJ JJ NNS IN PRP$ NN', (30, 42)),
 (3818,
  'IN WP PRP VBP IN JJ JJ NNS , PRP , DT NNP NN VBZ VBG TO VB TO VB DT RBR JJ NN IN NN NN NNS , UH , IN , IN NN IN DT JJ JJ NNS .',
  (112, 124)),
 (4041,
  'CC PRP VBP RB JJ DT JJ CD NNS IN VBG IN DT NN NN CC VBG RP',
  (17, 29)),
 (4387,
  'RB PRP MD VB IN DT JJ CD NNS , PRP VBP RB VBN DT NN JJR NNS IN RB RB .',
  (16, 28)),
 (4924,
  'UH JJS IN DT NN PRP , PRP VBD VBG RB IN DT NN NN CC VBG TO VB DT JJ JJ NNS',
  (62, 74)),
 (5523,
  'RB DT JJ JJ NNS CC NNS IN DT , PRP RB DT NN IN DT NN NNS VB

In [39]:
# do all regex aanns get captured by the classifier?
babylm_regex = utils.read_csv_dict("../data/baby_aann_data.csv")
babylm_classifier = utils.read_csv_dict("../data/babylm-analysis/detected_aann_sents.csv")
babylm_sents = utils.read_file("/home/km55359/rawdata/babylm_data/babylm_100M/sents/babylm_sents.txt")

clf_sents = [row['sentence'] for row in babylm_classifier]
# Undo the tokenizer's space before an apostrophe so that the construction can be
# searched for as a substring of the raw sentence.
regex_sents = [row['construction'].replace(" '", "'") for row in babylm_regex]

# Indices of the regex constructions that occur inside at least one classifier sentence.
found = []
for i, s in enumerate(regex_sents):
    if any(s in ss for ss in clf_sents):
        found.append(i)
ct = len(found)

In [4]:
# Print the regex constructions whose index is not in `found`, i.e. those that were
# not located inside any classifier sentence.
for i, s in enumerate(regex_sents):
    if i not in found:
        print(s)

a merry one promises
A third and third three twelfths
a full 40 mm


In [5]:
len(babylm_regex)

1038

In [6]:
# Construction slots that aann_meta records; tokens whose category is not listed here
# (i.e. 'CC' and 'TO' below) are dropped from the per-slot output.
cats = ['DT', 'ADJ', 'NUMERAL', 'NOUN', "ADV"]
# Maps each POS tag that AANN_REGEX can match to a construction slot.
# Any other tag (e.g. 'NN' or 'IN') is not a key here, so looking it up raises KeyError.
pos2cat = {
    'DT': 'DT',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'CD': 'NUMERAL',
    'NNS': 'NOUN',
    'NNPS': 'NOUN',
    'RB': 'ADV',
    'CC': 'CC',
    'TO': 'TO'
}


def tokenize(string):
    """Run the module-level ``tokenizer`` on ``string`` and return the token texts as a list."""
    token_texts = []
    for tok in tokenizer(string):
        token_texts.append(tok.text)
    return token_texts


def read_file(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    Args:
        path: path of the file to read.

    Returns:
        list[str]: one entry per line of the file (blank lines become ``""``).

    TODO (carried over from the original): make read all.
    """
    # The context manager closes the handle as soon as reading finishes; the original
    # left the file object open until garbage collection.
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle]


def write_lines(lst, path):
    """Write every entry of ``lst`` to ``path``, one per line, replacing any existing file.

    Args:
        lst: iterable of values; each is formatted with ``str()`` semantics via an f-string.
        path: destination file path.
    """
    # Written as UTF-8 explicitly so the output can be read back by read_file (which
    # decodes UTF-8) regardless of the platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{entry}\n" for entry in lst)


# Strict AANN pattern over space-separated POS-tag strings: DT, one or more adjective
# groups (optional RBs, one of JJ/JJR/JJS, optional CCs), a CD numeral (optionally
# "CD (TO|CC) CD ..."), then one or more NNS/NNPS nouns.
# (The original assigned this identical pattern twice in a row; once is enough.)
AANN_REGEX = r"\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS))+"

# Quantifier-like words; not referenced by any other code visible in this notebook.
quants = ["few", "more", "many", "couple", "dozen"]


def detect_aann(postag):
    """Return the first ``AANN_REGEX`` match in the tag string ``postag``, else ``None``."""
    aann_match = re.search(AANN_REGEX, postag)
    return aann_match

def aann_meta(token_seq, const_pattern):
    """Group the tokens of a construction by slot, based on its POS-tag pattern.

    Args:
        token_seq: tokens of the construction.
        const_pattern: space-separated POS tags, paired position-by-position with
            ``token_seq``.

    Returns:
        dict with keys 'DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV'; each value is the tokens
        of that slot joined with " & " (empty string when the slot has no tokens).
        Tokens whose category (via ``pos2cat``) is not in ``cats`` are left out.
    """
    slots = {name: [] for name in ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV')}

    for word, tag in zip(token_seq, const_pattern.split()):
        slot = pos2cat[tag]
        if slot not in cats:
            continue
        slots[slot].append(word)

    return {slot: " & ".join(words) for slot, words in slots.items()}

In [23]:
# other regex
alt_regex = r"\b[Aa]\s(\w){2,}\s(few|couple|dozen|many|several|ample|more)\s+(?!of|\.)"
def detect_alt_aann(text):
    """Search raw ``text`` for "a <word> <quantifier> " and return the match or ``None``.

    The pattern is: the article "a"/"A", a word of at least two word characters, one of
    few/couple/dozen/many/several/ample/more, then whitespace that is not followed by
    "of" or a period.
    """
    alt_match = re.compile(alt_regex).search(text)
    return alt_match

# Collect (sentence index, character span) for every classifier sentence matched by
# detect_alt_aann. The regex is run once per sentence and the match object reused
# (the original searched every matching sentence twice).
found_sents = []
for idx, s in enumerate(clf_sents):
    detected = detect_alt_aann(s)
    if detected:
        span = detected.span()
        found_sents.append((idx, span))

In [24]:
len(found_sents), found_sents

(988,
 [(6, (41, 52)),
  (37, (15, 27)),
  (58, (17, 30)),
  (59, (38, 51)),
  (61, (8, 21)),
  (62, (84, 97)),
  (66, (271, 284)),
  (92, (25, 38)),
  (93, (64, 77)),
  (99, (0, 12)),
  (108, (80, 93)),
  (115, (27, 40)),
  (117, (79, 92)),
  (120, (133, 146)),
  (123, (97, 110)),
  (127, (14, 26)),
  (130, (133, 146)),
  (133, (17, 30)),
  (141, (10, 23)),
  (148, (471, 484)),
  (149, (78, 91)),
  (158, (119, 132)),
  (163, (25, 38)),
  (166, (81, 94)),
  (177, (12, 24)),
  (189, (13, 24)),
  (194, (18, 30)),
  (197, (9, 22)),
  (206, (27, 40)),
  (207, (81, 94)),
  (217, (161, 174)),
  (224, (194, 207)),
  (248, (137, 150)),
  (252, (17, 30)),
  (290, (206, 219)),
  (296, (479, 492)),
  (297, (79, 92)),
  (303, (14, 27)),
  (310, (121, 134)),
  (318, (49, 62)),
  (320, (87, 100)),
  (322, (0, 13)),
  (329, (118, 130)),
  (331, (50, 63)),
  (333, (6, 19)),
  (345, (55, 68)),
  (348, (41, 54)),
  (349, (62, 75)),
  (350, (17, 30)),
  (352, (0, 13)),
  (355, (27, 40)),
  (356, (48, 61)

In [25]:
# Print each classifier sentence matched by detect_alt_aann, separated by blank lines
# (the span is unpacked but not used here).
for i, span in found_sents:
    print(clf_sents[i])
    print("")

Yeah, well, I know I've not 'ad bath for a good few seasons, but there's no need to 'urt my feelin's.

So resting for a final few moments in stillness, in silence, full wakefulness, in full awareness, outside of time, as if you had nothing to do, no place to go.

“Little Hans had a great many friends, but the most devoted friend of all was big Hugh the Miller.

I know it is very generous of me, and a great many people would think me extremely foolish for parting with it, but I am not like the rest of the world.

“He has a great many good points, but for my own part I have a mother’s feelings, and I can never look at a confirmed bachelor without the tears coming into my eyes.”

She was one of those people who think that, if you say the same thing over and over a great many times, it becomes true in the end.

Her nephew, a merry boy, who was his aunt's darling, begged so long for these spectacles, that, at last, she lent him the treasure, after having informed him, with many exhortations

In [26]:
# Collect (sentence index, character span) for every corpus sentence matched by
# detect_alt_aann. The regex is run once per sentence and the match object reused
# (the original searched every matching sentence twice).
found_sents_all = []
for i, s in enumerate(babylm_sents):
    detected = detect_alt_aann(s)
    if detected:
        span = detected.span()
        found_sents_all.append((i, span))

In [27]:
len(found_sents_all)

6904

In [12]:
babylm_sents[80803][78:91]

'a great many '

In [28]:
# Split the alt-regex hits by whether the classifier also flagged the sentence.
# Membership is tested against a set built once, instead of scanning the clf_sents
# list for each of the hits.
clf_sent_set = set(clf_sents)

in_clf = []
out_clf = []
for s, span in found_sents_all:
    if babylm_sents[s] in clf_sent_set:
        in_clf.append((s, span))
    else:
        out_clf.append((s, span))

In [29]:
len(in_clf), len(out_clf)

(988, 5916)

In [15]:
babylm_sents[404555]

'A very few minutes afterwards they were seated in the train and speeding'

In [31]:
# Show each hit the classifier missed with a little context: 3 characters before the
# match and 10 after it.
for s_idx, (start, end) in out_clf:
    # Clamp the left edge at 0: for a match within the first 3 characters, start - 3 is
    # negative and would be interpreted as an offset from the END of the sentence.
    print(s_idx, babylm_sents[s_idx][max(start - 3, 0):end + 10])

212 And especially in the court of public opinion, which, as I just mentioned, sometimes is a lot more important than what goes on in any courtroom.
249 I struggled through them, I somehow felt I had a few more IQ points than when I started.
294 In a little more detail, we could look at each of the partial products separately.
536 However, [this] starts a bit more complicated variation.
772 Now we got a few more minutes here, so lets do something a little, snicker
4418 Perhaps we should be a bit more careful about whingeing about losing staff to Regional Railways and
6982 is what you're raising a lot more issues here the undertow is unbelievable.
7853 The golden-eye favours a rather more restrained head-bob, but that is all his female needs to be convinced that he is the right partner for her.
8304 So this bower is a much more solid structure and much less easily destroyed than that of the satin.
9967 hell of a Iot more dangerous.
12846 of done Quickspend, you'd have been looking at fi

In [22]:
babylm_sents[917746]

'The merchant had a great many very beautiful horses, which lived in splendid stables and were taken the greatest possible care of.'

In [53]:
pos[256]

'UH . PRP VBP VBN PRP VBZ DT JJ JJ NN TO VB DT NN NN JJ RB IN WP MD VB VBZ PRP$ NN MD VB DT JJ NNS CC RB , UH , UH , UH , RB VB TO VB PRP'

In [52]:
sents[256]

"Um. I've found it's the only reliable way to keep a check book balanced actually because what will happen is my wife will write a few checks and then, well, uh, uh, not bother to total it"

In [30]:
found_sents

[68490, 93613, 134806]

In [14]:
# Extract strict (AANN_REGEX) constructions from every *.train POS-tag file.
# lower-case collections: constructions whose first token is a/an/another;
# UPPER-CASE collections: every match, regardless of the first token.
all_aanns = []
ALL_AANNS = []

patterns = set()
PATTERNS = set()

full_aann_data = []
FULL_AANN_DATA = []

for file in glob.glob("/home/km55359/rawdata/babylm_data/postags_100M/*.train"):
    # Corpus name = file name without directory and ".train" suffix. The dot is escaped
    # so that only a literal ".train" (not any character followed by "train") splits.
    corpus = re.split(r"(/|\.train)", file)[-3]
    pos = read_file(file)
    sents = read_file(f"/home/km55359/rawdata/babylm_data/babylm_100M/{corpus}.train")

    aanns = []
    for i, seq in enumerate(pos):
        # if re.search(r'\bDT JJ CD (NNS|NNPS)', seq):
        searched = detect_aann(seq)
        if searched:
            aanns.append((i, seq, searched.span()))

    if len(aanns) == 0:
        print(f"No (permissive) AANNs found in {corpus}")
        # Move on to the next corpus. The original used `break`, which silently
        # skipped every remaining corpus file as soon as one had no matches.
        continue

    construction_forms = []
    construction_ids = []

    CONSTRUCTION_FORMS = []
    CONSTRUCTION_IDS = []

    for entry in aanns:
        idx, pos_seq, span = entry
        construction_pattern = pos_seq[span[0] : span[1]]
        # Convert the character-level match into positions in the tag list;
        # the result is used as slice bounds on the sentence tokens below.
        construction_pattern_span = find_pattern(
            construction_pattern.split(), pos_seq.split()
        )

        tokens = tokenize(sents[idx])
        if tokens == []:
            continue

        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        if not extracted_token_seq:
            # Tags and tokens are misaligned for this sentence (empty slice); skip it
            # instead of failing with IndexError on extracted_token_seq[0].
            continue

        if extracted_token_seq[0].lower().startswith("-a"):
            extracted_token_seq[0] = (
                extracted_token_seq[0].replace("-a", "a").replace("-A", "A")
            )
        if extracted_token_seq[0].lower() in ["a", "an", "-a", "another"]:
            construction_form = " ".join(extracted_token_seq)

            construction_forms.append(construction_form)
            construction_ids.append(idx)

            all_aanns.append(construction_form)

            patterns.add(construction_pattern)

            construction_elements = aann_meta(
                extracted_token_seq, construction_pattern
            )
            construction_elements["sentence"] = sents[idx]
            construction_elements["sentence_idx"] = idx
            construction_elements["pattern"] = construction_pattern
            construction_elements["source"] = corpus
            construction_elements["construction"] = construction_form

            full_aann_data.append(construction_elements)

        # for all cases, regardless of the first token
        CONSTRUCTION_FORM = " ".join(extracted_token_seq)

        CONSTRUCTION_FORMS.append(CONSTRUCTION_FORM)
        CONSTRUCTION_IDS.append(idx)

        ALL_AANNS.append(CONSTRUCTION_FORM)

        PATTERNS.add(construction_pattern)

        CONSTRUCTION_ELEMENTS = aann_meta(extracted_token_seq, construction_pattern)
        CONSTRUCTION_ELEMENTS["sentence"] = sents[idx]
        CONSTRUCTION_ELEMENTS["sentence_idx"] = idx
        CONSTRUCTION_ELEMENTS["pattern"] = construction_pattern
        CONSTRUCTION_ELEMENTS["source"] = corpus
        CONSTRUCTION_ELEMENTS["construction"] = CONSTRUCTION_FORM

        FULL_AANN_DATA.append(CONSTRUCTION_ELEMENTS)

    if len(CONSTRUCTION_FORMS) == 0:
        print(f"No AANNs found in {corpus}")
        # As above: skip only this corpus rather than aborting the whole extraction.
        continue

#     write_lines(
#         construction_ids,
#         f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_ids_{corpus}_train.txt",
#     )

#     write_lines(
#         construction_forms,
#         f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_forms_{corpus}_train.txt",
#     )

#     write_lines(
#         CONSTRUCTION_IDS,
#         f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_all_det_ids_{corpus}_train.txt",
#     )

#     write_lines(
#         CONSTRUCTION_FORMS,
#         f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_all_det_forms_{corpus}_train.txt",
#     )

#     cts = len(construction_forms)
#     frac = cts / len(sents)
#     print(f"{corpus} counts: {cts} pct: {frac}")

#     print("\n\nWith all determiners:\n")

#     cts = len(CONSTRUCTION_FORMS)
#     frac = cts / len(sents)
#     print(f"{corpus} counts: {cts} pct: {frac}")

# write_lines(
#     all_aanns, f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_forms_all_train.txt"
# )

# write_lines(
#     ALL_AANNS, f"/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_all_det_forms_all_train.txt"
# )

# unique_aanns = list(set(all_aanns))
# write_lines(
#     unique_aanns,
#     f"/home/km55359/rawdata/babylm_data/babylm_100M/aann_forms_all-unique_train.txt",
# )

# unique_aanns = list(set(ALL_AANNS))
# write_lines(
#     unique_aanns,
#     f"/home/km55359/rawdata/babylm_data/babylm_100M/aann_all_det_forms_all-unique_train.txt",
# )

# cols = [
#     "source",
#     "sentence",
#     "sentence_idx",
#     "construction",
#     "pattern",
#     "DT",
#     "ADJ",
#     "NUMERAL",
#     "NOUN",
#     "ADV",
# ]
# df = pd.DataFrame(full_aann_data)[cols]

# df.reset_index()
# df.to_csv("/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_data.csv", index=False)
# df.to_csv("../data/baby_aann_data.csv", index=False)


# df = pd.DataFrame(FULL_AANN_DATA)[cols]

# df.reset_index()
# df.to_csv("/home/km55359/rawdata/babylm_data/babylm_100M/aanns/aann_all_det_data.csv", index=False)
# df.to_csv("../data/babylm_aann_all_det_data.csv", index=False)

In [6]:
len(full_aann_data)

1038

In [7]:
len(FULL_AANN_DATA)

10461