In [11]:
import glob
import re

import pandas as pd
# import tqdm

from collections import defaultdict
from minicons.utils import find_pattern
from spacy.lang.en import English

# Instantiate spaCy's English language class and keep a handle on its tokenizer;
# `tokenize` below calls this module-level `tokenizer` on each sentence.
nlp = English()
tokenizer = nlp.tokenizer

In [2]:
# Slots recorded for each construction; `aann_meta` collects tokens only for
# categories listed here.
cats = ['DT', 'ADJ', 'NUMERAL', 'NOUN', "ADV"]
# Maps every POS tag that can appear in an AANN_REGEX match to a slot name.
# 'CC' and 'TO' map to themselves and are absent from `cats`, so `aann_meta`
# can look them up without a KeyError but does not record their tokens.
# (Tags look like Penn Treebank labels -- TODO confirm against the tagger used.)
pos2cat = {
    'DT': 'DT',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'CD': 'NUMERAL',
    'NNS': 'NOUN',
    'NNPS': 'NOUN',
    'RB': 'ADV',
    'CC': 'CC',
    'TO': 'TO'
}


def tokenize(string):
    """Run the module-level ``tokenizer`` on ``string`` and return the token texts.

    Returns a list of ``str`` (one per token, in order); an input that yields
    no tokens gives an empty list.
    """
    token_texts = []
    for token in tokenizer(string):
        token_texts.append(token.text)
    return token_texts


def read_file(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one ``str`` per line of the file, in file order. Blank
        lines are kept (as empty strings) so list indices stay aligned with
        line numbers.

    TODO (carried over from the original docstring): "make read all".
    """
    # Context manager so the handle is closed deterministically; the original
    # left the file object open until garbage collection.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


def write_lines(lst, path):
    """Write each entry of ``lst`` to ``path`` on its own line.

    Args:
        lst: Iterable of entries; each is formatted with ``str()`` semantics
            via an f-string, so non-string entries (e.g. integer ids) are fine.
        path: Destination file; it is created or truncated.

    The file is written as UTF-8 so that it round-trips through ``read_file``
    (which reads UTF-8) regardless of the platform's default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        for entry in lst:
            f.write(f"{entry}\n")


# Pattern over a space-separated POS-tag string. In order it requires:
#   DT
#   one or more adjective groups: optional RB tags, one JJ/JJR/JJS, optional CC tags
#   a numeral: CD, optionally extended by "TO CD" / "CC CD"
#   one or more NNS/NNPS tags
AANN_REGEX = r"\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS))+"


def detect_aann(postag):
    """Return the first ``AANN_REGEX`` match in ``postag``, or ``None`` if there is none.

    ``postag`` is a string of space-separated POS tags; the returned match
    object's ``span()`` is in character offsets of that string.
    """
    matcher = re.compile(AANN_REGEX)
    return matcher.search(postag)

def aann_meta(token_seq, const_pattern):
    """Group the tokens of one construction by slot.

    Args:
        token_seq: Tokens of the construction, in order.
        const_pattern: Space-separated POS tags, paired positionally with
            ``token_seq`` (pairing stops at the shorter of the two).

    Returns:
        A dict with keys 'DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV'. Each value is
        the tokens for that slot joined with " & ", or '' when the slot is
        empty. Tags whose category is not in ``cats`` contribute nothing.

    Raises:
        KeyError: if a tag in ``const_pattern`` is missing from ``pos2cat``.
    """
    slots = {name: [] for name in ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV')}

    tags = const_pattern.split()
    for word, tag in zip(token_seq, tags):
        slot = pos2cat[tag]
        if slot not in cats:
            continue
        slots[slot].append(word)

    joined = {}
    for name, words in slots.items():
        joined[name] = " & ".join(words)
    return joined

In [63]:
# Extract AANN-like constructions from every POS-tagged corpus file and write
# per-corpus and pooled outputs.
#
# Two parallel result sets are built:
#   * lower-case names (all_aanns, patterns, full_aann_data, construction_*):
#     only constructions whose first token is a / an / another;
#   * UPPER-CASE names (ALL_AANNS, PATTERNS, FULL_AANN_DATA, CONSTRUCTION_*):
#     every regex match, regardless of determiner.

# Input/output locations. Line i of postags_100M/<corpus>.train is assumed to
# hold the POS tags for line i of babylm_100M/<corpus>.train -- TODO confirm.
POSTAG_GLOB = "/home/km55359/rawdata/babylm_data/postags_100M/*.train"
BABYLM_DIR = "/home/km55359/rawdata/babylm_data/babylm_100M"
AANN_DIR = f"{BABYLM_DIR}/aanns"

# Lower-cased first tokens that put a construction in the a/an subset.
A_AN_DETERMINERS = ("a", "an", "another")

all_aanns = []
ALL_AANNS = []

patterns = set()
PATTERNS = set()

full_aann_data = []
FULL_AANN_DATA = []


def _aann_record(token_seq, pattern, sentence, sentence_idx, source, form):
    """Build one output row: the slot strings from `aann_meta` plus provenance."""
    record = aann_meta(token_seq, pattern)
    record["sentence"] = sentence
    record["sentence_idx"] = sentence_idx
    record["pattern"] = pattern
    record["source"] = source
    record["construction"] = form
    return record


for file in glob.glob(POSTAG_GLOB):
    # ".../<corpus>.train" -> "<corpus>" (dot escaped so only the real suffix is removed).
    corpus = re.sub(r"\.train$", "", file.rsplit("/", 1)[-1])
    pos = read_file(file)
    sents = read_file(f"{BABYLM_DIR}/{corpus}.train")

    # (line index, POS string, character span of the match) for every matching line.
    aanns = []
    for i, seq in enumerate(pos):
        searched = detect_aann(seq)
        if searched:
            aanns.append((i, seq, searched.span()))

    if len(aanns) == 0:
        print(f"No (permissive) AANNs found in {corpus}")
        # Skip this corpus only; `break` would silently drop all remaining files.
        continue

    construction_forms = []
    construction_ids = []

    CONSTRUCTION_FORMS = []
    CONSTRUCTION_IDS = []

    for idx, pos_seq, span in aanns:
        construction_pattern = pos_seq[span[0] : span[1]]
        # Convert the character span into a token-index span over the tag sequence.
        construction_pattern_span = find_pattern(
            construction_pattern.split(), pos_seq.split()
        )

        tokens = tokenize(sents[idx])
        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        if not extracted_token_seq:
            # Empty sentence, or the tag indices fall outside the spaCy tokens;
            # there is nothing to record (and indexing [0] would raise).
            continue

        # Strip a leading hyphen glued to the article, e.g. "-a" -> "a", "-A" -> "A".
        if extracted_token_seq[0].lower().startswith("-a"):
            extracted_token_seq[0] = (
                extracted_token_seq[0].replace("-a", "a").replace("-A", "A")
            )

        construction_form = " ".join(extracted_token_seq)

        if extracted_token_seq[0].lower() in A_AN_DETERMINERS:
            construction_forms.append(construction_form)
            construction_ids.append(idx)
            all_aanns.append(construction_form)
            patterns.add(construction_pattern)
            full_aann_data.append(
                _aann_record(
                    extracted_token_seq,
                    construction_pattern,
                    sents[idx],
                    idx,
                    corpus,
                    construction_form,
                )
            )

        # For all cases, regardless of the first token. The form appended here
        # is the one for *this* match; the original appended a variable that
        # was only set in the a/an branch, so it wrote stale or undefined values.
        CONSTRUCTION_FORMS.append(construction_form)
        CONSTRUCTION_IDS.append(idx)
        ALL_AANNS.append(construction_form)
        PATTERNS.add(construction_pattern)
        FULL_AANN_DATA.append(
            _aann_record(
                extracted_token_seq,
                construction_pattern,
                sents[idx],
                idx,
                corpus,
                construction_form,
            )
        )

    if len(CONSTRUCTION_FORMS) == 0:
        print(f"No AANNs found in {corpus}")
        continue

    write_lines(construction_ids, f"{AANN_DIR}/aann_ids_{corpus}_train.txt")
    write_lines(construction_forms, f"{AANN_DIR}/aann_forms_{corpus}_train.txt")
    write_lines(CONSTRUCTION_IDS, f"{AANN_DIR}/aann_all_det_ids_{corpus}_train.txt")
    write_lines(CONSTRUCTION_FORMS, f"{AANN_DIR}/aann_all_det_forms_{corpus}_train.txt")

    # NOTE: the value labelled "pct" is a fraction of sentences, not a percentage.
    cts = len(construction_forms)
    frac = cts / len(sents)
    print(f"{corpus} counts: {cts} pct: {frac}")

    print("\n\nWith all determiners:\n")

    cts = len(CONSTRUCTION_FORMS)
    frac = cts / len(sents)
    print(f"{corpus} counts: {cts} pct: {frac}")

write_lines(all_aanns, f"{AANN_DIR}/aann_forms_all_train.txt")
write_lines(ALL_AANNS, f"{AANN_DIR}/aann_all_det_forms_all_train.txt")

# NOTE(review): the two "unique" files are written to BABYLM_DIR, not AANN_DIR
# like every other output. Kept as in the original -- confirm this is intended.
unique_aanns = list(set(all_aanns))
write_lines(unique_aanns, f"{BABYLM_DIR}/aann_forms_all-unique_train.txt")

unique_aanns = list(set(ALL_AANNS))
write_lines(unique_aanns, f"{BABYLM_DIR}/aann_all_det_forms_all-unique_train.txt")

cols = [
    "source",
    "sentence",
    "sentence_idx",
    "construction",
    "pattern",
    "DT",
    "ADJ",
    "NUMERAL",
    "NOUN",
    "ADV",
]

# `columns=cols` selects and orders the record keys and also yields a
# header-only frame when no records were collected (indexing with [cols]
# would raise KeyError on an empty frame).
df = pd.DataFrame(full_aann_data, columns=cols)
df.to_csv(f"{AANN_DIR}/aann_data.csv", index=False)
df.to_csv("../data/baby_aann_data.csv", index=False)

df = pd.DataFrame(FULL_AANN_DATA, columns=cols)
df.to_csv(f"{AANN_DIR}/aann_all_det_data.csv", index=False)
df.to_csv("../data/babylm_aann_all_det_data.csv", index=False)

open_subtitles counts: 287 pct: 5.275735294117647e-05


With all determiners:

open_subtitles counts: 2797 pct: 0.0005141544117647058
qed counts: 112 pct: 0.00011666666666666667


With all determiners:

qed counts: 1623 pct: 0.001690625
bnc_spoken counts: 93 pct: 0.00010954205531520432


With all determiners:

bnc_spoken counts: 1072 pct: 0.0012626783150311723
wikipedia counts: 237 pct: 0.0009294117647058824


With all determiners:

wikipedia counts: 1994 pct: 0.007819607843137255
gutenberg counts: 83 pct: 7.345132743362832e-05


With all determiners:

gutenberg counts: 564 pct: 0.0004991150442477876
aochildes counts: 4 pct: 5.2356774770317375e-06


With all determiners:

aochildes counts: 30 pct: 3.926758107773803e-05
simple_wikipedia counts: 156 pct: 0.00022722174803438622


With all determiners:

simple_wikipedia counts: 1959 pct: 0.0028533807974318115
children_stories counts: 21 pct: 0.0002700617283950617


With all determiners:

children_stories counts: 110 pct: 0.0014146090534979

In [64]:
# Peek at the first ten records of the a/an/another-led subset.
full_aann_data[:10]

[{'DT': 'a',
  'ADJ': 'few',
  'NUMERAL': 'thousand',
  'NOUN': 'dollars',
  'ADV': '',
  'sentence': "We're talking a few thousand dollars!",
  'sentence_idx': 16438,
  'pattern': 'DT JJ CD NNS',
  'source': 'open_subtitles',
  'construction': 'a few thousand dollars'},
 {'DT': 'a',
  'ADJ': 'measly',
  'NUMERAL': 'thousand',
  'NOUN': 'pounds',
  'ADV': '',
  'sentence': "Mister, you wouldn't want to own a dog that couldn't pull a measly thousand pounds.",
  'sentence_idx': 50432,
  'pattern': 'DT JJ CD NNS',
  'source': 'open_subtitles',
  'construction': 'a measly thousand pounds'},
 {'DT': 'a',
  'ADJ': 'fine',
  'NUMERAL': 'eighteen',
  'NOUN': 'months',
  'ADV': '',
  'sentence': 'I see .. all in all, a fine eighteen months.',
  'sentence_idx': 53752,
  'pattern': 'DT JJ CD NNS',
  'source': 'open_subtitles',
  'construction': 'a fine eighteen months'},
 {'DT': 'a',
  'ADJ': 'massive',
  'NUMERAL': '2.5',
  'NOUN': 'kilograms',
  'ADV': '',
  'sentence': "And to blow up the bath

In [57]:
# Print every record in `full_aann_data` whose construction contains "-a"
# (case-insensitive) anywhere in the string.
for record in full_aann_data:
    lowered_form = record['construction'].lower()
    if '-a' not in lowered_form:
        continue
    print(record)

In [58]:
# (number of a/an/another-led constructions, number of constructions with any determiner)
len(all_aanns), len(ALL_AANNS)

(1031, 10461)

In [29]:
# Group constructions by their first token (case-sensitive).
# Iterate ALL_AANNS, not all_aanns: all_aanns only holds constructions led by
# a/an/another, so keys such as 'the' or 'this' that the following cells
# inspect would never exist on a fresh Restart & Run All.
articles = defaultdict(list)
for const in ALL_AANNS:
    article = const.split(" ")[0]
    articles[article].append(const)
# Freeze into a plain dict so a mistyped key raises KeyError instead of
# silently creating an empty entry.
articles = dict(articles)

In [30]:
# Distinct (case-sensitive) first tokens found while grouping.
articles.keys()

dict_keys(['the', 'a', 'THE', 'The', 'these', 'those', 'an', 'An', 'each', 'any', 'that', 'A', 'another', 'That', 'this', 'THESE', 'These', 'Another', 'all', '-A', 'every', 'Those', 'This', 'THOSE', 'no', 'forthe', 'thesearethe', 'tbe', 'some', 'Every', 'All', "'", 'Each'])

In [31]:
# Count constructions per first token, then rank tokens by descending count
# (ties keep their insertion order).
article_lens = {}
for first_token, constructions in articles.items():
    article_lens[first_token] = len(constructions)
sorted_articles = sorted(article_lens.items(), key=lambda pair: -pair[1])

In [32]:
# (first token, count) pairs, most frequent first.
sorted_articles

[('the', 8287),
 ('a', 785),
 ('The', 768),
 ('an', 185),
 ('these', 112),
 ('those', 64),
 ('THE', 45),
 ('A', 39),
 ('this', 31),
 ('These', 26),
 ('each', 23),
 ('An', 21),
 ('that', 18),
 ('any', 11),
 ('another', 6),
 ('every', 6),
 ('all', 5),
 ('Those', 5),
 ('This', 3),
 ('no', 3),
 ('forthe', 3),
 ('some', 3),
 ('THESE', 2),
 ('That', 1),
 ('Another', 1),
 ('-A', 1),
 ('THOSE', 1),
 ('thesearethe', 1),
 ('tbe', 1),
 ('Every', 1),
 ('All', 1),
 ("'", 1),
 ('Each', 1)]

In [28]:
# Constructions whose first token is exactly 'this'.
# NOTE(review): this key exists only when `articles` is built from a list that
# includes non-a/an determiners (e.g. ALL_AANNS); built from `all_aanns`, which
# is filtered to a/an/another, this lookup raises KeyError.
articles['this']

['this vulnerable 24 hours',
 'this past six months',
 'this first four courses',
 'this negative 26.03 m',
 'this first four bikes',
 'this last four days',
 'this last ten minutes',
 'this 1st 5 years',
 'this past 24 hours',
 'this next one--0 times',
 'this final two weeks',
 'this first two instructions',
 'this entire 42,000 miles',
 'this past six months',
 'this last fifteen years',
 'this magic three hours',
 'this first two years',
 'this last three or four years',
 'this last four weeks',
 'this next six months',
 'this last fifteen years',
 'this extra few hundred pounds',
 'this last two years',
 'this last nine months',
 'this last six months',
 'this extra twenty hectares',
 'this last three quarters',
 'this past ten years',
 'this last six months',
 'this few hundred dollars',
 'this last three years',
 'this bad seventeen years',
 'this last six months',
 'this splendid six feet']