In [2]:
import glob
import re
import spacy
import unicodedata
import csv

import pandas as pd

from collections import defaultdict
from minicons.utils import find_pattern
from spacy.lang.en import English
from tqdm import tqdm

import utils
import config
import minicons.utils as mu

from nltk.tokenize import sent_tokenize

In [3]:
# Blank spaCy English pipeline; in the cells visible here only its tokenizer is used
# (via `tokenize` below).
nlp = English()
tokenizer = nlp.tokenizer



In [4]:
# regexes
# All three are searched against a space-separated POS-tag string (one tag per token).
# ADVANCED_REGEX: DT, one or more (RB* ADJ CC*) groups, a "numeral" slot that may also be
# filled by an adjective (CD|JJ|JJR|JJS|CD CD), then plural-noun material.
ADVANCED_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# ULTRA_REGEX: same as ADVANCED_REGEX, except the numeral slot additionally accepts NN.
ULTRA_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|NN|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# AANN_REGEX: strictest form; the numeral slot must be CD and the noun must be NNS/NNPS.
AANN_REGEX = r"\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS))+"

# Categories that `aann_meta` collects tokens for.
cats = ['DT', 'ADJ', 'NUMERAL', 'NOUN', "ADV"]
# Words that `verify_and_edit` may move into the NUMERAL slot even though they are tagged
# as adjectives or nouns; also used later to split entries into "num" / "non_num".
non_numerals = ['few', 'dozen', 'couple', 'several', 'many', 'more']

# POS tag -> category. Note that 'IN' is filed under 'NOUN' (it occurs inside the
# "(NN|NNS) IN NNS" noun alternative of the regexes above). 'CC' and 'TO' map to
# categories that are not in `cats`, so `aann_meta` drops those tokens.
pos2cat = {
    'DT': 'DT',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'CD': 'NUMERAL',
    'NNS': 'NOUN',
    'NNPS': 'NOUN',
    'NN': 'NOUN',
    'IN': 'NOUN',
    'RB': 'ADV',
    'CC': 'CC',
    'TO': 'TO'
}

def read_file(path):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace.

    Args:
        path: path of the file to read.

    Returns:
        A list with one string per line of the file (empty lines become ``""``).
    """
    # The context manager closes the handle; the previous bare open() leaked it.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]

def detect_aann_advanced(sequence):
    """Return the first ``ADVANCED_REGEX`` match in the POS string ``sequence``, or ``None``."""
    matcher = re.compile(ADVANCED_REGEX)
    return matcher.search(sequence)

def detect_aann_basic(sequence):
    """Return the first ``AANN_REGEX`` match in the POS string ``sequence``, or ``None``."""
    matcher = re.compile(AANN_REGEX)
    return matcher.search(sequence)

def detect_aann_ultra(sequence):
    """Return the first ``ULTRA_REGEX`` match in the POS string ``sequence``, or ``None``."""
    matcher = re.compile(ULTRA_REGEX)
    return matcher.search(sequence)

def tokenize(string):
    """Split ``string`` with the module-level spaCy ``tokenizer`` and return the token texts."""
    texts = []
    for token in tokenizer(string):
        texts.append(token.text)
    return texts

def aann_meta(token_seq, const_pattern):
    """Group the tokens of a construction by category.

    Args:
        token_seq: tokens of the construction.
        const_pattern: space-separated POS tags, paired positionally with ``token_seq``.

    Returns:
        Dict with keys 'DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV'; each value is the tokens of
        that category joined with " & " (empty string when there are none).

    Raises:
        KeyError: if a tag is missing from ``pos2cat``.
    """
    slots = {name: [] for name in ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV')}
    tags = const_pattern.split()
    for word, tag in zip(token_seq, tags):
        slot = pos2cat[tag]
        if slot not in cats:
            # e.g. CC / TO: part of the pattern but not recorded
            continue
        slots[slot].append(word)

    return {slot: " & ".join(words) for slot, words in slots.items()}

def is_ultra(seq):
    """Tell whether ``seq`` is matched by ``ULTRA_REGEX`` but not by ``ADVANCED_REGEX``.

    Args:
        seq: space-separated POS-tag string.

    Returns:
        bool. (The previous version returned ``None`` rather than ``False`` when the ultra
        regex did not match; both are falsy.)
    """
    # The basic-regex search done here before was never used, so it is dropped; the
    # advanced search only runs when the ultra regex matched.
    if not detect_aann_ultra(seq):
        return False
    return not detect_aann_advanced(seq)

def store_aann_spans(pos_seq):
    """Find, for every POS string, the span of the first matching AANN regex.

    The detectors are tried in priority order basic -> advanced -> ultra, and the first
    one that matches supplies the span.

    Args:
        pos_seq: iterable of space-separated POS-tag strings.

    Returns:
        List of ``(index, pos_string, (start, end))`` tuples, where ``(start, end)`` is the
        character span of the match inside ``pos_string``. Strings with no match are skipped.
    """
    detectors = (detect_aann_basic, detect_aann_advanced, detect_aann_ultra)
    aanns = []
    for i, seq in enumerate(pos_seq):
        # Stop at the first hit instead of always running all three regexes.
        for detect in detectors:
            searched = detect(seq)
            if searched:
                aanns.append((i, seq, searched.span()))
                break
    return aanns

def store_aanns(sents, pos_seq, corpus=None):
    """Extract AANN constructions and their metadata from tagged sentences.

    Args:
        sents: list of sentence strings.
        pos_seq: list of space-separated POS-tag strings, indexed in parallel with ``sents``.
        corpus: label stored under the "source" key of every returned entry.

    Returns:
        List of dicts, one per kept construction, with the keys produced by ``aann_meta``
        ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV') plus 'sentence', 'sentence_idx',
        'pattern', 'source' and 'construction'.
    """
    full_aann_data = []

    # Loop names are distinct from the `pos_seq` parameter, which the previous version
    # rebound to a single string inside the loop.
    for idx, sent_pos, (start, end) in store_aann_spans(pos_seq):
        construction_pattern = sent_pos[start:end]
        # Translate the matched tag sub-sequence into positions in the tag list.
        construction_pattern_span = find_pattern(
            construction_pattern.split(), sent_pos.split()
        )

        tokens = tokenize(sents[idx])
        if not tokens:
            continue

        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        if not extracted_token_seq:
            # Tokens and tags do not line up for this sentence; nothing to extract
            # (this used to raise IndexError on the [0] access below).
            continue

        if extracted_token_seq[0].lower().startswith("-a"):
            extracted_token_seq[0] = (
                extracted_token_seq[0].replace("-a", "a").replace("-A", "A")
            )
        construction_form = " ".join(extracted_token_seq)

        construction_elements = aann_meta(extracted_token_seq, construction_pattern)
        construction_elements["sentence"] = sents[idx]
        construction_elements["sentence_idx"] = idx
        construction_elements["pattern"] = construction_pattern
        construction_elements["source"] = corpus
        construction_elements["construction"] = construction_form

        # Patterns only the most permissive regex accepts are kept just when their first
        # NOUN token is one of the quantifier words in `non_numerals`.
        if is_ultra(construction_pattern):
            if construction_elements['NOUN'].split(" & ")[0] in non_numerals:
                full_aann_data.append(construction_elements)
        else:
            full_aann_data.append(construction_elements)

    return full_aann_data

def indefinite_article_aanns(aann_list):
    """Keep the entries whose construction starts with an indefinite article.

    The first whitespace-separated word of ``entry['construction']`` is lower-cased and
    must be one of "a", "an", "-a" or "another". Input order is preserved.
    """
    articles = ["a", "an", "-a", "another"]
    return [
        candidate
        for candidate in aann_list
        if candidate['construction'].split()[0].lower() in articles
    ]

# 5179 + 37

def verify_and_edit(entry):
    """Fill an empty NUMERAL slot from a quantifier word tagged as noun or adjective.

    Works on a shallow copy of ``entry``; the input dict is not modified. Nothing is
    changed unless ``entry['NUMERAL']`` is the empty string.

    Args:
        entry: dict with at least the keys 'NUMERAL', 'ADJ', 'NOUN', 'construction'
            and 'pattern' (the " & "-joined format produced by ``aann_meta``).

    Returns:
        The (possibly edited) copy.
    """
    new_entry = entry.copy()
    if new_entry['NUMERAL'] == '':
        # Both lists are taken from the entry before any edit below is applied.
        adjs = new_entry['ADJ'].split(" & ")
        final_adj = adjs[-1]
        nouns = new_entry['NOUN'].split(" & ")
        first_noun = nouns[0]
        # first check noun:
        # (membership in non_numerals is case-sensitive; at least one other noun must remain)
        if first_noun in non_numerals and len(nouns[1:]) >= 1:
            decomposed_construction = new_entry['construction'].split(" ")
            # position of the LAST occurrence of the word in the construction;
            # assumes construction words and pattern tags align one-to-one - TODO confirm
            idx_in_decomp = [i for i, w in enumerate(decomposed_construction) if w == first_noun][-1]
            new_entry['NOUN'] = " & ".join(nouns[1:])
            new_entry['NUMERAL'] = first_noun

            # replace value in position of numeral-adj in pattern to 'CD' so that the aann is parsed.
            new_pattern = new_entry['pattern'].split(" ")
            new_pattern[idx_in_decomp] = "CD"
            new_entry['pattern'] = " ".join(new_pattern)

        # NOTE(review): this is a separate `if`, not `elif`. When both conditions hold,
        # NUMERAL is overwritten with final_adj while the first noun has already been
        # removed from NOUN - confirm that this is intended.
        if final_adj in non_numerals and len(adjs[:-1]) >= 1: # gotcha
            decomposed_construction = new_entry['construction'].split(" ")
            idx_in_decomp = [i for i, w in enumerate(decomposed_construction) if w == final_adj][-1]
            new_entry['ADJ'] = " & ".join(adjs[:-1])
            new_entry['NUMERAL'] = final_adj

            # replace value in position of numeral-adj in pattern to 'CD' so that the aann is parsed.
            new_pattern = new_entry['pattern'].split(" ")
            new_pattern[idx_in_decomp] = "CD"
            new_entry['pattern'] = " ".join(new_pattern)
    return new_entry

def prune_sentence(entry):
    """Narrow ``entry['sentence']`` to the single sentence that contains the construction.

    The text is split with ``sent_tokenize``. If that yields more than one sentence, the
    first sentence containing ``entry['construction']`` replaces the 'sentence' value of
    a shallow copy of ``entry``. When the construction is not a substring of the raw
    text, each sentence is compared after re-joining its tokens with single spaces.

    Args:
        entry: dict with 'sentence' and 'construction' keys.

    Returns:
        A shallow copy of ``entry``, with 'sentence' pruned when applicable.

    Raises:
        ValueError: the text has several sentences but none of them contains the
            construction. The previous version failed here with an accidental
            UnboundLocalError on ``final_sentence``.
    """
    new_entry = entry.copy()
    sentences = sent_tokenize(new_entry['sentence'])
    construction = new_entry['construction']

    # A single sentence (or none) needs no pruning.
    if len(sentences) <= 1:
        return new_entry

    # Compare against the raw sentence when the construction occurs verbatim in the
    # text, otherwise against the space-joined tokens.
    match_raw = construction in new_entry['sentence']
    for s in sentences:
        haystack = s if match_raw else ' '.join(tokenize(s))
        if construction in haystack:
            new_entry['sentence'] = s
            return new_entry

    raise ValueError(
        f"construction {construction!r} is not contained in any single sentence"
    )

def find_sentence(target, source):
    """Return the indices of the strings in ``source`` whose NFKD form equals ``target``.

    Only the elements of ``source`` are normalized; ``target`` is compared as given.
    """
    return [
        position
        for position, candidate in enumerate(source)
        if unicodedata.normalize("NFKD", candidate) == target
    ]

In [5]:
# load sents and postags
# `sents[i]` and `postags[i]` are used as parallel lists throughout the notebook.
# Note: this cell calls utils.read_file, not the read_file defined in this notebook.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
sent_dir = "/home/km55359/rawdata/babylm_data/babylm_100M/sents/"
# sent_dir = "/Users/kanishka/rawdata/babylm-sents-and-postags/"
sents = utils.read_file(f"{sent_dir}/babylm_sents.txt")
postags = utils.read_file(f"{sent_dir}/postags.txt")

In [6]:
# find lines that contain the aanns we missed:

# Surface strings searched for verbatim (substring match) in the sentences.
# NOTE(review): 'an additional twenty five million pounds' is listed twice, and the
# following '... pounds is' entry contains it as well.
MISSED = ['a record 9 times',
 'a record 21 months',
 'a record eight times',
 'an extra 200 sit-ups',
 'a full  ninety minutes',
 'a good like six months',
 'a great two-and-a-half dates',
 'a club record 26 league games',
 'an estimated 100,000 climbers',
 'an extra hundred and... two pounds',
 'an amazing a hundred and thirty points',
 'an estimated 438,000 species of plants',
 'an additional twenty five million pounds',
 'an additional twenty five million pounds',
 'a further sixty-eight policemen, which has',
 'an additional twenty five million pounds is',
 'an estimated 90,750 hectares (224,000 acres',
 'a combined 73 goals for Celta, Albacete, Alavés',
 'a general 1-2" of snow. Some places saw as much as',
 'a career-high 266 yards and two scores as Northern Illinois',
 'a busy, busy few weeks for you, has']

print(len(MISSED))

missed_sents = []
missed_sents_idx = []

# A sentence is appended once per MISSED string it contains, so the same sentence (and
# index) can appear several times in the two lists.
for i, s in enumerate(sents):
    for m in MISSED:
        if m in s:
            missed_sents.append(s)
            missed_sents_idx.append(i)


# POS-tag strings of the collected sentences, in the same order (duplicates included).
missed_postags = [postags[i] for i in missed_sents_idx]

21


In [7]:
len(missed_postags), len(missed_sents)

(25, 25)

In [8]:
missed_postag_patterns = [
    "DT NN NN CD NN NNS",
    "DT NN HYPH JJ CD NNS",
    "DT NN CD NNS",
    "DT JJ CD CD CD NNS",
    "DT VBN CD NNS",
    "DT JJ CD HYPH CD NNS",
    "DT JJ UH CD NNS",
    "DT JJ CD CC HYPH CD NNS",
    "DT JJ , JJ JJ NNS",
    "DT RB RB CD NNS"
]


exceptions = ["DT JJ DT CD CC CD NNS", "DT JJ CD HYPH CC HYPH DT HYPH NN NNS", "DT JJ NNP CD NNS"]

reviewer = '''DT VBN NNS IN NNS
DT JJ IN JJS CD NNS
DT JJ NNS IN NNS RB
DT JJ NNS TO NNS IN NNS NNS
DT JJ NNS IN NNS'''.split("\n")

reviewer

['DT VBN NNS IN NNS',
 'DT JJ IN JJS CD NNS',
 'DT JJ NNS IN NNS RB',
 'DT JJ NNS TO NNS IN NNS NNS',
 'DT JJ NNS IN NNS']

In [9]:
# For each pattern, count the tag strings that contain it (at most once per string).
# The test is a raw substring check, so a pattern can also match when its first or last
# tag is only the tail/head of a longer tag in the string.
missed_counts = defaultdict(int)
for p in tqdm(postags):
    for pp in missed_postag_patterns:
        if pp in p:
            missed_counts[pp] += 1

  2%|▏         | 214955/11632617 [00:00<00:10, 1079141.54it/s]

100%|██████████| 11632617/11632617 [00:08<00:00, 1305234.75it/s]


In [10]:
# Same substring count as for missed_counts, for the `reviewer` patterns. Patterns that
# never occur get no key, which is why the displayed dict can have fewer entries than
# `reviewer`.
reviewer_counts = defaultdict(int)
for p in tqdm(postags):
    for pp in reviewer:
        if pp in p:
            reviewer_counts[pp] += 1    

  0%|          | 0/11632617 [00:00<?, ?it/s]

100%|██████████| 11632617/11632617 [00:04<00:00, 2331163.38it/s]


In [11]:
reviewer_counts

defaultdict(int,
            {'DT JJ NNS IN NNS': 1565,
             'DT JJ NNS IN NNS RB': 48,
             'DT VBN NNS IN NNS': 60})

In [12]:
# For every `reviewer` pattern found in a tag string, recover the corresponding tokens
# and keep the hit only when the span starts with a lower-case "a", "an" or "another".
reviewer_missed_pattern_counts = defaultdict(int)
reviewer_missed_pattern_spans = defaultdict(list)
for i, (s, p) in enumerate(tqdm(zip(sents, postags))):
    for pattern in reviewer:
        if pattern in p:
            # tokenize sentence
            sent_tokens = tokenize(s)
            searched = re.search(fr'{pattern}', p)
            span = searched.span()
            construction_pattern = p[span[0] : span[1]]
            construction_pattern_span = find_pattern(
                construction_pattern.split(), p.split()
            )
            if sent_tokens == []:
                print(sent_tokens, p)
            else:
                try:
                    sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                    if sent_span[0] in ("a", "an", "another"):
                        reviewer_missed_pattern_counts[pattern] += 1
                        reviewer_missed_pattern_spans[pattern].append((i, s, p, sent_span))
                except Exception:
                    # Best-effort: skip sentences whose tokens and tags do not line up
                    # (e.g. an empty slice). `Exception` instead of a bare `except`, so
                    # that interrupting this long loop still works.
                    pass

11632617it [00:06, 1678527.66it/s]


In [13]:
# Same procedure as for the reviewer patterns, applied to
# missed_postag_patterns + exceptions.
ohno = []
missed_pattern_counts = defaultdict(int)
missed_pattern_spans = defaultdict(list)
for i, (s, p) in enumerate(tqdm(zip(sents, postags))):
    for pattern in missed_postag_patterns + exceptions:
        if pattern in p:
            # tokenize sentence
            sent_tokens = tokenize(s)
            searched = re.search(fr'{pattern}', p)
            span = searched.span()
            construction_pattern = p[span[0] : span[1]]
            construction_pattern_span = find_pattern(
                construction_pattern.split(), p.split()
            )
            if sent_tokens == []:
                print(sent_tokens, p)
            else:
                try:
                    sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                    if sent_span[0] in ("a", "an", "another"):
                        missed_pattern_counts[pattern] += 1
                        missed_pattern_spans[pattern].append((i, s, p, sent_span))
                except Exception:
                    # Best-effort: skip sentences whose tokens and tags do not line up
                    # (e.g. an empty slice). `Exception` instead of a bare `except`, so
                    # that interrupting this long loop still works.
                    pass

11632617it [00:14, 794304.67it/s]


In [14]:
# set([" ".join(x[-1]) for x in missed_pattern_spans["DT NN HYPH JJ CD NNS"]])
def pattern_instances(pattern):
    """Print every distinct token span recorded for ``pattern`` in ``missed_pattern_spans``.

    Raises:
        KeyError: if no span was recorded for ``pattern`` (the defaultdict is copied to a
            plain dict first, so a lookup never inserts a new key).
    """
    recorded = dict(missed_pattern_spans)[pattern]
    for surface in set(" ".join(record[-1]) for record in recorded):
        print(surface)

pattern_instances("DT JJ UH CD NNS")

a further erm sixteen hectares
a s er two slates
a good like six months
a marvelous like two nights


In [15]:
missed_pattern_counts

defaultdict(int,
            {'DT NN CD NNS': 744,
             'DT NN HYPH JJ CD NNS': 54,
             'DT VBN CD NNS': 99,
             'DT JJ CD HYPH CD NNS': 6,
             'DT JJ UH CD NNS': 5,
             'DT NN NN CD NN NNS': 2,
             'DT JJ , JJ JJ NNS': 6,
             'DT JJ CD CD CD NNS': 11,
             'DT JJ NNP CD NNS': 4,
             'DT JJ DT CD CC CD NNS': 1,
             'DT RB RB CD NNS': 1,
             'DT JJ CD HYPH CC HYPH DT HYPH NN NNS': 2,
             'DT JJ CD CC HYPH CD NNS': 1})

In [16]:
# Run the extraction over the full sentence-level files (source label "babylm_sents"),
# then keep the constructions that start with a / an / -a / another.
full_aann_data = store_aanns(sents, postags, "babylm_sents")
full_aann_data_indef = indefinite_article_aanns(full_aann_data)

In [17]:
len(full_aann_data), len(full_aann_data_indef)

(34407, 5231)

In [18]:
# Same extraction, run once per corpus file so every entry carries its corpus name.
full_aann_data_corpuswise = []

for file in glob.glob("/home/km55359/rawdata/babylm_data/postags_100M/*.train"):
    # ".../<corpus>.train" -> "<corpus>". The dot is escaped so that only the literal
    # ".train" suffix splits the path; the unescaped "." also matched any character
    # followed by "train" (e.g. "_train" inside a file name).
    corpus = re.split(r"(/|\.train)", file)[-3]
    corpus_postags = read_file(file)
    corpus_sents = read_file(f"/home/km55359/rawdata/babylm_data/babylm_100M/{corpus}.train")

    full_aann_data_corpuswise.extend(store_aanns(corpus_sents, corpus_postags, corpus))

full_aann_data_corpuswise_indef = indefinite_article_aanns(full_aann_data_corpuswise)

In [19]:
len(full_aann_data_corpuswise), len(full_aann_data_corpuswise_indef)

(33719, 5186)

In [20]:
'''
prune sents in source
for those that do not exist in sent_aanns, add

then do numeral stuff.
separate out numeral --> save
for non numeral, verify and edit
if after numeral is empty, ignore, else add to non_numeral_final.
'''
# Reduce each corpus-wise entry's text to the single sentence that holds the
# construction. There is no error handling here, so a failing prune_sentence call stops
# the cell.
full_aann_data_corpuswise_indef_pruned = []
for entry in full_aann_data_corpuswise_indef:
    pruned = prune_sentence(entry)
    # idx = find_sentence(pruned['sentence'], sents)
    # pruned['sentence_idx'] = idx
    full_aann_data_corpuswise_indef_pruned.append(pruned)

len(full_aann_data_corpuswise_indef_pruned)

5186

In [21]:
# get diff ids
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_indef_pruned]
sentencewise_constructions = [c['construction'] for c in full_aann_data_indef]

diff_ids = [i for i, x in enumerate(corpuswise_constructions) if x not in sentencewise_constructions]

In [22]:
# Try to fill empty NUMERAL slots from quantifier words (verify_and_edit), then keep only
# the entries that end up with a non-empty NUMERAL.
full_aann_data_indef_final = []
for entry in full_aann_data_indef:
    edited = verify_and_edit(entry)
    if edited['NUMERAL'] != '':
        full_aann_data_indef_final.append(edited)

len(full_aann_data_indef_final)

2282

In [23]:
# Previously saved detections. Later cells read the 'sentence_idx', 'pattern' and
# 'construction' fields of each row; 'sentence_idx' is converted with int() before use.
actually_detected = utils.read_csv_dict("../data/babylm-aanns/aanns_indef_all.csv")
len(actually_detected)

2301

In [24]:
# for x in all_og_patterns:
#     searched = re.search(CURRENT_REGEX, x)
#     if not searched:
#         print(x)
# Collect the saved detections whose tag string is NOT matched by CURRENT_REGEX.
# NOTE(review): CURRENT_REGEX is assigned in a later cell (the one that also defines
# OLD_REGEX), so on a fresh top-to-bottom run this cell raises the NameError recorded
# below. Move the regex definitions above this cell.
not_found_in_og = []
not_found_in_og_pattern = set()
for entry in actually_detected:
    idx = int(entry['sentence_idx'])
    p = postags[idx]
    searched = re.search(CURRENT_REGEX, p)
    if searched:
        pass
    else:
        not_found_in_og_pattern.add(entry['pattern'])
        not_found_in_og.append(entry)
# actually_detected[1020]


NameError: name 'CURRENT_REGEX' is not defined

In [25]:
# actually_detected[0], full_aann_data_indef_final[0]
# Distinct construction strings on each side; the difference lists the saved
# constructions that the current run did not reproduce.
final_unique = {x['construction'] for x in full_aann_data_indef_final}
actually_unique = {x['construction'] for x in actually_detected}

actually_unique - final_unique

{'An estimated 10,000 people',
 'An estimated 140 people',
 'An estimated 150 radio stations',
 'An estimated 343,000 people',
 'a combined 52 league appearances',
 'a distinctive two story veranda',
 'a few 37 \xa0 mm weapons',
 'a further 20–24 \xa0 cm',
 'a further 7 \xa0 km',
 'a great many barrows',
 'a great many crumples',
 'a great many fellows',
 'a great many secrets',
 'a half million votes',
 'a hulking 261 \xa0 pounds',
 'a reported 150,000 prisoners',
 'a reported 6,000 apartments',
 'a total 111 floors',
 'an annual 5$000 réis'}

In [26]:
# Baseline regex that the newer candidates below are compared against.
OLD_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|NN|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# PERMISSIVE_REGEX = ""
# NEW_CANDIDATE = {
#     "05/24": r'\b(DT)(\s(HYPH|,))?(?:(?:\s(RB)+)*\s(NN CC NN|NN HYPH JJ|VBN|JJ|JJR|JJS)(\s(HYPH|,))?(?:\s(CC))*)+(\s(HYPH|,))?(\s(CD|JJ|JJR|JJS|NN|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(HYPH|,))?(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+',
    
# }

# CURRENT_REGEX = r'\bDT\s(((HYPH|,)\s))?((((RB|CC)\s)+)?((JJ|JJR|JJS|VBN|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN)))((\s(HYPH|,))?)\s))+(((HYPH|,)\s))?(((NN|CC)\s)+)?((CD)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?(JJR\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+' TODO: KEEP AROUND IF UH IS NOT NEEDED

# CURRENT_REGEX: the numeral slot must contain CD; VBN and "NN CC NN" / "NN HYPH"
# prefixed modifiers are accepted in the adjective slot, and optional HYPH / "," / RB /
# UH / NN / CC material is allowed between the slots.
CURRENT_REGEX = r'\bDT\s(((HYPH|,)\s))?((((RB|CC)\s)+)?((JJ|JJR|JJS|VBN|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN)))((\s(HYPH|,))?)\s))+(((RB)\s)+)?(((HYPH|,)\s))?((UH)\s)?(((NN|CC)\s)+)?((CD)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?(JJR\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'

# DEP_PARSE_REGEX: CURRENT_REGEX with IN also allowed before the modifier and a bare NN
# allowed as the modifier itself.
DEP_PARSE_REGEX = r'\bDT\s(((HYPH|,)\s))?((((RB|CC|IN)\s)+)?((JJ|NN|JJR|JJS|VBN|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN)))((\s(HYPH|,))?)\s))+(((RB)\s)+)?(((HYPH|,)\s))?((UH)\s)?(((NN|CC)\s)+)?((CD)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?(JJR\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'

# CURRENT_WITH_DT_REGEX: CURRENT_REGEX with IN also allowed before the modifier and an
# optional DT directly before the noun.
CURRENT_WITH_DT_REGEX = r'\bDT\s(((HYPH|,)\s))?((((RB|CC|IN)\s)+)?((JJ|JJR|JJS|VBN|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN)))((\s(HYPH|,))?)\s))+(((RB)\s)+)?(((HYPH|,)\s))?((UH)\s)?(((NN|CC)\s)+)?((CD)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?(JJR\s)?(DT\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'

In [27]:
def find_spans(pattern_repository):
    """Collect, per regex, the indefinite-article token spans it finds in the corpus.

    Every regex in ``pattern_repository`` is searched against every tag string of the
    module-level ``postags``; for a match the corresponding tokens of the parallel
    ``sents`` entry are recovered and kept when the first token is a lower-case "a",
    "an" or "another".

    Args:
        pattern_repository: iterable of regex strings.

    Returns:
        ``(pattern_spans, pattern_counts)``: a dict mapping each regex with at least one
        hit to a list of ``(sentence_index, token_span)`` tuples, and a dict mapping the
        same regexes to the number of hits.
    """
    # Compile once instead of handing the pattern string to re.search on every line.
    compiled = [(pattern, re.compile(pattern)) for pattern in pattern_repository]

    pattern_spans = defaultdict(list)
    for i, (s, p) in enumerate(tqdm(zip(sents, postags))):
        sent_tokens = None  # tokenized lazily, at most once per sentence
        for pattern, matcher in compiled:
            searched = matcher.search(p)
            if not searched:
                continue
            if sent_tokens is None:
                sent_tokens = tokenize(s)
            span = searched.span()
            construction_pattern = p[span[0] : span[1]]
            construction_pattern_span = find_pattern(
                construction_pattern.split(), p.split()
            )
            if sent_tokens == []:
                print(sent_tokens, p)
                continue
            try:
                sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                if sent_span[0] in ("a", "an", "another"):
                    pattern_spans[pattern].append((i, sent_span))
            except Exception:
                # Best-effort: skip sentences whose tokens and tags do not line up
                # (e.g. an empty slice). `Exception` instead of a bare `except`, so that
                # interrupting this long loop still works.
                pass

    pattern_counts = {k: len(v) for k, v in pattern_spans.items()}
    pattern_spans = dict(pattern_spans)
    return pattern_spans, pattern_counts

In [31]:
postags[2481]

'UH , UH , PRP VBP PRP VBP RB VBN VBN NN IN DT JJ JJ NNS , CC EX VBZ DT NN TO VB VB PRP$ NNS NNS .'

In [30]:
sents[2481]

"Yeah, well, I know I've not 'ad bath for a good few seasons, but there's no need to 'urt my feelin's."

In [41]:
# for entry in actually_detected:
#     if entry['NUMERAL'] == "few":
#         print(entry['pattern'])

# for i, s in enumerate(sents):
#     if "a good few marriages" in s:
#         print(i, s, postags[i])

# pattern = "DT JJ CC JJ JJ CD NNS"
# searched = re.search(CURRENT_REGEX, pattern)
# searched

# NOTE(review): this looks identical to CURRENT_REGEX - confirm whether a change for
# the "few" cases was meant to be made here.
CURRENT_REGEX_FEW = r'\bDT\s(((HYPH|,)\s))?((((RB|CC)\s)+)?((JJ|JJR|JJS|VBN|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN)))((\s(HYPH|,))?)\s))+(((RB)\s)+)?(((HYPH|,)\s))?((UH)\s)?(((NN|CC)\s)+)?((CD)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?(JJR\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'


<re.Match object; span=(0, 21), match='DT JJ CC JJ JJ CD NNS'>

In [100]:
# Run all four regex variants over the whole corpus; both results are keyed by the
# regex string itself.
repo = [OLD_REGEX, CURRENT_REGEX, DEP_PARSE_REGEX, CURRENT_WITH_DT_REGEX]
regex_spans, regex_counts = find_spans(repo)

0it [00:00, ?it/s]

11632617it [00:48, 239500.44it/s]


In [116]:
# Check which previously missed sentences are now matched by at least one new regex.
unmissed = []
still_missed = []
still_missed_patterns = []
for ms, mp in zip(missed_sents, missed_postags):
    # All three branches do the same thing: a match by any of the regexes counts.
    if re.search(CURRENT_REGEX, mp):
        unmissed.append((ms, mp))
    elif re.search(CURRENT_WITH_DT_REGEX, mp):
        unmissed.append((ms, mp))
    elif re.search(DEP_PARSE_REGEX, mp):
        unmissed.append((ms, mp))
    else:
        still_missed.append((ms, mp))
        # record which hand-collected patterns occur in the unmatched tag string
        for mpp in missed_postag_patterns:
            if mpp in mp:
                still_missed_patterns.append(mpp)

# pattern_instances prints every recorded span for the pattern across the corpus, not
# only those from the still-missed sentences.
for x in set(still_missed_patterns):
    print(x)
    pattern_instances(x)

DT JJ , JJ JJ NNS
a few , few little things
a little , few little notes
a private , coeducational liberal arts
a great , many round ones
a busy , busy few weeks
a few , long swift strokes


In [1]:
set(still_missed_patterns)

NameError: name 'still_missed_patterns' is not defined

In [118]:
len(unmissed), len(still_missed)

(20, 5)

In [43]:
## RANDOM SAMPLE FROM 100M

import random
random.seed(42)

# Draws 10,000,000 distinct sentence indices (random.sample draws without replacement
# and raises ValueError if `sents` has fewer elements than that).
# NOTE(review): the variables built from this are named "..._subset_10M", but the sample
# size counts sentences - confirm that this is the intended unit.
random_idx = random.sample(range(0, len(sents)), 10000000)

In [44]:
# Sentences and tag strings for the sampled indices, in sampling order; both lists stay
# parallel to `random_idx`.
babylm_100M_subset_10M = [sents[i] for i in random_idx]
babylm_100M_subset_10M_postags = [postags[i] for i in random_idx]

In [45]:
babylm_100M_subset_10M[0], babylm_100M_subset_10M_postags[0]

("Well, I don't know.", 'UH , PRP VBP RB VB .')

In [46]:
# Deliberately loose surface pattern: an indefinite article, up to 100 characters, a
# number word / quantity word / digit string, up to 100 characters, then a word ending
# in "s".
permissive_regex = r'\b(a|an|another)\b .{0,100} \b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|several|few|couple|dozen|tens|dozens|hundreds|thousands|millions|billions|[0-9]+)\b.{0,100}\b\w+s\b'


permissed = []
for i, s, p in zip(random_idx, babylm_100M_subset_10M, babylm_100M_subset_10M_postags):
    searched = re.search(permissive_regex, s)
    # Explicit no-match check; the previous try/except hid the AttributeError raised by
    # None.span() and would have hidden any other error as well.
    if searched is not None:
        init, final = searched.span()
        permissed.append((i, s, p, s[init:final]))


# permissed_sampled = permissed[:1000]
permissed_sampled = permissed[1001:2001]
# permissed_sampled = random.sample(permissed, 1000)

len(permissed_sampled)

1000

In [49]:
# with open("../data/permissed_sampled.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerow(["sentence", "postags", "span"])
#     for i, s, p, sp in permissed_sampled:
#         writer.writerow([s, p, sp])

# with open("../data/permissed_sampled_test.csv", "w") as f:
#     writer = csv.writer(f)
#     writer.writerow(["sentence", "postags", "span"])
#     for i, s, p, sp in permissed_sampled:
#         writer.writerow([s, p, sp])

In [47]:
# permissed[0]
og_aann_idx = [int(x['sentence_idx']) for x in actually_detected]
idx2span = {int(x['sentence_idx']): x['construction'] for x in actually_detected}

detected_by_og = []
detected_by_current = []
detected_by_current_with_dep_parse = []
detected_by_current_with_dt = []

for i, s, p, sp in permissed_sampled:
    if i in og_aann_idx:
        detected_by_og.append((i, s, p, idx2span[i]))

    current = re.search(CURRENT_REGEX, p)  
    dep_parse = re.search(DEP_PARSE_REGEX, p)
    current_with_dt = re.search(CURRENT_WITH_DT_REGEX, p)  
    if current:
        sent_tokens = tokenize(s)
        span = current.span()
        construction_pattern = p[span[0] : span[1]]
        construction_pattern_span = find_pattern(
            construction_pattern.split(), p.split()
        )
        if sent_tokens == []:
            print(sent_tokens, p)
        else:
            try:
                sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                if sent_span[0] in ("a", "an", "another"):
                    detected_by_current.append((i, s, p, sent_span))
            except:
                pass
    if dep_parse:
        sent_tokens = tokenize(s)
        span = dep_parse.span()
        construction_pattern = p[span[0] : span[1]]
        construction_pattern_span = find_pattern(
            construction_pattern.split(), p.split()
        )
        if sent_tokens == []:
            print(sent_tokens, p)
        else:
            try:
                sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                if sent_span[0] in ("a", "an", "another"):
                    detected_by_current_with_dep_parse.append((i, s, p, sent_span))
            except:
                pass
    if current_with_dt:
        sent_tokens = tokenize(s)
        span = current_with_dt.span()
        construction_pattern = p[span[0] : span[1]]
        construction_pattern_span = find_pattern(
            construction_pattern.split(), p.split()
        )
        if sent_tokens == []:
            print(sent_tokens, p)
        else:
            try:
                sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
                if sent_span[0] in ("a", "an", "another"):
                    detected_by_current_with_dt.append((i, s, p, sent_span))
            except:
                pass

In [48]:
len(detected_by_og), len(detected_by_current), len(detected_by_current_with_dep_parse), len(detected_by_current_with_dt)
# len(permissed)

(20, 25, 54, 25)

In [186]:
# for x in detected_by_current:
#     if x not in detected_by_og:
#         print(x)
# Print the saved constructions (already strings) that CURRENT_REGEX did not recover;
# the CURRENT_REGEX spans are token lists and are joined with spaces before comparing.
og_detected_spans = [x[-1] for x in detected_by_og]
current_detected_spans = [" ".join(x[-1]) for x in detected_by_current]
for x in og_detected_spans:
    if x not in current_detected_spans:
        print(x)

a good few marriages
a fucking few minutes
a good few thousands
an extra few pints
a great many more people


In [51]:
# permissed_sampled_annotated = utils.read_csv_dict("../data/permissed_sampled_annotated.csv")
permissed_sampled_annotated = utils.read_csv_dict("../data/permissed_sampled_annotated_test1_1k.csv")


def _starts_with_indef_article(regex, s, p):
    """Return 1 if ``regex`` detects an indefinite-article construction, else 0.

    ``regex`` is searched in the tag string ``p``; on a match the aligned tokens of the
    sentence ``s`` are recovered and must start with a lower-case "a", "an" or
    "another". No match, an empty tokenization, or tokens that cannot be aligned with
    the tags all give 0.
    """
    searched = re.search(regex, p)
    if not searched:
        return 0
    sent_tokens = tokenize(s)
    span = searched.span()
    construction_pattern = p[span[0] : span[1]]
    construction_pattern_span = find_pattern(
        construction_pattern.split(), p.split()
    )
    if sent_tokens == []:
        print(sent_tokens, p)
        return 0
    try:
        sent_span = sent_tokens[construction_pattern_span[0] : construction_pattern_span[1]]
        return int(sent_span[0] in ("a", "an", "another"))
    except Exception:
        # Best-effort: tokens and tags do not line up (e.g. an empty slice).
        return 0


# newline="" is what the csv module asks for on files passed to csv.writer; without it
# some platforms write an extra "\r" per row.
with open("../data/permissed_sampled_annotated_test1_1k_predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["sentence", "pos", "span", "length", "aann", "og_detected", "current_detected", "dep_parse_detected", "current_with_dt_detected"])
    for entry in permissed_sampled_annotated:
        s = entry["sentence"]
        p = entry["pos"]
        # og_detected = int(entry["span"] in idx2span.values())
        # One 0/1 flag per regex variant, replacing four copy-pasted branches.
        og_detected = _starts_with_indef_article(ULTRA_REGEX, s, p)
        current_detected = _starts_with_indef_article(CURRENT_REGEX, s, p)
        dep_parse_detected = _starts_with_indef_article(DEP_PARSE_REGEX, s, p)
        current_with_dt_detected = _starts_with_indef_article(CURRENT_WITH_DT_REGEX, s, p)
        writer.writerow([entry["sentence"], entry["pos"], entry["span"], entry["length"], entry["aann"], og_detected, current_detected, dep_parse_detected, current_with_dt_detected])

In [178]:
[" ".join(x[-1]) for x in detected_by_current_with_dep_parse]

['a record 1.5 million pre',
 'a further 46,750 tons',
 'a champion 4 years',
 'a flood here about ten years',
 'a season - high four stops',
 'a season - high two pass deflections',
 'a cell 4 dots',
 'a very thrillingly impressive eight points',
 'a town twenty miles',
 'a fun two months',
 'an orchard approximately 1.5 miles',
 'a common ancestor approximately 40 million years',
 'a man and two horses',
 'a reliable 72 hours',
 'a long and two shorts',
 'a couple million songs',
 'a small village roughly 15 kilometres',
 'an additional 26 states',
 'a ring worth nine lakhs',
 'a game 300 runs',
 'a successful three years',
 'an extra two thousand years',
 'a house , four carriages',
 'a phenomenon 100 years',
 'a gentle 20 miles',
 'a community place , about five or six years',
 'an acute accent , 2 letters',
 'a copperhead skin two hours',
 'a small market town 14 miles',
 'a further 16 languages',
 'a solo violin , two violins',
 'an hour and thirty minutes',
 'a spot twenty yards

In [29]:
# Inspection cell: print every final construction that contains the substring "more".
# `i` is only used by the commented-out counting code below.
i = 0
for aann in full_aann_data_indef_final:
    if "more" in aann['construction']:
        print(aann['construction'])
        # i += 1
    # if not detect_aann_advanced(aann['pattern']):
        # i += 1
        # print(aann['construction'], aann['pattern'])

# print(i)

a few more IQ points
a few more minutes
a few more minutes
a few more minutes
a few more inches
a few more steps
a few more times
a few more strokes
a few more steps
a few more minutes
a few more recordings
a few more miles
a few more festival appearances
a few more minutes
a few more years
a few more days
a few more days
a few more houses
a great many more Mice
a few more arrows
a few more logs
a few more things
a few more crumbs
a few more people
a few more instructions
a few more days
a few more dams
a few more appearances
a few more ways
a few more things
a few more examples
a few more things
a great many more things
a few more questions
a few more crumbs
a few more miles
a few more years
a few more runs
a few more calls
a few more minutes
a few more outings
a few more expostulations
a few more inheritances
A few more words
a few more questions
a few more minutes
a few more notes
a few more nuggets
a few more days
a few more notes
a few more hysterics
A few more minutes
a few more 

In [22]:
# Add the corpus-wise indefinite-article constructions that the sentence-wise run did
# not produce, re-anchored to their index in `sents`.
# NOTE(review): needs `diff_ids` from the "get diff ids" cell; the NameError recorded
# below comes from running this cell before that one.
relevant_idxes = []
# Loop variable renamed from `id`, which shadowed the builtin for the rest of the kernel
# session.
for diff_id in diff_ids:
    edited = verify_and_edit(full_aann_data_corpuswise_indef_pruned[diff_id])
    # Skip entries whose NUMERAL or ADJ slot is empty once separators and non-breaking
    # spaces are removed.
    if edited['NUMERAL'].replace(" & ", "").replace("\xa0", "").strip() == '' or edited['ADJ'].replace(" & ", "").replace("\xa0", "").strip() == '':
        continue

    idxes = find_sentence(unicodedata.normalize("NFKD", edited['sentence']), sents)
    if len(idxes) == 0:
        # pruned sentence not found verbatim in `sents`; shown for manual inspection
        print(edited)
    else:
        edited['sentence_idx'] = idxes[0]
        edited['source'] = 'babylm_sents'
        full_aann_data_indef_final.append(edited)

NameError: name 'diff_ids' is not defined

In [19]:
len(full_aann_data_indef_final)

2282

In [20]:
## DONE! NOW SEPARATE NUMERAL AND NON NUMERAL AND ENJOY LIFE -- REPEAT FOR THE/THEM
## babylm-aanns-non-nums.csv full_aann_data_indef_final where numeral not in list
## babylm-aanns-all.csv: full_aann_data_indef_final
## babylm-aanns-nums.csv full_aann_data_indef_final where numeral in list

# `parsed` is overwritten on every iteration and not read afterwards in the visible
# cells. NOTE(review): presumably a smoke check that every entry can be parsed - confirm.
for entry in full_aann_data_indef_final:
    parsed = utils.parse_instance(entry)

full_aann_data_indef_final_non_num = []
full_aann_data_indef_final_num = []

# Entries whose NUMERAL is one of the quantifier words in `non_numerals` go to the
# "non_num" list, all others to the "num" list.
for entry in full_aann_data_indef_final:
    if entry['NUMERAL'] in non_numerals:
        full_aann_data_indef_final_non_num.append(entry)
    else:
        full_aann_data_indef_final_num.append(entry)

In [21]:
len(full_aann_data_indef_final), len(full_aann_data_indef_final_non_num), len(full_aann_data_indef_final_num)

(2282, 1123, 1159)

In [37]:
# save_path = "../data/babylm-aanns/"
# cols = [
#     "source",
#     "sentence",
#     "sentence_idx",
#     "construction",
#     "pattern",
#     "DT",
#     "ADJ",
#     "NUMERAL",
#     "NOUN",
#     "ADV",
# ]

# df = pd.DataFrame(full_aann_data_indef_final)[cols]
# df.reset_index()
# df.to_csv(f"{save_path}/aanns_indef_all.csv", index=False)

# df = pd.DataFrame(full_aann_data_indef_final_num)[cols]
# df.reset_index()
# df.to_csv(f"{save_path}/aanns_indef_num.csv", index=False)

# df = pd.DataFrame(full_aann_data_indef_final_non_num)[cols]
# df.reset_index()
# df.to_csv(f"{save_path}/aanns_indef_non_num.csv", index=False)

In [30]:
# Prune every corpus-wise entry to the sentence holding its construction; entries for
# which pruning fails are dropped.
full_aann_data_corpuswise_pruned = []
for entry in full_aann_data_corpuswise:
    try:
        pruned = prune_sentence(entry)
    except Exception:
        # Best-effort: skip entries prune_sentence cannot handle. `Exception` instead of
        # a bare `except`, so that interrupting the loop still works.
        continue
    full_aann_data_corpuswise_pruned.append(pruned)
    # idx = find_sentence(pruned['sentence'], sents)
    # pruned['sentence_idx'] = idx
    # full_aann_data_corpuswise_pruned.append(pruned)

len(full_aann_data_corpuswise_pruned)

33713

In [39]:
import editors

In [45]:
# NOTE: this bare expression is not the last statement of the cell, so its
# value is evaluated but never displayed.
full_aann_data_indef_final[0]

# Parse the first record, apply `editors.naan` to it and show the resulting
# `.string` (what the edit does is defined in the `editors` module, not here).
editors.naan(utils.parse_instance(full_aann_data_indef_final[0])).string

'million few a dollars'

In [53]:
# tokenize(full_aann_data_indef_final[0]['sentence'])
def check_construction_sentence(entry):
    """Return True if the entry's construction occurs in its re-tokenized sentence.

    The sentence is split with `tokenize` and re-joined with single spaces
    before the substring test against `entry['construction']`.
    """
    tokens = tokenize(entry['sentence'])
    spaced_sentence = ' '.join(tokens)
    found = entry['construction'] in spaced_sentence
    return found

def check_construction_sentence_mass(entries):
    """Return True if every entry passes `check_construction_sentence`.

    Args:
        entries: iterable of records with 'sentence' and 'construction' keys.

    Returns:
        False as soon as one entry's construction is not found in its
        re-tokenized sentence; True otherwise (including for empty input).
    """
    # Delegate to the single-entry check so the two functions cannot drift apart.
    return all(check_construction_sentence(entry) for entry in entries)

In [54]:
# Sanity check: every final record's construction must be found in its
# re-tokenized sentence.
check_construction_sentence_mass(full_aann_data_indef_final)

True

In [31]:
def _slot_is_blank(value):
    """Return True if a slot string is empty once ' & ' separators, NBSPs and outer whitespace are removed."""
    return value.replace(" & ", "").replace("\xa0", "").strip() == ''


# Sentence-wise records: keep those whose NUMERAL slot is non-empty after
# `verify_and_edit`.
full_aann_data_final = []
for entry in full_aann_data:
    edited = verify_and_edit(entry)
    if edited['NUMERAL'] != '':
        full_aann_data_final.append(edited)

# get diff ids
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_pruned]
sentencewise_constructions = [c['construction'] for c in full_aann_data]

# Membership is tested against a set: the list version rescans the whole
# sentence-wise list for every corpus-wise construction.
sentencewise_lookup = set(sentencewise_constructions)
diff_ids = [i for i, x in enumerate(corpuswise_constructions) if x not in sentencewise_lookup]

# Add the corpus-wise-only records, provided both NUMERAL and ADJ are
# non-blank and the sentence can be located in `sents`.
relevant_idxes = []
for diff_id in diff_ids:  # not `id`: that would shadow the builtin for the rest of the notebook
    edited = verify_and_edit(full_aann_data_corpuswise_pruned[diff_id])
    if _slot_is_blank(edited['NUMERAL']) or _slot_is_blank(edited['ADJ']):
        continue
    idxes = find_sentence(unicodedata.normalize("NFKD", edited['sentence']), sents)
    if len(idxes) == 0:
        # No matching sentence found: print the record for manual inspection.
        print(edited)
    else:
        edited['sentence_idx'] = idxes[0]
        edited['source'] = 'babylm_sents'
        full_aann_data_final.append(edited)

In [33]:
# Run every record through the parser. The parsed value is discarded, so this
# pass presumably exists only to surface records the parser cannot handle.
for entry in full_aann_data_final:
    parsed = utils.parse_instance(entry)

# Partition on the NUMERAL slot: records whose NUMERAL string is one of the
# words in `non_numerals` go to the "non_num" list, all others to "num".
full_aann_data_final_non_num = [
    record
    for record in full_aann_data_final
    if record['NUMERAL'] in non_numerals
]
full_aann_data_final_num = [
    record
    for record in full_aann_data_final
    if record['NUMERAL'] not in non_numerals
]

In [38]:
# Write the all-determiner AANN records (full set, numeral subset and
# non-numeral subset) to CSV, keeping only the columns listed in `cols`.
save_path = "../data/babylm-aanns/"
cols = [
    "source",
    "sentence",
    "sentence_idx",
    "construction",
    "pattern",
    "DT",
    "ADJ",
    "NUMERAL",
    "NOUN",
    "ADV",
]

# One (records, file-name suffix) pair per output file. The previous version
# repeated the same three lines per file and called `df.reset_index()` without
# using its return value; that call does not modify `df`, and the index is not
# written anyway (`index=False`), so it has been dropped.
for records, subset in (
    (full_aann_data_final, "all"),
    (full_aann_data_final_num, "num"),
    (full_aann_data_final_non_num, "non_num"),
):
    df = pd.DataFrame(records)[cols]
    df.to_csv(f"{save_path}/aanns_all_det_{subset}.csv", index=False)

In [8]:
def read_openbooks(path):
    """Read a UTF-8 text file and return its non-blank lines, stripped.

    Args:
        path: path of the text file to read.

    Returns:
        A list with one whitespace-stripped string per line of the file;
        lines that are empty after stripping are dropped.

    TODO: make read all
    """
    # The context manager closes the handle; the original left it open, which
    # adds up when this is called once per file over a large glob.
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        return [line for line in stripped_lines if line != ""]

# Collect the non-blank lines of every OpenBooks text file into one flat list,
# in whatever order glob returns the files.
train_files = glob.glob('/home/km55359/rawdata/books1/epubtxt/*.txt')
openbooks_sents = []
for file in train_files:
    openbooks_sents += read_openbooks(file)

# POS-tag lines for the same corpus, read with the same blank-line filtering.
openbooks_postags = read_openbooks("/home/km55359/rawdata/books1/postags.txt")

In [10]:
# Sentence count vs. POS-tag line count; presumably these must be equal for
# the two lists to line up entry by entry.
len(openbooks_sents), len(openbooks_postags)

(36768629, 36768629)

In [9]:
# Run `store_aanns` over the OpenBooks sentences and their POS-tag lines,
# labelled "openbooks". NOTE(review): `store_aanns` is defined outside this
# part of the notebook; confirm its return format there.
openbooks_aann_data = store_aanns(openbooks_sents, openbooks_postags, "openbooks")

In [37]:
# IDX=10
# verify_and_edit(ultra_aann[IDX]), ultra_aann[IDX]
# Bucket the corpus-wise indef records by whether their NUMERAL slot is filled:
# an empty string goes to `non_numeral_aanns`, anything else to `numeral_aanns`.
numeral_aanns, non_numeral_aanns = [], []
for entry in full_aann_data_corpuswise_indef:
    has_numeral = entry['NUMERAL'] != ''
    (numeral_aanns if has_numeral else non_numeral_aanns).append(entry)

len(numeral_aanns), len(non_numeral_aanns)

(1140, 4046)

In [129]:
# Compare the construction strings found by the corpus-wise and the
# sentence-wise extraction passes.
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_indef]
sentencewise_constructions = [c['construction'] for c in full_aann_data_indef]

# Sets are used for the membership tests only: the list version rescans one
# whole list for every element of the other. The results stay lists, so order
# and duplicates are unchanged.
corpuswise_lookup = set(corpuswise_constructions)
sentencewise_lookup = set(sentencewise_constructions)

# minus1: found corpus-wise but not sentence-wise; minus2: the reverse.
minus1 = [x for x in corpuswise_constructions if x not in sentencewise_lookup]
minus2 = [x for x in sentencewise_constructions if x not in corpuswise_lookup]

len(minus1), len(minus2)

(55, 89)

In [130]:
# Collect the records whose POS pattern matches ULTRA_REGEX but not
# ADVANCED_REGEX, i.e. the matches that only the "ultra" pattern picks up.
# The unused `detect_aann_basic` search was dropped: its result was never read.
ultra_aann = []
for entry in full_aann_data_indef:
    seq = entry['pattern']
    searched_advanced = detect_aann_advanced(seq)
    searched_ultra = detect_aann_ultra(seq)
    if searched_ultra and not searched_advanced:
        ultra_aann.append(entry)

In [132]:
# Inspect the records matched by ULTRA_REGEX but not by ADVANCED_REGEX.
ultra_aann

[{'DT': 'a',
  'ADJ': 'round',
  'NUMERAL': '',
  'NOUN': 'dozen & blocks',
  'ADV': '',
  'sentence': 'and darted sharply south for a round dozen blocks, then went due east',
  'sentence_idx': 217172,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  'construction': 'a round dozen blocks'},
 {'DT': 'a',
  'ADJ': 'few',
  'NUMERAL': '',
  'NOUN': 'dozen & episodes',
  'ADV': '',
  'sentence': 'Aside from said obsession he is a slight coward to Judy who he had developed a crush on as shown in a few dozen episodes alongside her husband Hugh.',
  'sentence_idx': 238762,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  'construction': 'a few dozen episodes'},
 {'DT': 'a',
  'ADJ': 'few',
  'NUMERAL': '',
  'NOUN': 'dozen & processes',
  'ADV': '',
  'sentence': 'And each oligodendrocyte will extend a few processes, maybe up to a few dozen processes each, towards the axons of neurons.',
  'sentence_idx': 407686,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  '

In [88]:
# Look at a single record (index 1) from `ultra_aann`.
ultra_aann[1]

{'DT': 'a',
 'ADJ': 'few',
 'NUMERAL': '',
 'NOUN': 'dozen & episodes',
 'ADV': '',
 'sentence': 'Aside from said obsession he is a slight coward to Judy who he had developed a crush on as shown in a few dozen episodes alongside her husband Hugh.',
 'sentence_idx': 238762,
 'pattern': 'DT JJ NN NNS',
 'source': 'babylm_sents',
 'construction': 'a few dozen episodes'}

In [85]:
# store only sentence that has the construction
full_aann_data_indef[2]

{'DT': 'a',
 'ADJ': 'few & more',
 'NUMERAL': '',
 'NOUN': 'minutes',
 'ADV': '',
 'sentence': 'Now we got a few more minutes here, so lets do something a little, snicker',
 'sentence_idx': 772,
 'pattern': 'DT JJ JJR NNS',
 'source': 'babylm_sents',
 'construction': 'a few more minutes'}