In [1]:
import glob
import re
import spacy
import unicodedata

import pandas as pd

from collections import defaultdict
from minicons.utils import find_pattern
from spacy.lang.en import English
from tqdm import tqdm

import utils
import config
import minicons.utils as mu

from nltk.tokenize import sent_tokenize

In [2]:
# Instantiate spaCy's `English` language class directly (no `spacy.load` of a
# trained model) and keep a handle on its tokenizer, which `tokenize` calls.
nlp = English()
tokenizer = nlp.tokenizer

In [14]:
# regexes
# All three patterns are applied with `re.search` to a space-separated string of
# POS tags (one string per sentence). The tag names look like Penn Treebank
# tags -- TODO confirm against whatever tagger produced the postag files.
#
# Shared skeleton:
#   DT, then one or more adjective groups (any number of RB, one JJ/JJR/JJS,
#   any number of CC), then a "numeral" slot, then one or more noun groups.
#
# ADVANCED_REGEX: the numeral slot accepts CD, JJ/JJR/JJS or "CD CD" (optionally
#   continued by "(TO|CC) CD"); the noun slot accepts NNS, NNPS, "NN NNS" or
#   "(NN|NNS) IN NNS".
ADVANCED_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# ULTRA_REGEX: identical to ADVANCED_REGEX except that the numeral slot also
#   accepts NN.
ULTRA_REGEX = r'\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD|JJ|JJR|JJS|NN|CD\sCD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'
# AANN_REGEX: the strictest form -- the numeral slot must be CD (optionally
#   continued by "(TO|CC) CD") and the noun slot must be NNS or NNPS.
AANN_REGEX = r"\b(DT)(?:(?:\s(RB))*\s(JJ|JJR|JJS)(?:\s(CC))*)+(\s(CD)(?:\s(TO|CC)\s(CD))*)(\s(NNS|NNPS))+"

# Categories whose tokens `aann_meta` collects; any other category is dropped.
cats = ['DT', 'ADJ', 'NUMERAL', 'NOUN', "ADV"]
# Words that `verify_and_edit` moves into the NUMERAL slot even though they were
# not tagged CD; `store_aanns` also uses this list to filter ULTRA-only matches.
non_numerals = ['few', 'dozen', 'couple', 'several', 'many', 'more']

# POS tag -> category used by `aann_meta`.
# 'CC' and 'TO' map to themselves; neither is in `cats`, so their tokens are
# not collected.
# NOTE(review): 'IN' maps to 'NOUN', presumably so the preposition inside a
# "(NN|NNS) IN NNS" match stays with the noun material -- confirm this is intended.
pos2cat = {
    'DT': 'DT',
    'JJ': 'ADJ',
    'JJR': 'ADJ',
    'JJS': 'ADJ',
    'CD': 'NUMERAL',
    'NNS': 'NOUN',
    'NNPS': 'NOUN',
    'NN': 'NOUN',
    'IN': 'NOUN',
    'RB': 'ADV',
    'CC': 'CC',
    'TO': 'TO'
}

def read_file(path):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace.

    Blank lines are kept (as empty strings), so the returned list has exactly
    one item per line of the file.

    Args:
        path: location of the text file.

    Returns:
        list[str]: the stripped lines, in file order.

    TODO (carried over from the original): make read all.
    """
    # Context manager so the handle is closed even if decoding fails part-way.
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle]

def detect_aann_advanced(sequence):
    """Return the first `ADVANCED_REGEX` match found in `sequence`, or None if there is none."""
    matcher = re.compile(ADVANCED_REGEX)
    return matcher.search(sequence)

def detect_aann_basic(sequence):
    """Return the first `AANN_REGEX` match found in `sequence`, or None if there is none."""
    matcher = re.compile(AANN_REGEX)
    return matcher.search(sequence)

def detect_aann_ultra(sequence):
    """Return the first `ULTRA_REGEX` match found in `sequence`, or None if there is none."""
    matcher = re.compile(ULTRA_REGEX)
    return matcher.search(sequence)

def tokenize(string):
    """Run the module-level `tokenizer` over `string` and return the token texts as a list."""
    token_texts = []
    for token in tokenizer(string):
        token_texts.append(token.text)
    return token_texts

def aann_meta(token_seq, const_pattern):
    """Group the tokens of one construction by the category of their POS tag.

    Tokens and tags are paired positionally; pairing stops at the shorter of
    the two sequences.

    Args:
        token_seq: tokens of the construction.
        const_pattern: space-separated POS tags, one per token.

    Returns:
        dict with the keys 'DT', 'ADJ', 'NUMERAL', 'NOUN' and 'ADV'; each value
        is the tokens of that category joined by " & " ('' when there are none).

    Raises:
        KeyError: if a tag is missing from `pos2cat`.
    """
    grouped = {slot: [] for slot in ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV')}

    tags = const_pattern.split()
    for token, tag in zip(token_seq, tags):
        category = pos2cat[tag]
        # Categories outside `cats` (e.g. CC, TO) are not recorded.
        if category not in cats:
            continue
        grouped[category].append(token)

    return {slot: " & ".join(tokens) for slot, tokens in grouped.items()}

def is_ultra(seq):
    """Tell whether `seq` is matched by `ULTRA_REGEX` but not by `ADVANCED_REGEX`.

    Args:
        seq: space-separated POS-tag string.

    Returns:
        bool: True only when the ultra pattern matches and the advanced one
        does not. (The previous version returned None when the ultra pattern
        did not match; the truthiness is unchanged.)
    """
    # The basic-pattern search that used to run here was never used, so it is
    # gone. `and` short-circuits, so the advanced search is skipped whenever
    # the ultra pattern already fails.
    return bool(detect_aann_ultra(seq)) and not detect_aann_advanced(seq)

def store_aann_spans(pos_seq):
    """Locate the first construction match in every POS-tag string.

    The patterns are tried from strictest to loosest (basic, advanced, ultra)
    and the first one that matches supplies the span.

    Args:
        pos_seq: iterable of space-separated POS-tag strings.

    Returns:
        list of (index, tag_string, (start, end)) tuples, one per string that
        matched; (start, end) are character offsets into tag_string.
    """
    # Same priority as before, but evaluated lazily: the looser regexes only
    # run when the stricter ones have already failed.
    detectors = (detect_aann_basic, detect_aann_advanced, detect_aann_ultra)

    aanns = []
    for i, seq in enumerate(pos_seq):
        for detect in detectors:
            found = detect(seq)
            if found:
                aanns.append((i, seq, found.span()))
                break
    return aanns

def store_aanns(sents, pos_seq, corpus=None):
    """Extract one record per construction match from parallel sentence / POS-tag lists.

    Args:
        sents: sentences; `sents[i]` must correspond to `pos_seq[i]`.
        pos_seq: one space-separated POS-tag string per sentence.
        corpus: label stored under the "source" key of every record.

    Returns:
        list of dicts. Each dict holds the slot strings produced by
        `aann_meta` ('DT', 'ADJ', 'NUMERAL', 'NOUN', 'ADV') plus the keys
        "sentence", "sentence_idx", "pattern", "source" and "construction".
        Matches that only the ultra pattern accepts are kept only when their
        first NOUN token is listed in `non_numerals`.
    """
    full_aann_data = []

    # Each span entry is (sentence index, tag string, character span of the match).
    # The tag string gets its own name so it no longer shadows the `pos_seq` parameter.
    for idx, tag_string, (char_start, char_end) in store_aann_spans(pos_seq):
        construction_pattern = tag_string[char_start:char_end]
        # Turn the character span into a span over tags (and hence over tokens).
        construction_pattern_span = find_pattern(
            construction_pattern.split(), tag_string.split()
        )

        tokens = tokenize(sents[idx])
        if not tokens:
            continue

        extracted_token_seq = tokens[
            construction_pattern_span[0] : construction_pattern_span[1]
        ]
        # If tokens and tags are misaligned the slice can be empty; skip such
        # sentences instead of failing with an IndexError on [0] below.
        if not extracted_token_seq:
            continue

        # A determiner written as "-a", "-an", ... loses only its leading
        # hyphen. (The previous `.replace("-a", "a")` would also have rewritten
        # any later "-a" inside the same token.)
        first_token = extracted_token_seq[0]
        if first_token.lower().startswith("-a"):
            extracted_token_seq[0] = first_token[1:]

        construction_elements = aann_meta(extracted_token_seq, construction_pattern)
        construction_elements["sentence"] = sents[idx]
        construction_elements["sentence_idx"] = idx
        construction_elements["pattern"] = construction_pattern
        construction_elements["source"] = corpus
        construction_elements["construction"] = " ".join(extracted_token_seq)

        # An ultra-only match has an NN in the numeral slot; keep it only when
        # that word (recorded as the first NOUN token) is in `non_numerals`.
        if is_ultra(construction_pattern) and (
            construction_elements['NOUN'].split(" & ")[0] not in non_numerals
        ):
            continue
        full_aann_data.append(construction_elements)

    return full_aann_data

def indefinite_article_aanns(aann_list):
    """Return the entries whose construction starts with "a", "an", "-a" or "another".

    The first whitespace-separated word of `entry['construction']` is compared
    case-insensitively. Entries are returned as-is (not copied), in input order.
    """
    indefinite_starts = ["a", "an", "-a", "another"]
    return [
        entry
        for entry in aann_list
        if entry['construction'].split()[0].lower() in indefinite_starts
    ]

# 5179 + 37

def verify_and_edit(entry):
    """Return a copy of `entry` with a quantity word promoted to the NUMERAL slot.

    Only entries whose 'NUMERAL' is '' are touched. Two independent checks run,
    in this order:

    1. If the first NOUN token is in `non_numerals` and at least one more noun
       follows, that token becomes the NUMERAL and is dropped from NOUN.
    2. If the last ADJ token is in `non_numerals` and at least one more
       adjective precedes it, that token becomes the NUMERAL (overwriting the
       result of check 1, if any) and is dropped from ADJ.

    In both cases the tag at the position of the promoted word's last
    occurrence in the construction is rewritten to "CD" inside 'pattern'.
    """
    new_entry = entry.copy()
    if new_entry['NUMERAL'] != '':
        return new_entry

    adjs = new_entry['ADJ'].split(" & ")
    nouns = new_entry['NOUN'].split(" & ")
    final_adj, first_noun = adjs[-1], nouns[0]

    def promote_to_numeral(word):
        # Position of the last occurrence of `word` among the construction's words.
        words = new_entry['construction'].split(" ")
        positions = [i for i, w in enumerate(words) if w == word]
        # Re-tag that position as CD so the pattern reads as a numeral construction.
        tags = new_entry['pattern'].split(" ")
        tags[positions[-1]] = "CD"
        new_entry['NUMERAL'] = word
        new_entry['pattern'] = " ".join(tags)

    # first check noun:
    if first_noun in non_numerals and len(nouns) > 1:
        promote_to_numeral(first_noun)
        new_entry['NOUN'] = " & ".join(nouns[1:])

    # then the trailing adjective
    if final_adj in non_numerals and len(adjs) > 1:
        promote_to_numeral(final_adj)
        new_entry['ADJ'] = " & ".join(adjs[:-1])

    return new_entry

def prune_sentence(entry):
    """Return a copy of `entry` whose 'sentence' is narrowed to the sentence holding the construction.

    `entry['sentence']` is split with `sent_tokenize`. When the split yields
    more than one sentence, the first one containing `entry['construction']`
    replaces the stored text. If the construction occurs verbatim in the
    original text, the split sentences are searched verbatim; otherwise each
    one is re-tokenized with `tokenize` and joined by single spaces before the
    search.

    Args:
        entry: record with at least the keys 'sentence' and 'construction'.

    Returns:
        dict: a shallow copy of `entry`, with 'sentence' possibly replaced.

    Raises:
        ValueError: if the text splits into several sentences and none of them
            contains the construction (for example when the split falls inside
            it). This used to surface as an UnboundLocalError on
            `final_sentence`.
    """
    new_entry = entry.copy()
    construction = new_entry['construction']
    sentences = sent_tokenize(new_entry['sentence'])

    # A single sentence (or none) leaves nothing to prune.
    if len(sentences) <= 1:
        return new_entry

    # Pair every sentence with the text that is actually searched. Generators
    # keep the re-tokenization lazy: it stops at the first hit.
    if construction in new_entry['sentence']:
        candidates = ((s, s) for s in sentences)
    else:
        candidates = ((s, ' '.join(tokenize(s))) for s in sentences)

    for sentence, searchable in candidates:
        if construction in searchable:
            new_entry['sentence'] = sentence
            return new_entry

    raise ValueError(
        f"construction {construction!r} is not contained in any single "
        f"sentence of {new_entry['sentence']!r}"
    )

def find_sentence(target, source):
    """Return every index in `source` whose NFKD-normalized string equals `target`.

    `target` is compared as given: only the items of `source` are normalized.
    """
    return [
        position
        for position, candidate in enumerate(source)
        if unicodedata.normalize("NFKD", candidate) == target
    ]

In [15]:
5179 + 37

5216

In [16]:
# load sents and postags
# `store_aanns` looks up `sents[i]` for every match found in `postags[i]`, so
# the two files must be line-aligned.
# NOTE(review): absolute, machine-specific path. `sent_dir` already ends in "/",
# so the f-strings below produce a doubled slash in the path.
sent_dir = "/home/km55359/rawdata/babylm_data/babylm_100M/sents/"
sents = utils.read_file(f"{sent_dir}/babylm_sents.txt")
postags = utils.read_file(f"{sent_dir}/postags.txt")

In [17]:
# Extract every construction match from the sentence-level corpus (records are
# labelled with source "babylm_sents"), then keep the subset whose construction
# starts with "a", "an", "-a" or "another".
full_aann_data = store_aanns(sents, postags, "babylm_sents")
full_aann_data_indef = indefinite_article_aanns(full_aann_data)

In [18]:
len(full_aann_data), len(full_aann_data_indef)

(34407, 5231)

In [19]:
full_aann_data_corpuswise = []

# `glob.glob` makes no ordering promise, so sort the paths to get the same
# record order on every run and every machine.
for file in sorted(glob.glob("/home/km55359/rawdata/babylm_data/postags_100M/*.train")):
    # Corpus name = file name without its directory and ".train" suffix.
    # The previous `re.split(r"(/|.train)", file)[-3]` left the "." unescaped
    # (any character + "train" would split) and depended on list position.
    corpus = re.search(r"([^/]+)\.train$", file).group(1)
    corpus_postags = read_file(file)
    corpus_sents = read_file(f"/home/km55359/rawdata/babylm_data/babylm_100M/{corpus}.train")

    full_aann_data_corpuswise.extend(store_aanns(corpus_sents, corpus_postags, corpus))

# Subset whose construction starts with an indefinite determiner.
full_aann_data_corpuswise_indef = indefinite_article_aanns(full_aann_data_corpuswise)

In [20]:
len(full_aann_data_corpuswise), len(full_aann_data_corpuswise_indef)

(33719, 5186)

In [21]:
'''
prune sents in source
for those that do not exist in sent_aanns, add

then do numeral stuff.
separate out numeral --> save
for non numeral, verify and edit
if after numeral is empty, ignore, else add to non_numeral_final.
'''
# Narrow each corpus-wise entry to the single sentence that holds its construction.
full_aann_data_corpuswise_indef_pruned = [
    prune_sentence(entry) for entry in full_aann_data_corpuswise_indef
]

len(full_aann_data_corpuswise_indef_pruned)

5186

In [22]:
# get diff ids
# Positions of corpus-wise entries whose construction string never shows up
# among the sentence-wise entries.
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_indef_pruned]
sentencewise_constructions = [c['construction'] for c in full_aann_data_indef]

# Membership is tested against a set: one hash lookup per construction instead
# of a scan through the whole list.
sentencewise_construction_set = set(sentencewise_constructions)
diff_ids = [i for i, x in enumerate(corpuswise_constructions) if x not in sentencewise_construction_set]

In [23]:
# Run every sentence-wise entry through `verify_and_edit` and keep the ones
# that end up with a non-empty NUMERAL slot.
full_aann_data_indef_final = [
    edited
    for edited in map(verify_and_edit, full_aann_data_indef)
    if edited['NUMERAL'] != ''
]

len(full_aann_data_indef_final)

2282

In [24]:
# Add the corpus-wise entries that the sentence-wise pass missed.
# NOTE: this cell appends to `full_aann_data_indef_final`, so running it twice
# without re-running the cell that builds that list duplicates the additions.
relevant_idxes = []  # never filled in the visible cells; kept in case another cell reads it
# Loop variable renamed from `id`, which replaced the builtin `id` for the rest
# of the kernel session.
for diff_id in diff_ids:
    edited = verify_and_edit(full_aann_data_corpuswise_indef_pruned[diff_id])

    # Skip entries whose NUMERAL or ADJ slot is empty once the " & " joiners
    # and non-breaking spaces are removed.
    if any(
        edited[slot].replace(" & ", "").replace("\xa0", "").strip() == ''
        for slot in ('NUMERAL', 'ADJ')
    ):
        continue

    # Re-anchor the entry in the sentence-level corpus (first exact match
    # after NFKD normalization).
    idxes = find_sentence(unicodedata.normalize("NFKD", edited['sentence']), sents)
    if len(idxes) == 0:
        # No matching sentence: show the entry for manual inspection.
        print(edited)
        continue

    edited['sentence_idx'] = idxes[0]
    edited['source'] = 'babylm_sents'
    full_aann_data_indef_final.append(edited)

In [25]:
len(full_aann_data_indef_final)

2301

In [26]:
## Extraction is done: split into numeral / non-numeral (to be repeated for THE/THEM).
## Planned outputs, as noted originally:
##   babylm-aanns-non-nums.csv  full_aann_data_indef_final where numeral not in list
##   babylm-aanns-all.csv       full_aann_data_indef_final
##   babylm-aanns-nums.csv      full_aann_data_indef_final where numeral in list

# Pass every entry through `utils.parse_instance`; the result is not used here.
# NOTE(review): presumably a check that each entry parses -- confirm.
for entry in full_aann_data_indef_final:
    parsed = utils.parse_instance(entry)

# "non_num": the NUMERAL slot is exactly one of the `non_numerals` words.
full_aann_data_indef_final_non_num = [
    record for record in full_aann_data_indef_final
    if record['NUMERAL'] in non_numerals
]
# "num": everything else.
full_aann_data_indef_final_num = [
    record for record in full_aann_data_indef_final
    if record['NUMERAL'] not in non_numerals
]

In [36]:
len(full_aann_data_indef_final), len(full_aann_data_indef_final_non_num), len(full_aann_data_indef_final_num)

(2301, 1127, 1174)

In [37]:
save_path = "../data/babylm-aanns/"
# Column order of the exported CSV files.
cols = [
    "source",
    "sentence",
    "sentence_idx",
    "construction",
    "pattern",
    "DT",
    "ADJ",
    "NUMERAL",
    "NOUN",
    "ADV",
]

# One CSV per subset. The earlier `df.reset_index()` calls are gone: the method
# returns a new frame, the result was discarded, and the index is not written
# anyway because of `index=False`.
for file_name, records in [
    ("aanns_indef_all.csv", full_aann_data_indef_final),
    ("aanns_indef_num.csv", full_aann_data_indef_final_num),
    ("aanns_indef_non_num.csv", full_aann_data_indef_final_non_num),
]:
    df = pd.DataFrame(records)[cols]
    df.to_csv(f"{save_path}/{file_name}", index=False)

In [30]:
# Narrow every corpus-wise entry to the sentence that holds its construction.
# Entries for which `prune_sentence` fails are still left out of the pruned
# list, but they are now recorded instead of silently discarded, and the
# handler no longer swallows KeyboardInterrupt / SystemExit.
full_aann_data_corpuswise_pruned = []
full_aann_data_corpuswise_unpruned = []  # (entry, repr of the error) pairs
for entry in full_aann_data_corpuswise:
    try:
        pruned = prune_sentence(entry)
    except Exception as err:
        full_aann_data_corpuswise_unpruned.append((entry, repr(err)))
        continue
    full_aann_data_corpuswise_pruned.append(pruned)

len(full_aann_data_corpuswise_pruned)

33713

In [39]:
import editors

In [45]:
full_aann_data_indef_final[0]

editors.naan(utils.parse_instance(full_aann_data_indef_final[0])).string

'million few a dollars'

In [53]:
# tokenize(full_aann_data_indef_final[0]['sentence'])
def check_construction_sentence(entry):
    """Return True if `entry['construction']` occurs in the re-tokenized, space-joined sentence."""
    sentence_tokens = tokenize(entry['sentence'])
    return entry['construction'] in ' '.join(sentence_tokens)

def check_construction_sentence_mass(entries):
    """Return True if every entry passes `check_construction_sentence` (True for an empty input).

    Evaluation stops at the first entry that fails.
    """
    # Delegates to the single-entry check rather than repeating its logic.
    return all(check_construction_sentence(entry) for entry in entries)

In [54]:
check_construction_sentence_mass(full_aann_data_indef_final)

True

In [31]:
# Same procedure as for the indefinite-article subset, applied to all determiners.
full_aann_data_final = []
for entry in full_aann_data:
    edited = verify_and_edit(entry)
    if edited['NUMERAL'] != '':
        full_aann_data_final.append(edited)

# get diff ids
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_pruned]
sentencewise_constructions = [c['construction'] for c in full_aann_data]

# Both lists hold tens of thousands of strings; testing membership against a
# set avoids scanning the whole sentence-wise list for every corpus-wise entry.
sentencewise_construction_set = set(sentencewise_constructions)
diff_ids = [i for i, x in enumerate(corpuswise_constructions) if x not in sentencewise_construction_set]

relevant_idxes = []  # never filled in the visible cells; kept in case another cell reads it
# Loop variable renamed from `id`, which replaced the builtin `id` for the rest
# of the kernel session.
for diff_id in diff_ids:
    edited = verify_and_edit(full_aann_data_corpuswise_pruned[diff_id])

    # Skip entries whose NUMERAL or ADJ slot is empty once the " & " joiners
    # and non-breaking spaces are removed.
    if any(
        edited[slot].replace(" & ", "").replace("\xa0", "").strip() == ''
        for slot in ('NUMERAL', 'ADJ')
    ):
        continue

    # Re-anchor the entry in the sentence-level corpus (first exact match
    # after NFKD normalization).
    idxes = find_sentence(unicodedata.normalize("NFKD", edited['sentence']), sents)
    if len(idxes) == 0:
        # No matching sentence: show the entry for manual inspection.
        print(edited)
        continue

    edited['sentence_idx'] = idxes[0]
    edited['source'] = 'babylm_sents'
    full_aann_data_final.append(edited)

In [33]:
# Pass every entry through `utils.parse_instance`; the result is not used here.
# NOTE(review): presumably a check that each entry parses -- confirm.
for entry in full_aann_data_final:
    parsed = utils.parse_instance(entry)

# "non_num": the NUMERAL slot is exactly one of the `non_numerals` words.
full_aann_data_final_non_num = [
    record for record in full_aann_data_final
    if record['NUMERAL'] in non_numerals
]
# "num": everything else.
full_aann_data_final_num = [
    record for record in full_aann_data_final
    if record['NUMERAL'] not in non_numerals
]

In [38]:
save_path = "../data/babylm-aanns/"
# Column order of the exported CSV files.
cols = [
    "source",
    "sentence",
    "sentence_idx",
    "construction",
    "pattern",
    "DT",
    "ADJ",
    "NUMERAL",
    "NOUN",
    "ADV",
]

# One CSV per subset. The earlier `df.reset_index()` calls are gone: the method
# returns a new frame, the result was discarded, and the index is not written
# anyway because of `index=False`.
for file_name, records in [
    ("aanns_all_det_all.csv", full_aann_data_final),
    ("aanns_all_det_num.csv", full_aann_data_final_num),
    ("aanns_all_det_non_num.csv", full_aann_data_final_non_num),
]:
    df = pd.DataFrame(records)[cols]
    df.to_csv(f"{save_path}/{file_name}", index=False)

In [8]:
def read_openbooks(path):
    """Read a UTF-8 text file and return its non-blank lines, stripped of surrounding whitespace.

    Unlike `read_file`, lines that are empty after stripping are dropped.

    Args:
        path: location of the text file.

    Returns:
        list[str]: the stripped, non-empty lines, in file order.

    TODO (carried over from the original): make read all.
    """
    # Context manager so the handle is closed even if decoding fails part-way.
    with open(path, encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line != ""]

# Concatenate the non-blank lines of every book file, in the order glob yields them.
# NOTE(review): `store_aanns` pairs `openbooks_sents[i]` with `openbooks_postags[i]`,
# so this order has to match the one used when postags.txt was generated.
# `glob.glob` does not promise any particular order -- confirm the alignment.
train_files = glob.glob('/home/km55359/rawdata/books1/epubtxt/*.txt')
openbooks_sents = [
    line
    for book_path in train_files
    for line in read_openbooks(book_path)
]
openbooks_postags = read_openbooks("/home/km55359/rawdata/books1/postags.txt")

In [10]:
len(openbooks_sents), len(openbooks_postags)

(36768629, 36768629)

In [9]:
# Same extraction over the openbooks sentences; every record gets the source
# label "openbooks".
openbooks_aann_data = store_aanns(openbooks_sents, openbooks_postags, "openbooks")

In [37]:
# IDX=10
# verify_and_edit(ultra_aann[IDX]), ultra_aann[IDX]
# Partition the corpus-wise indefinite entries by whether the NUMERAL slot is filled.
numeral_aanns = [
    entry for entry in full_aann_data_corpuswise_indef if entry['NUMERAL'] != ''
]
non_numeral_aanns = [
    entry for entry in full_aann_data_corpuswise_indef if entry['NUMERAL'] == ''
]

len(numeral_aanns), len(non_numeral_aanns)

(1140, 4046)

In [129]:
# Compare the construction strings found by the two passes.
corpuswise_constructions = [c['construction'] for c in full_aann_data_corpuswise_indef]
sentencewise_constructions = [c['construction'] for c in full_aann_data_indef]

# Sets give constant-time membership tests; the comprehensions still walk the
# original lists, so duplicates and order in `minus1` / `minus2` are unchanged.
corpuswise_construction_set = set(corpuswise_constructions)
sentencewise_construction_set = set(sentencewise_constructions)

# minus1: found corpus-wise but not sentence-wise; minus2: the reverse.
minus1 = [x for x in corpuswise_constructions if x not in sentencewise_construction_set]
minus2 = [x for x in sentencewise_constructions if x not in corpuswise_construction_set]

len(minus1), len(minus2)

(55, 89)

In [130]:
# Entries whose pattern is accepted by ULTRA_REGEX but not by ADVANCED_REGEX.
# Uses `is_ultra` rather than repeating its logic inline; the unused
# basic-pattern search is dropped.
ultra_aann = [
    entry for entry in full_aann_data_indef if is_ultra(entry['pattern'])
]

In [132]:
ultra_aann

[{'DT': 'a',
  'ADJ': 'round',
  'NUMERAL': '',
  'NOUN': 'dozen & blocks',
  'ADV': '',
  'sentence': 'and darted sharply south for a round dozen blocks, then went due east',
  'sentence_idx': 217172,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  'construction': 'a round dozen blocks'},
 {'DT': 'a',
  'ADJ': 'few',
  'NUMERAL': '',
  'NOUN': 'dozen & episodes',
  'ADV': '',
  'sentence': 'Aside from said obsession he is a slight coward to Judy who he had developed a crush on as shown in a few dozen episodes alongside her husband Hugh.',
  'sentence_idx': 238762,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  'construction': 'a few dozen episodes'},
 {'DT': 'a',
  'ADJ': 'few',
  'NUMERAL': '',
  'NOUN': 'dozen & processes',
  'ADV': '',
  'sentence': 'And each oligodendrocyte will extend a few processes, maybe up to a few dozen processes each, towards the axons of neurons.',
  'sentence_idx': 407686,
  'pattern': 'DT JJ NN NNS',
  'source': 'babylm_sents',
  '

In [88]:
ultra_aann[1]

{'DT': 'a',
 'ADJ': 'few',
 'NUMERAL': '',
 'NOUN': 'dozen & episodes',
 'ADV': '',
 'sentence': 'Aside from said obsession he is a slight coward to Judy who he had developed a crush on as shown in a few dozen episodes alongside her husband Hugh.',
 'sentence_idx': 238762,
 'pattern': 'DT JJ NN NNS',
 'source': 'babylm_sents',
 'construction': 'a few dozen episodes'}

In [85]:
# store only sentence that has the construction
full_aann_data_indef[2]

{'DT': 'a',
 'ADJ': 'few & more',
 'NUMERAL': '',
 'NOUN': 'minutes',
 'ADV': '',
 'sentence': 'Now we got a few more minutes here, so lets do something a little, snicker',
 'sentence_idx': 772,
 'pattern': 'DT JJ JJR NNS',
 'source': 'babylm_sents',
 'construction': 'a few more minutes'}