In [1]:
import glob
import re
import spacy
import unicodedata
import csv

import pandas as pd

from collections import defaultdict
from minicons.utils import find_pattern
from spacy.lang.en import English
from tqdm import tqdm

import utils
import config
import minicons.utils as mu

from nltk.tokenize import sent_tokenize

from minicons import utils as mu
import inflect
from constructions import AANN

inflector = inflect.engine()
import editors

In [2]:
# Instantiate spaCy's English language class and grab its tokenizer;
# `tokenizer` is the callable that tokenize() below applies to raw strings.
nlp = English()
tokenizer = nlp.tokenizer

def tokenize(string):
    """Return the text of every token the module-level ``tokenizer`` yields for ``string``.

    Args:
        string: raw text to tokenize.

    Returns:
        list of str, one entry per token, in the order the tokenizer produced them.
    """
    token_texts = []
    for token in tokenizer(string):
        token_texts.append(token.text)
    return token_texts



In [3]:
# Directory containing the sentence file and the POS-tag file read below.
# NOTE(review): this is a machine-specific absolute path; the commented line is
# the equivalent location on another machine.
sent_dir = "/home/km55359/rawdata/babylm_data/babylm_100M/sents/"
# sent_dir = "/Users/kanishka/rawdata/babylm-sents-and-postags/"
# `sent_dir` already ends in "/", so these f-strings contain a double slash.
# Presumably utils.read_file returns one entry per line, so that sents[i] and
# postags[i] describe the same sentence -- TODO confirm against utils.read_file.
sents = utils.read_file(f"{sent_dir}/babylm_sents.txt")
postags = utils.read_file(f"{sent_dir}/postags.txt")

# Tokenize every sentence with tokenize(); later cells zip these token lists with
# the space-separated tag strings in `postags`.
sent_tokens = [tokenize(sent) for sent in sents]

In [4]:
# Quantity words that are rewritten to the pseudo-tag FEW before pattern matching.
non_numerals = ['few', 'dozen', 'couple', 'several', 'many', 'more']

# Regex over a space-joined tag sequence in which articles, the quantity words
# above and the word "record" have been replaced by ARTICLE / FEW / RECORD.
# Broadly: ARTICLE, one or more adjective-like groups, one or more CD/FEW
# groups, then a plural-noun head.
CURRENT_FEW = r'\bARTICLE\s(((HYPH|,)\s))?((((RB|CC|IN)\s)+)?((JJ|JJR|JJS|VBN|RECORD|((NN CC NN |NN HYPH )+(JJ|JJR|JJS|VBN|RECORD)))((\s(HYPH|,))?)\s))+(((RB)\s)+)?(((HYPH|,)\s))?((UH)\s)?(((NN|CC)\s)+)?((CD|FEW)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?((JJR|JJ|VBN)\s)?(ARTICLE\s)?((NNS|NNPS|(NN\sNNS)|((NN|NNS) IN NNS)))+'

def detect_aann_basic(sent_toks, pos):
    """Detect the first article-adjective-numeral-noun span in one sentence.

    Each token is paired with its tag. Tokens that are articles, quantity words
    from ``non_numerals`` or the word "record" (case-insensitive) have their tag
    replaced by ``ARTICLE``, ``FEW`` or ``RECORD`` respectively. The resulting
    tag string is then searched with ``CURRENT_FEW``.

    Args:
        sent_toks: list of token strings for the sentence.
        pos: the sentence's tags as a single string separated by single spaces.
            Tokens and tags are paired with ``zip``, so if their counts differ
            the longer side is silently truncated.

    Returns:
        ``(True, span_text, tag_pattern)`` for the first match, where
        ``span_text`` is the matched tokens joined by spaces and ``tag_pattern``
        is the matched (rewritten) tag substring; ``(False, "", "")`` when
        nothing matches or the match cannot be mapped back to tokens.
    """
    no_match = (False, "", "")

    # Built once per call instead of re-concatenating the lookup list per token.
    article_words = {"a", "an", "another", "-a", "-an"}
    few_words = set(non_numerals)

    p_replaced = []
    for st, pt in zip(sent_toks, pos.split(" ")):
        word = st.lower()
        if word in article_words:
            p_replaced.append("ARTICLE")
        elif word in few_words:
            p_replaced.append("FEW")
        elif word == "record":
            p_replaced.append("RECORD")
        else:
            p_replaced.append(pt)
    postag_seq = " ".join(p_replaced)

    searched = re.search(CURRENT_FEW, postag_seq)
    if not searched:
        return no_match

    construction_pattern = searched.group(0)
    # Map the matched tag substring back to token positions.
    # NOTE(review): find_pattern is assumed to return (start, end) indices into
    # the tag list with an exclusive end, as implied by the slice below --
    # confirm against minicons.utils.find_pattern.
    construction_pattern_span = find_pattern(
        construction_pattern.split(), postag_seq.split()
    )
    try:
        start, end = construction_pattern_span[0], construction_pattern_span[1]
        sent_span = sent_toks[start:end]
        return (True, " ".join(sent_span), construction_pattern)
    except (TypeError, IndexError):
        # Best effort: an unusable span (e.g. None) means "no detection".
        # Deliberately narrower than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate out of long-running corpus scans.
        return no_match

In [6]:
new_idx = []
new_entries = []

# Scan the corpus one sentence at a time. `i` is the sentence's position in
# `sents`; only the token list and the tag string are passed to the detector.
for i, (s, st, p) in enumerate(tqdm(zip(sents, sent_tokens, postags))):
    detected, span, pattern = detect_aann_basic(st, p)
    if not detected:
        continue
    # Keep both the bare index and the (index, matched span, tag pattern) record.
    new_idx.append(i)
    new_entries.append((i, span, pattern))

11632617it [00:52, 220785.44it/s]


In [8]:
# Display the detections: (sentence index, matched token span, matched tag pattern).
new_entries

[(884, 'a smaller three to four people', 'ARTICLE JJR CD TO CD NNS'),
 (2481, 'a good few seasons', 'ARTICLE JJ FEW NNS'),
 (12856, 'a further two men', 'ARTICLE JJ CD NNS'),
 (12974,
  'an English former professional snooker player and six - times',
  'ARTICLE JJ JJ JJ NN NN CC CD HYPH NNS'),
 (14222, 'an uneventful few months', 'ARTICLE JJ FEW NNS'),
 (14321, 'A further five victories', 'ARTICLE JJ CD NNS'),
 (16576, 'an extra two to three people', 'ARTICLE JJ CD TO CD NNS'),
 (18115, 'a full three days', 'ARTICLE JJ CD NNS'),
 (25036, 'a further three schools', 'ARTICLE JJ CD NNS'),
 (25160, 'a mere 8 years', 'ARTICLE JJ CD NNS'),
 (25474, 'a further three schools', 'ARTICLE JJ CD NNS'),
 (28788, 'a great many friends', 'ARTICLE JJ FEW NNS'),
 (29044, 'a great many people', 'ARTICLE JJ FEW NNS'),
 (29553, 'a great many good points', 'ARTICLE JJ FEW JJ NNS'),
 (29793, 'a great many times', 'ARTICLE JJ FEW NNS'),
 (34347, 'a great many persons', 'ARTICLE JJ FEW NNS'),
 (38009,
  'a sm

In [11]:
decomposed = []
for idx, span, pos_span in new_entries:
    # Token and tag lists for this detection; indices into one are used to slice
    # the other, so they are assumed to be position-aligned.
    words = span.split()
    tags = pos_span.split()

    # Article: the ARTICLE tag plus an optional trailing hyphen/comma tag.
    # NOTE(review): mu.find_pattern is assumed to return (start, end) with an
    # exclusive end, as implied by the slices below -- confirm.
    article_span = re.search(r"ARTICLE\s(((HYPH|,)\s))?", pos_span).group(0)
    article_idx = mu.find_pattern(article_span.split(), tags)
    article_phrase = " ".join(words[article_idx[0]:article_idx[1]])

    # Numeral: the first run of CD/FEW tags, with any TO/CC/hyphen/comma tags
    # that the pattern allows between or after them.
    num_span = re.search(r"((CD|FEW)(\s(TO|CC|(HYPH|,))(\s(HYPH|,))?)?\s)+(((HYPH|,)\s))?", pos_span).group(0)
    num_idx = mu.find_pattern(num_span.split(), tags)
    num_phrase = " ".join(words[num_idx[0]:num_idx[1]])

    # Adjective phrase: everything between the end of the article and the start
    # of the numeral.
    adj_idx = article_idx[-1], num_idx[0]
    adj_phrase = " ".join(words[adj_idx[0]:adj_idx[1]])

    # Noun phrase: everything after the numeral.
    noun_phrase = " ".join(words[num_idx[1]:])

    decomposed.append((idx, span, pos_span, article_phrase, adj_phrase, num_phrase, noun_phrase))

In [14]:
# Export one row per detection.
# - newline="" is what the csv module requires of the file object; without it,
#   platforms that translate "\n" emit an extra blank line after every row.
# - encoding="utf-8" makes the output independent of the platform's default
#   encoding, since the spans are corpus text.
# The "sentence" column holds the matched span (second field of each tuple in
# `decomposed`), not the full sentence; the header is kept for compatibility.
with open("../data/babylm-aanns/aanns_new_decomposed.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["idx", "sentence", "pos", "DT", "ADJ", "NUMERAL", "NOUN"])
    writer.writerows(decomposed)
decomposed

[(884,
  'a smaller three to four people',
  'ARTICLE JJR CD TO CD NNS',
  'a',
  'smaller',
  'three to four',
  'people'),
 (2481,
  'a good few seasons',
  'ARTICLE JJ FEW NNS',
  'a',
  'good',
  'few',
  'seasons'),
 (12856,
  'a further two men',
  'ARTICLE JJ CD NNS',
  'a',
  'further',
  'two',
  'men'),
 (12974,
  'an English former professional snooker player and six - times',
  'ARTICLE JJ JJ JJ NN NN CC CD HYPH NNS',
  'an',
  'English former professional snooker player and',
  'six -',
  'times'),
 (14222,
  'an uneventful few months',
  'ARTICLE JJ FEW NNS',
  'an',
  'uneventful',
  'few',
  'months'),
 (14321,
  'A further five victories',
  'ARTICLE JJ CD NNS',
  'A',
  'further',
  'five',
  'victories'),
 (16576,
  'an extra two to three people',
  'ARTICLE JJ CD TO CD NNS',
  'an',
  'extra',
  'two to three',
  'people'),
 (18115,
  'a full three days',
  'ARTICLE JJ CD NNS',
  'a',
  'full',
  'three',
  'days'),
 (25036,
  'a further three schools',
  'ARTICLE J