In [1]:
import re
import csv
import inflect

from collections import defaultdict
from dataclasses import dataclass
from minicons import utils as mu
from tqdm import tqdm

In [2]:
# Shared inflect engine; its `singular_noun` method is used further down to
# turn plural nouns into singular ones.
inflector = inflect.engine()

In [3]:
# Load every row of the AANN CSV as a dict keyed by the CSV header.
# `newline=""` is what the csv module requires of file objects it reads, so
# that newlines embedded in quoted fields are handled by the csv parser itself.
with open("rawdata/books1/openbooks_aanns.csv", "r", newline="") as f:
    aanns = list(csv.DictReader(f))

In [4]:
@dataclass
class AANN:
    """The four pieces of an article + adjective + numeral + noun construction.

    Besides the four dataclass fields, every instance carries a `string`
    attribute holding the pieces joined by single spaces.
    """

    article: str
    adjective: str
    numeral: str
    noun: str

    def __post_init__(self):
        # `string` is a plain instance attribute rather than a dataclass
        # field, so it does not show up in the generated repr or __eq__.
        pieces = (self.article, self.adjective, self.numeral, self.noun)
        self.string = " ".join(f"{piece}" for piece in pieces)

In [5]:
# Both regexes are searched against the space-separated tag pattern of a
# construction (see parse_aann), not against the words themselves.
#
# adj_pattern: either a stretch that starts at a JJR/JJS/JJ/RB/CC tag and ends
# at a JJR/JJS/JJ tag, or a single JJR/JJS/JJ tag.
adj_pattern = r'((?=(JJR|JJS|JJ|RB|CC))(.*)(JJR|JJS|JJ))|JJR|JJS|JJ'
# num_pattern: either two or more CD/CC/TO tags followed by a CD, or a single CD.
num_pattern = r'(?:(?:CD|CC|TO)\s+){2,}CD|CD'

def parse_aann(string, pattern):
    """Split a construction into article, adjective, numeral and noun.

    Args:
        string: the construction text; split on whitespace into words.
        pattern: space-separated tags for the construction. Presumably one tag
            per word of `string`, since offsets found in the tags are used to
            slice the words (TODO confirm against the data).

    Returns:
        An AANN whose article is the first word, whose adjective and numeral
        are the words at the offsets of the adjective / numeral tag runs, and
        whose noun is every word after the numeral.

    Raises:
        AttributeError: if `adj_pattern` or `num_pattern` finds no match in
            `pattern` (`re.search` returns None).
    """
    words = string.split()

    adjective_tags = re.search(adj_pattern, pattern).group(0)
    numeral_tags = re.search(num_pattern, pattern).group(0)

    # mu.find_pattern is indexed with [0] and [1] below; presumably these are
    # the start and end offsets of the sub-sequence within the tag list.
    adjective_bounds = mu.find_pattern(adjective_tags.split(), pattern.split())
    numeral_bounds = mu.find_pattern(numeral_tags.split(), pattern.split())

    adjective_start, adjective_end = adjective_bounds[0], adjective_bounds[1]
    numeral_start, numeral_end = numeral_bounds[0], numeral_bounds[1]

    return AANN(
        words[0],
        " ".join(words[adjective_start:adjective_end]),
        " ".join(words[numeral_start:numeral_end]),
        " ".join(words[numeral_end:]),
    )

def parse_instance(aann):
    """Parse one data row by handing its 'construction' and 'pattern' values to parse_aann."""
    construction = aann['construction']
    tag_pattern = aann['pattern']
    return parse_aann(construction, tag_pattern)

In [6]:
# Spot check: parse a single row and display the resulting AANN.
parse_instance(aanns[10])

AANN(article='a', adjective='good', numeral='twenty', noun='years')

In [7]:
def corrupt_order(aann):
    """Return the construction with the numeral placed before the adjective."""
    article, numeral = aann.article, aann.numeral
    adjective, noun = aann.adjective, aann.noun
    return f"{article} {numeral} {adjective} {noun}"

def corrupt_article(aann):
    """Return numeral, adjective and noun (in that order) without the article."""
    numeral, adjective, noun = aann.numeral, aann.adjective, aann.noun
    return f"{numeral} {adjective} {noun}"

def corrupt_modifier(aann):
    """Return the construction with the adjective left out."""
    article, numeral, noun = aann.article, aann.numeral, aann.noun
    return f"{article} {numeral} {noun}"

def corrupt_noun_num(aann):
    """Return the construction with its noun made singular.

    Only the last word of `aann.noun` is singularized; any preceding words of
    a multi-word noun are kept in place so that the corruption changes nothing
    but grammatical number.

    Args:
        aann: object with `article`, `adjective`, `numeral` and `noun` string
            attributes (e.g. an AANN).

    Returns:
        The corrupted construction as a single string. If the inflect engine
        cannot singularize the last word, that word is left unchanged.
    """
    *leading_words, head = aann.noun.split(' ')
    # singular_noun returns False when it does not treat the word as a plural;
    # fall back to the word itself instead of interpolating "False".
    singular_head = inflector.singular_noun(head) or head
    singular_noun_phrase = ' '.join([*leading_words, singular_head])
    return f"{aann.article} {aann.adjective} {aann.numeral} {singular_noun_phrase}"

In [8]:
def corrupt_all(aann):
    """Print the original construction followed by each of its four corruptions."""
    print(f"Original: {aann.string}")

    labelled_corruptions = (
        ("Order Swap", corrupt_order),
        ("No Article", corrupt_article),
        ("No Modifier", corrupt_modifier),
        ("Noun Number", corrupt_noun_num),
    )
    # One line per corruption, printed as soon as it is computed.
    for label, corrupt in labelled_corruptions:
        print(f"{label}: {corrupt(aann)}")

In [9]:
# Spot check: print every corruption of a single parsed row.
corrupt_all(parse_instance(aanns[10]))

Original: a good twenty years
Order Swap: a twenty good years
No Article: twenty good years
No Modifier: a twenty years
Noun Number: a good twenty year


In [10]:
# Maps each corruption's name to the function that builds it from a parsed AANN.
# The same names are used as the keys of `corrupted_sents` further down.
corruption_types = {
    'order_swap': corrupt_order,
    'no_article': corrupt_article,
    'no_modifier': corrupt_modifier,
    'noun_number': corrupt_noun_num
}

In [11]:
from spacy.lang.en import English

# English pipeline with a "sentencizer" component added; used further down to
# split each row's 'sentence' text into individual sentences via `.sents`.
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb46fabb390>

In [12]:
# Tag patterns to exclude: a row whose 'pattern' value equals one of these is
# skipped by the filtering loop. Each has a CC tag directly before its first CD.
bad_construction_patterns = [
    'DT JJ CC CD NNS',
    'DT JJ JJ CC CD NNS',
    'DT JJ JJR CC CD TO CD NNS',
    'DT JJ JJ CC CD CC CD NNS'
]

# Values of a row's 'NOUN' field to exclude. The filtering loop tests with
# `not in`, so these are exact, case-sensitive matches (hence both "G" and "g").
unwanted_nouns = [
    "'s",
    'ks',
    'k',
    "CC & 's",
    "Hjarrleth",
    "years & homosexuals",
    "zlotys",
    "Rads",
    "metres10",
    "pounds & bs",
    "G",
    "g",
    "°",
    "kez",
    'a',
    '"',
    "C",
    "kms2",
    "'",
    "écus",
    "times?",
    "minutos",
    "anos",
    "Kleti",
    "Tusse",
    "MPH",
    'mph',
    "dollarsworth",
    'Etherael',
    'Draugari',
    'drachma',
    'folk',
    'ha',
    'grotz',
    'li',
    'lira',
    'people--',
    'mL',
    'mi',
    'ml',
]

In [13]:
def construction_pieces(sentence, construction):
    """Cut `sentence` into the text before, inside and after `construction`.

    The cut points are the two values returned by `mu.character_span`, used
    here as start and end indices into `sentence`.

    Returns:
        A 3-tuple of strings: (before, construction span, after).
    """
    start, end = mu.character_span(sentence, construction)
    before = sentence[:start]
    inside = sentence[start:end]
    after = sentence[end:]
    return before, inside, after

def reconstruct(left, middle, right):
    """Glue three text pieces back into one sentence.

    A single space is always inserted between neighbouring pieces, the result
    is stripped of leading/trailing whitespace, and every run of two or more
    spaces is collapsed into one.
    """
    pieces = (left, middle, right)
    joined = " ".join(pieces)
    trimmed = joined.strip()
    collapsed = re.sub(r' {2,}', ' ', trimmed)
    return collapsed

In [14]:
# Build the corrupted-sentence datasets and the cleaned list of AANN rows.
# For every row that survives filtering, each corruption in `corruption_types`
# is applied to the construction and spliced back into its source sentence.
corrupted_sents = {name: [] for name in corruption_types}

final_aanns = []

# Rows skipped because inflect could not singularize the parsed noun.
uninflected_counter = 0
# Rows skipped because no segmented sentence contained the construction.
unmatched_counter = 0

# Running 1-based index shared by `final_aanns` and every `corrupted_sents` list.
counter = 0
for aann in tqdm(aanns):
    # Manual fixes for two specific rows, identified by their sentence index.
    if aann['sentence_idx'] == '23546818':
        # Assignment, not comparison: `==` here would leave the construction untouched.
        aann['construction'] = "a few thousand dollars"
        aann['DT'] = "a"
        aann['ADJ'] = "few"
        aann['NUMERAL'] = "thousand"
        aann['NOUN'] = "dollars"

    if aann['sentence_idx'] == '33259998':
        aann['construction'] = "a few zillion comets"
        aann['sentence'] = aann['sentence'].replace("comets./", "comets.")
        aann['NOUN'] = "comets"

    # Drop rows with an unwanted noun, an excluded tag pattern, or a noun
    # containing one of the junk markers.
    noun = aann['NOUN']
    if (
        noun in unwanted_nouns
        or aann['pattern'] in bad_construction_patterns
        or any(marker in noun for marker in ("& 's", "_", "& *"))
    ):
        continue

    parsed_aann = parse_aann(aann['construction'], aann['pattern'])

    # The noun-number corruption needs a noun that can be singularized.
    if not inflector.singular_noun(parsed_aann.noun):
        uninflected_counter += 1
        continue

    # Find the sentence holding the construction; there is deliberately no
    # `break`, so when several sentences match, the last one is used.
    relevant_sent = None
    for sent in nlp(aann['sentence']).sents:
        if aann['construction'] in sent.text:
            relevant_sent = sent.text

    # Without a matching sentence there is nothing to splice the corruptions
    # into; skip the row instead of passing None to construction_pieces.
    if relevant_sent is None:
        unmatched_counter += 1
        continue

    counter += 1

    left, construction, right = construction_pieces(relevant_sent, aann['construction'])
    for corruption_type, corruption_function in corruption_types.items():
        corrupted_construction = corruption_function(parsed_aann)
        reconstructed = reconstruct(left, corrupted_construction, right)
        corrupted_sents[corruption_type].append({
            'idx': counter,
            'construction': aann['construction'],
            'corrupted_construction': corrupted_construction,
            'corrupted_sentence': reconstructed
        })

    final_aanns.append({
        'idx': counter,
        'source': aann['source'],
        'sentence': relevant_sent,
        'construction': aann['construction'],
        'pattern': aann['pattern'],
        'DT': aann['DT'],
        'ADJ': aann['ADJ'],
        'NUMERAL': aann['NUMERAL'],
        'NOUN': aann['NOUN'],
        'ADV': aann['ADV']
    })

100%|███████████████████████████████████████████████████| 21476/21476 [00:07<00:00, 2989.85it/s]


In [15]:
# Number of rows kept after filtering vs. number of rows loaded from the CSV.
len(final_aanns), len(aanns)

(21294, 21476)