In [3]:
import re
import csv
import inflect
import pathlib

from collections import defaultdict
from dataclasses import dataclass
from minicons import utils as mu
from tqdm import tqdm

In [5]:
# Shared inflect engine; the cells below call its singular_noun() method.
inflector = inflect.engine()

In [6]:
# Load every AANN record from the CSV as one dict per row, keyed by the header.
# newline="" is what the csv module asks for when opening its input, so that
# line breaks embedded inside quoted fields are parsed as part of the field.
with open("../data/openbooks_aanns.csv", "r", newline="") as f:
    aanns = list(csv.DictReader(f))

In [7]:
@dataclass
class AANN:
    """An article + adjective + numeral + noun construction, e.g. "a good twenty years".

    In addition to the four fields, each instance gets a ``string`` attribute:
    the four pieces joined by single spaces, in their original order. It is
    assigned after initialisation and is not a dataclass field, so it is not
    part of the generated repr or equality comparison.
    """
    article: str
    adjective: str
    numeral: str
    noun: str

    def __post_init__(self):
        # Surface form of the whole construction, computed once per instance.
        self.string = "{} {} {} {}".format(
            self.article, self.adjective, self.numeral, self.noun
        )

In [8]:
# Both regexes are applied to the space-separated POS-tag pattern, not to the text.
# adj_pattern: from the first JJ/JJR/JJS/RB/CC tag through the last JJ/JJR/JJS tag
# (greedy), or failing that a single JJ/JJR/JJS tag.
adj_pattern = r'((?=(JJR|JJS|JJ|RB|CC))(.*)(JJR|JJS|JJ))|JJR|JJS|JJ'
# num_pattern: at least two CD/CC/TO tags followed by a closing CD
# (e.g. "CD CC CD", "CD TO CD"), or failing that a single CD tag.
num_pattern = r'(?:(?:CD|CC|TO)\s+){2,}CD|CD'

def parse_aann(string, pattern):
    """Split a construction into its article, adjective, numeral and noun parts.

    Args:
        string: the construction text, whitespace-tokenised here.
        pattern: space-separated POS tags for the construction.
            Assumed to align one-to-one with the tokens of `string` -- TODO confirm.

    Returns:
        An AANN whose article is the first token, whose adjective and numeral are
        the tokens at the positions of the matched tag spans, and whose noun is
        every token after the numeral span.

    Raises:
        AttributeError: if `pattern` contains no adjective or no numeral tag
            (re.search returns None).
    """
    words = string.split()
    tags = pattern.split()

    # Tag sub-sequences for the adjective and numeral parts of the pattern.
    adj_tags = re.search(adj_pattern, pattern).group(0).split()
    num_tags = re.search(num_pattern, pattern).group(0).split()

    # Locate each sub-sequence inside the full tag list; the result is used as
    # slice bounds (start, end) on the tokens below.
    adj_bounds = mu.find_pattern(adj_tags, tags)
    num_bounds = mu.find_pattern(num_tags, tags)

    article = words[0]
    adjective = " ".join(words[adj_bounds[0]:adj_bounds[1]])
    numeral = " ".join(words[num_bounds[0]:num_bounds[1]])
    noun = " ".join(words[num_bounds[1]:])
    return AANN(article, adjective, numeral, noun)

def parse_instance(aann):
    """Parse one dataset record into an AANN.

    `aann` is a mapping with 'construction' and 'pattern' keys; both values are
    handed to parse_aann unchanged.
    """
    construction, tag_pattern = aann['construction'], aann['pattern']
    return parse_aann(construction, tag_pattern)

In [9]:
# Spot-check the parser on a single record.
parse_instance(aanns[10])

AANN(article='a', adjective='good', numeral='twenty', noun='years')

In [10]:
def corrupt_order(aann):
    """Return the construction with the numeral placed before the adjective."""
    swapped = (aann.article, aann.numeral, aann.adjective, aann.noun)
    return "{} {} {} {}".format(*swapped)

def corrupt_article(aann):
    """Return the construction without its article, with numeral before adjective."""
    remaining = (aann.numeral, aann.adjective, aann.noun)
    return "{} {} {}".format(*remaining)

def corrupt_modifier(aann):
    """Return the construction with the adjective removed (article numeral noun)."""
    remaining = (aann.article, aann.numeral, aann.noun)
    return "{} {} {}".format(*remaining)

def corrupt_noun_num(aann):
    """Return the construction with its noun put in the singular.

    Only the last word of `aann.noun` is singularised; any words before it are
    kept in place (previously they were silently dropped).

    `inflector.singular_noun` returns False when it does not treat the word as
    a plural. In that case the word is left unchanged instead of the literal
    text "False" being inserted into the construction.
    """
    *leading_words, head = aann.noun.split(' ')
    singular_head = inflector.singular_noun(head) or head
    noun = " ".join([*leading_words, singular_head])
    return f"{aann.article} {aann.adjective} {aann.numeral} {noun}"

In [11]:
def corrupt_all(aann):
    """Print the original construction followed by each of its four corruptions.

    Output is one labelled line per variant, in the order: original, order swap,
    no article, no modifier, noun number. Nothing is returned.
    """
    print(f"Original: {aann.string}")
    labelled_corruptions = (
        ("Order Swap", corrupt_order),
        ("No Article", corrupt_article),
        ("No Modifier", corrupt_modifier),
        ("Noun Number", corrupt_noun_num),
    )
    for label, corrupt in labelled_corruptions:
        print(f"{label}: {corrupt(aann)}")

In [12]:
# Show every corruption of a single parsed record as a visual check.
corrupt_all(parse_instance(aanns[10]))

Original: a good twenty years
Order Swap: a twenty good years
No Article: twenty good years
No Modifier: a twenty years
Noun Number: a good twenty year


In [13]:
# Registry of corruption functions keyed by the name used in output file names
# (aanns_<name>.csv) and in the corrupted_sents dictionary.
corruption_types = dict(
    order_swap=corrupt_order,
    no_article=corrupt_article,
    no_modifier=corrupt_modifier,
    noun_number=corrupt_noun_num,
)

In [14]:
from spacy.lang.en import English

# English pipeline with only a sentencizer added; it is used further down
# to split each record's text into sentences via `nlp(...).sents`.
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f3301654210>

In [15]:
# POS-tag patterns to exclude: a record whose 'pattern' value is exactly one
# of these is dropped by the filtering loop.
bad_construction_patterns = [
    'DT JJ CC CD NNS',
    'DT JJ JJ CC CD NNS',
    'DT JJ JJR CC CD TO CD NNS',
    'DT JJ JJ CC CD CC CD NNS'
]

# NOUN values to exclude: a record whose 'NOUN' value is exactly one of these
# is dropped by the filtering loop (exact, case-sensitive match).
unwanted_nouns = [
    "'s",
    'ks',
    'k',
    "CC & 's",
    "Hjarrleth",
    "years & homosexuals",
    "zlotys",
    "Rads",
    "metres10",
    "pounds & bs",
    "G",
    "g",
    "°",
    "kez",
    'a',
    '"',
    "C",
    "kms2",
    "'",
    "écus",
    "times?",
    "minutos",
    "anos",
    "Kleti",
    "Tusse",
    "MPH",
    'mph',
    "dollarsworth",
    'Etherael',
    'Draugari',
    'drachma',
    'folk',
    'ha',
    'grotz',
    'li',
    'lira',
    'people--',
    'mL',
    'mi',
    'ml',
]

In [16]:
def construction_pieces(sentence, construction):
    """Cut `sentence` into three parts around `construction`.

    Returns a tuple (text before, the matched span, text after), using the
    character offsets reported by `mu.character_span`.
    """
    start, end = mu.character_span(sentence, construction)
    before = sentence[:start]
    matched = sentence[start:end]
    after = sentence[end:]
    return before, matched, after

def reconstruct(left, middle, right):
    """Join three sentence pieces with single spaces.

    The pieces are space-joined, leading/trailing whitespace is stripped, and
    any run of two or more spaces is collapsed to one.
    """
    joined = " ".join((left, middle, right))
    trimmed = joined.strip()
    return re.sub(r' {2,}', ' ', trimmed)

In [52]:
# Build the filtered AANN dataset (final_aanns) and, for every kept record, one
# corrupted sentence per corruption type (corrupted_sents[<type>]).
# Records in both outputs share the same 1-based 'idx'.

MAX_SENT_LEN = 100  # keep only sentences with at most this many whitespace tokens

corrupted_sents = {corruption_type: [] for corruption_type in corruption_types}

final_aanns = []

# Records that passed the length filter but whose noun could not be singularised.
uninflected_counter = 0
# Records whose construction was not found inside any single split sentence.
unlocated_counter = 0

counter = 0
for aann in tqdm(aanns):
    # manual fix
    if aann['sentence_idx'] == '23546818':
        # This line used to be `==`, a comparison with no effect, so the
        # construction was never actually corrected.
        # NOTE(review): aann['pattern'] is not updated here; confirm it still
        # lines up token-for-token with the corrected construction.
        aann['construction'] = "a few thousand dollars"
        aann['DT'] = "a"
        aann['ADJ'] = "few"
        aann['NUMERAL'] = "thousand"
        aann['NOUN'] = "dollars"

    if aann['sentence_idx'] == '33259998':
        aann['construction'] = "a few zillion comets"
        aann['sentence'] = aann['sentence'].replace("comets./", "comets.")
        aann['NOUN'] = "comets"

    # Drop records with a blacklisted noun/pattern or a malformed noun field.
    noun = aann['NOUN']
    if noun in unwanted_nouns or aann['pattern'] in bad_construction_patterns:
        continue
    if "& 's" in noun or "_" in noun or "& *" in noun:
        continue

    # Narrow the text down to the single sentence containing the construction.
    # If several sentences contain it, the last one is used.
    sents = [s.text for s in nlp(aann['sentence']).sents]
    relevant_sent = None
    for sent in sents:
        if aann['construction'] in sent:
            relevant_sent = sent

    if relevant_sent is None:
        # No split sentence contains the construction; skip the record
        # instead of failing on None.split() below.
        unlocated_counter += 1
        continue

    sent_len = len(relevant_sent.split())
    if sent_len > MAX_SENT_LEN:
        continue

    parsed_aann = parse_aann(aann['construction'], aann['pattern'])

    # singular_noun returns a falsy value when the noun is not recognised as a
    # plural; such records cannot get a noun-number corruption and are dropped.
    inflected = inflector.singular_noun(parsed_aann.noun)
    if not inflected:
        uninflected_counter += 1
        continue

    counter += 1

    left, _matched, right = construction_pieces(relevant_sent, aann['construction'])
    for corruption_type, corruption_function in corruption_types.items():
        corrupted_construction = corruption_function(parsed_aann)
        reconstructed = reconstruct(left, corrupted_construction, right)
        obj = {
            'idx': counter,
            'construction': aann['construction'],
            'corrupted_construction': corrupted_construction,
            'corrupted_sentence': reconstructed
        }
        corrupted_sents[corruption_type].append(obj)

    final_aanns.append({
        'idx': counter,
        'source': aann['source'],
        'sentence': relevant_sent,
        'construction': aann['construction'],
        'pattern': aann['pattern'],
        'DT': aann['DT'],
        'ADJ': aann['ADJ'],
        'NUMERAL': aann['NUMERAL'],
        'NOUN': aann['NOUN'],
        'ADV': aann['ADV']
    })

100%|███████████████████████████████████████████████████| 21476/21476 [00:05<00:00, 3929.80it/s]


In [53]:
# Number of records kept after filtering vs. number of records loaded.
len(final_aanns), len(aanns)

(21230, 21476)

In [54]:
def write_csv(list, path, headers):
    """Write dict rows to `path` as a CSV file with a header row.

    Args:
        list: iterable of dicts, one per output row. (The parameter name
            shadows the builtin; it is kept so existing callers still work.)
        path: destination file; it is created or overwritten.
        headers: column names, in output order.

    Raises:
        ValueError: raised by csv.DictWriter if a row contains a key that is
            not in `headers`.
    """
    # newline="" lets the csv writer control line endings itself; without it,
    # platforms that translate "\n" end up with "\r\r\n" row terminators.
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(list)

In [62]:
# Make sure the output directory exists, then save the filtered records.
# Column order comes from the first record, so this fails with IndexError if
# final_aanns is empty.
pathlib.Path('../data/openbooks/').mkdir(parents=True, exist_ok=True)
write_csv(final_aanns, "../data/openbooks/aanns_good.csv", final_aanns[0].keys())

In [56]:
# One corrupted entry is appended per kept record, so this should equal len(final_aanns).
len(corrupted_sents['order_swap'])

21230

In [58]:
# Offset used as the slice start in the next cell.
# Presumably batch size 128 times batch index 41 -- TODO confirm.
128 * 41

5248

In [60]:
# Inspect 256 consecutive order-swap entries starting at offset 5248 (= 128 * 41).
corrupted_sents['order_swap'][5248:5248+256]

[{'idx': 5249,
  'construction': 'a good four hours',
  'corrupted_construction': 'a four good hours',
  'corrupted_sentence': '" Washington is a four good hours \' drive from here , " Matt said . "'},
 {'idx': 5250,
  'construction': 'a few hundred yards',
  'corrupted_construction': 'a hundred few yards',
  'corrupted_sentence': "A couple of hundred yards before they came in sight of the Senator 's house , Matt told Zoé to drop him off and take the car a hundred few yards beyond the house , turn it , and wait to him to phone ."},
 {'idx': 5251,
  'construction': 'a good four inches',
  'corrupted_construction': 'a four good inches',
  'corrupted_sentence': 'Jim stood a four good inches taller than George .'},
 {'idx': 5252,
  'construction': 'a blessed six hours',
  'corrupted_construction': 'a six blessed hours',
  'corrupted_sentence': 'Laurie must have tired herself out playing with the little rattle and ended up sleeping for a six blessed hours straight , which all the medical bo

In [61]:
# Save one CSV per corruption type, named aanns_<corruption_type>.csv.
# Column order comes from the first entry, so an empty list raises IndexError.
for corruption_type, corrupted_data in corrupted_sents.items():
    write_csv(corrupted_data, f"../data/openbooks/aanns_{corruption_type}.csv", corrupted_data[0].keys())

In [32]:
from torch.utils.data import DataLoader

In [33]:
# Batch the list of record dicts ten at a time to see how DataLoader collates them.
dl = DataLoader(final_aanns, batch_size = 10)

In [34]:
# Run through every batch without doing anything; afterwards `line` still
# holds the last batch, which the next cell displays.
for line in dl:
    pass

In [35]:
# Last batch left over from the loop above (depends on that cell having run).
line

{'idx': tensor([21291, 21292, 21293, 21294]),
 'source': ['openbookscorpus',
  'openbookscorpus',
  'openbookscorpus',
  'openbookscorpus'],
 'sentence': ['Tommy was a good 300 pounds , and Janet was , if the last Christmas card photo was accurate , even bigger .',
  "Gathered there in my father 's house for the after - funeral party were a good 20 or 30 people , most of whom were relatives . (",
  "It had been a good two years since he 'd felt it – about the time that had elapsed since his doctor had prescribed the little pills that took the anxiety away and flattened him into a being who counted stairs and ceiling tiles and spent an inordinate amount of time every Sunday making his sock drawer just so .",
  "It was already a good 20 minutes past the time when he 'd have begun his evening rituals of uncorking a bottle of wine and preparing dinner ."],
 'construction': ['a good 300 pounds',
  'a good 20 or 30 people',
  'a good two years',
  'a good 20 minutes'],
 'pattern': ['DT JJ CD

In [None]:
# TODO: save two result files per model: <model>_full-sentences.csv and <model>_construction-only.csv

In [1]:
import pandas as pd

In [2]:
# Scratch check: build a DataFrame from a dict of equal-length column lists.
df_dict = {'a': [1,2,3,4], 'b': [5,6,7,8]}
pd.DataFrame(df_dict)

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [22]:
# Whitespace-token count of every kept sentence, in final_aanns order.
lengths = [len(aann['sentence'].split()) for aann in final_aanns]

In [49]:
# Number of sentence lengths collected (one per record in final_aanns when `lengths` was built).
len(lengths)

21294

In [51]:
# How many sentences are at most 100 tokens long (the cutoff used by the filtering loop).
len([i for i in lengths if i <= 100])

21230

In [23]:
# Displays the whole list of lengths; the output is very long.
lengths

[25,
 44,
 20,
 44,
 21,
 38,
 42,
 42,
 25,
 13,
 25,
 10,
 23,
 17,
 11,
 27,
 38,
 19,
 38,
 21,
 17,
 41,
 26,
 20,
 30,
 27,
 19,
 19,
 11,
 17,
 23,
 24,
 98,
 17,
 37,
 33,
 14,
 12,
 26,
 16,
 16,
 41,
 31,
 15,
 8,
 18,
 21,
 22,
 18,
 28,
 15,
 45,
 48,
 14,
 10,
 31,
 24,
 21,
 17,
 20,
 13,
 26,
 25,
 11,
 10,
 20,
 10,
 16,
 12,
 35,
 8,
 13,
 17,
 28,
 24,
 41,
 49,
 27,
 11,
 41,
 33,
 33,
 9,
 22,
 15,
 31,
 21,
 22,
 30,
 25,
 18,
 51,
 14,
 31,
 20,
 16,
 36,
 13,
 48,
 16,
 25,
 12,
 26,
 22,
 20,
 28,
 20,
 16,
 26,
 31,
 44,
 36,
 10,
 12,
 29,
 27,
 20,
 41,
 21,
 33,
 25,
 33,
 12,
 34,
 10,
 16,
 8,
 11,
 20,
 15,
 18,
 11,
 38,
 27,
 19,
 39,
 11,
 10,
 23,
 13,
 18,
 28,
 7,
 14,
 29,
 17,
 22,
 32,
 40,
 22,
 30,
 23,
 22,
 37,
 14,
 28,
 38,
 17,
 25,
 46,
 43,
 20,
 22,
 36,
 21,
 38,
 32,
 23,
 18,
 47,
 50,
 29,
 35,
 28,
 11,
 47,
 40,
 40,
 14,
 32,
 32,
 14,
 34,
 9,
 8,
 41,
 27,
 37,
 50,
 30,
 37,
 12,
 21,
 12,
 31,
 34,
 18,
 21,
 12,
 25,
 20,
 1

In [24]:
from semantic_memory.list_utils import argmax

In [30]:
import torch

In [39]:
# The 100 largest sentence lengths together with their positions in `lengths`.
torch.topk(torch.tensor(lengths), k=100)

torch.return_types.topk(
values=tensor([1415,  432,  347,  311,  274,  243,  234,  228,  215,  212,  207,  202,
         197,  196,  185,  183,  175,  173,  169,  168,  164,  162,  160,  151,
         150,  148,  143,  140,  132,  131,  130,  129,  129,  129,  129,  128,
         126,  125,  123,  123,  121,  120,  120,  120,  119,  116,  115,  113,
         113,  112,  109,  107,  107,  106,  105,  104,  104,  103,  103,  102,
         102,  102,  101,  101,  100,  100,  100,   99,   99,   99,   99,   99,
          98,   98,   98,   97,   96,   95,   95,   95,   95,   95,   95,   94,
          94,   94,   93,   93,   93,   93,   93,   92,   92,   92,   92,   92,
          92,   91,   91,   91]),
indices=tensor([ 5501,  1207,  2808,  4103,  6691, 15648,  8344,  9323, 10172, 19983,
         7674,  7667,  9009,  7668,  7672,  6054, 10157, 15643,  7061,  1208,
         6706, 14223,  7670,  2812, 11141,  6773,  7673,  9981,  2251,  3186,
         1212, 13646,  6374,  1019,  6039, 14577,  2

In [25]:
# Presumably the position of the longest sentence in `lengths` -- argmax comes
# from semantic_memory.list_utils; TODO confirm its tie-breaking behaviour.
argmax(lengths)

5501

In [35]:
# Spot-check the token count stored for a single record.
lengths[7672]

185

In [41]:
# Inspect one complete record from the filtered dataset.
final_aanns[15023]

{'idx': 15024,
 'source': 'openbookscorpus',
 'sentence': "In fact Sterling had a theory – he had a theory about a lot of things – that holding the iPhone was a very good substitute for holding one 's other thing ; granted the former was shorter ( 4.5 in , but small is good for hand - helds ) and heavier ( 4.8 oz ) , but it had a lot more useful functions and time - wise ( 5 hours of continuous activity , a whopping 300 hours on standby ) ; there was simply no comparison .",
 'construction': 'a whopping 300 hours',
 'pattern': 'DT JJ CD NNS',
 'DT': 'a',
 'ADJ': 'whopping',
 'NUMERAL': '300',
 'NOUN': 'hours',
 'ADV': ''}