In [1]:
import csv
import editors
import random
import utils
import pandas as pd

from collections import defaultdict
from nltk.tokenize import sent_tokenize
from constructions import AANN
from functools import reduce
from ordered_set import OrderedSet


In [2]:
# Seed Python's `random` module so the random.sample / random.shuffle calls
# in later cells give the same result on a fresh Restart & Run All.
random.seed(42)

In [3]:
def compose(*functions):
    """Return the right-to-left composition of `functions`.

    `compose(f, g, h)(x)` evaluates `f(g(h(x)))`: the last function given is
    applied first. With no functions, the returned callable is the identity.
    """
    def composed(x):
        value = x
        # Apply the right-most function first, then work leftwards.
        for func in reversed(functions):
            value = func(value)
        return value

    return composed

In [4]:
# Read the BabyLM AANN data.
# All three inputs live under one base directory; keep it in a single place so
# the notebook can be pointed at another copy of the data by editing one line.
BABYLM_DIR = "/home/km55359/rawdata/babylm_data/babylm_100M"

# Determiners treated as "target" determiners; rows with any other determiner
# form the non-a/an collection.
TARGET_DETERMINERS = ('a', 'an', 'another')

# CSV files are opened with newline="" as the csv module documentation
# requires; otherwise newlines embedded in quoted fields are not parsed
# correctly.
with open(f"{BABYLM_DIR}/aanns/aann_data.csv", "r", newline="") as f:
    aanns = list(csv.DictReader(f))

with open(f"{BABYLM_DIR}/aanns/aann_all_det_data.csv", "r", newline="") as f:
    aanns_non_a_an = [
        row for row in csv.DictReader(f)
        if row['DT'].lower() not in TARGET_DETERMINERS
    ]

# One sentence per line; surrounding whitespace (including the newline) is stripped.
with open(f"{BABYLM_DIR}/sents/babylm_sents.txt", "r") as f:
    babylm_sents = [line.strip() for line in f]

In [5]:
# Sanity check: print every instance whose parsed construction string does not
# occur verbatim in its sentence.
for aann in aanns:
    construction = utils.parse_instance(aann)
    if construction.string in aann['sentence']:
        continue
    print(aann)

{'source': 'open_subtitles', 'sentence': "I'll give him a fuckin' 30 days if I get my hands on him.", 'sentence_idx': '4347718', 'construction': "a fuckin ' 30 days", 'pattern': 'DT JJ JJ CD NNS', 'DT': 'a', 'ADJ': "fuckin & '", 'NUMERAL': '30', 'NOUN': 'days', 'ADV': ''}
{'source': 'qed', 'sentence': 'And each of those feet is made up of a dactyl, a long and two shorts, a dum [unknown].', 'sentence_idx': '630844', 'construction': 'a long and two shorts', 'pattern': 'DT JJ CC CD NNS', 'DT': 'a', 'ADJ': 'long', 'NUMERAL': 'two', 'NOUN': 'shorts', 'ADV': ''}
{'source': 'qed', 'sentence': 'So, the basic element of Homeric epic poetry comes down to a long and two shorts.', 'sentence_idx': '630845', 'construction': 'a long and two shorts', 'pattern': 'DT JJ CC CD NNS', 'DT': 'a', 'ADJ': 'long', 'NUMERAL': 'two', 'NOUN': 'shorts', 'ADV': ''}
{'source': 'qed', 'sentence': "It's also the case that any of these dactyls a long and two shorts can be replaced by a spondee two longs.", 'sentence_id

In [6]:
def get_sents(aann_collection, editor=None):
    """Collect the de-duplicated sentences that contain each instance's construction.

    Each instance is parsed with `utils.parse_instance`. Instances whose parsed
    construction string does not occur verbatim in `instance['sentence']` are
    skipped. The text is then split with `sent_tokenize`; if it yields a single
    sentence that sentence is kept, otherwise every sentence containing the
    construction string is kept.

    Args:
        aann_collection: iterable of mappings with at least the keys
            'sentence' and 'construction' (plus whatever
            `utils.parse_instance` reads).
        editor: optional callable applied to the parsed construction. When
            given, `instance['construction']` is replaced in the sentence by
            the edited construction's string before tokenizing, and the edited
            string is the one searched for afterwards.

    Returns:
        A list of the collected sentences, de-duplicated through `OrderedSet`.

    Side effects:
        Prints the construction and sentence of every multi-sentence instance
        for which no individual sentence contains the construction string.
    """
    aann_sents = []
    for aann in aann_collection:
        construction = utils.parse_instance(aann)
        if construction.string not in aann['sentence']:
            # The parsed construction cannot be located in the raw sentence; skip it.
            continue

        if editor is not None:
            construction = editor(construction)
            # NOTE(review): the replacement is keyed on aann['construction'], while the
            # check above uses the parsed construction's string — confirm these agree.
            sentence = aann['sentence'].replace(aann['construction'], construction.string)
        else:
            sentence = aann['sentence']

        sents = sent_tokenize(sentence)
        if len(sents) == 1:
            aann_sents.append(sents[0])
            continue

        # Track matches per instance. A flag shared across iterations would stay
        # True after the first hit and silence every later miss report.
        matching = [sent for sent in sents if construction.string in sent]
        aann_sents.extend(matching)
        if not matching:
            print(aann['construction'], aann['sentence'])

    return list(OrderedSet(aann_sents))

In [7]:
# Editor for the corrupted ("nan") variant: compose applies right-to-left, so
# corrupt_article runs first and corrupt_order second.
corrupt = compose(editors.corrupt_order, editors.corrupt_article)

# Positive class: sentences containing an AANN construction.
sents_aann = get_sents(aanns)

# Corrupted AANN sentences, down-sampled to half the number of AANN sentences.
sents_nan = random.sample(get_sents(aanns, editor=corrupt), len(sents_aann) // 2)

# Sentences with a non-target determiner, down-sampled to the number of AANN sentences.
sents_no_a_an = random.sample(get_sents(aanns_non_a_an), len(sents_aann))

In [8]:
# Mean number of whitespace-separated tokens per AANN sentence
token_counts = [len(sent.split()) for sent in sents_aann]
print(sum(token_counts) / len(token_counts))

21.1976401179941


In [9]:
# Randomly sample filler sentences from BabyLM: shuffle the corpus, then take
# the first sentences that have a suitable length and are not AANN sentences,
# until there are twice as many as AANN sentences.
MIN_TOKENS, MAX_TOKENS = 7, 22  # exclusive bounds on the whitespace-token count
n_babylm_target = 2 * len(sents_aann)
sents_aann_set = set(sents_aann)  # set membership instead of a list scan per corpus sentence

random.shuffle(babylm_sents)  # in place, driven by the seeded `random` module
babylm_sampled_sents = []
for sent in babylm_sents:
    if len(babylm_sampled_sents) >= n_babylm_target:
        # Quota reached; no need to scan the rest of the corpus.
        break
    if MIN_TOKENS < len(sent.split()) < MAX_TOKENS and sent not in sents_aann_set:
        babylm_sampled_sents.append(sent)

In [10]:
# Dataset composition (sizes are relative to the number of AANN sentences):
# - AANN sentences (the positive class)
# - corrupted AANN sentences ("nan"): a random sample half that size
# - sentences with a determiner other than a/an/another: a random sample of equal size
# - randomly drawn BabyLM sentences: up to twice that size

len(sents_aann), len(sents_nan), len(sents_no_a_an), len(babylm_sampled_sents)

(1017, 508, 1017, 2034)

In [11]:
# Assemble the classifier data: AANN sentences are labelled 1, all others 0.
negative_sents = sents_nan + sents_no_a_an + babylm_sampled_sents

dataset = pd.DataFrame({
    'sentence': sents_aann + negative_sents,
    'label': [1] * len(sents_aann) + [0] * len(negative_sents),
})
dataset.to_csv("../data/aann_classifier_data.csv", index=False)

In [12]:
# Display the assembled DataFrame (the notebook shows a truncated preview).
dataset

Unnamed: 0,sentence,label
0,We're talking a few thousand dollars!,1
1,"Mister, you wouldn't want to own a dog that co...",1
2,"I see .. all in all, a fine eighteen months.",1
3,"And to blow up the bathtub, we're stretching t...",1
4,We were filming for a good 12 hours or so.,1
...,...,...
4571,At creating kind of off-balance sheet entities...,0
4572,Why don't you talk to her about it.,0
4573,friend inquired whether he had yet heard about...,0
4574,Do you know that your mother was married befor...,0
