In [1]:
import re
import utils
import spacy.lang.en as en

from tqdm import tqdm
from collections import defaultdict

In [2]:
nlp = en.English()
tokenizer = nlp.tokenizer



In [3]:
postags = utils.read_file("/home/km55359/rawdata/babylm_data/babylm_100M/sents/postags.txt")
sents = utils.read_file("/home/km55359/rawdata/babylm_data/babylm_100M/sents/babylm_sents.txt")

In [4]:
aadj_regex = re.compile(r"\bDT (JJ|JJR|JJS)")
anum_regex = re.compile(r"\bDT CD")
adjnum_regex = re.compile(r"\b(JJ|JJR|JJS) CD")

In [5]:
# def get_aadj():
#     aadj_sents = []
#     for i in range(len(postags)):
#         if aadj_regex.search(postags[i]):
#             aadj_sents.append(sents[i])
#     return aadj_sents
aadjs = []
anums = []
adjnums = []
aadjs_and_adjnums = []

for i, batch in enumerate(tqdm(zip(sents, postags))):
    sent, postag = batch
    sent_split, postag_split = [t.text for t in tokenizer(sent)], postag.split()
    
    aadj = aadj_regex.search(postag)
    if aadj:
        idx = len(postag[:aadj.start()].split())
        det = sent_split[idx]
        adj = sent_split[idx+1]
        if det.lower() in ["a", "an", "another"]:
            aadjs.append({
                'sentence_idx': i,
                'DT': det,
                'ADJ': adj,
                'sentence': sent
            })

    anum = anum_regex.search(postag)
    if anum:
        idx = len(postag[:anum.start()].split())
        det = sent_split[idx]
        num = sent_split[idx+1]
        if det.lower() in ["a", "an", "another"]:
            anums.append({
                'sentence_idx': i,
                'DT': det,
                'NUMERAL': num,
                'sentence': sent
            })
    
    adjnum = adjnum_regex.search(postag)
    if adjnum:
        idx = len(postag[:adjnum.start()].split())
        adj = sent_split[idx]
        num = sent_split[idx+1]
        adjnums.append({
            'sentence_idx': i,
            'ADJ': adj,
            'NUMERAL': num,
            'sentence': sent
        })

    if aadj and adjnum:
        if det.lower() in ["a", "an", "another"]:
            aadjs_and_adjnums.append({
                'sentence_idx': i,
                'DT': det,
                'ADJ': adj,
                'NUMERAL': num,
                'sentence': sent
            })
        


0it [00:00, ?it/s]

11632617it [03:24, 56982.77it/s]


In [6]:
len(anums), len(aadjs), len(adjnums), len(aadjs_and_adjnums)

(42111, 613985, 42777, 7108)

In [11]:
613985 - 42111

571874

In [7]:
'''
Maybe AANNs are learned because AA is more frequent than AN --> so what if we remove AAs such that there are equal number of AAs and ANs?
'''
# aadjs_and_adjnums

'\nMaybe AANNs are learned because AA is more frequent than AN --> so what if we remove AAs such that there are equal number of AAs and ANs?\n'

In [12]:
''' 
Count numerals from anums and adjs from aadjs, such that the number of times the most frequent numeral succeeds the indef article is the same as the number of times the most frequent adjective succeeds the indef article.
'''

frequency = utils.read_csv_dict("../data/babylm-analysis/babylm-unigrams.csv")
freq_table = {}
for entry in frequency:
    freq_table[entry['word']] = int(entry['count'])

In [13]:
freq_table['hundred']

17810

In [14]:
by_nums = defaultdict(list)
by_adjs = defaultdict(list)

for entry in aadjs:
    by_adjs[entry['ADJ'].lower()].append(entry)

for entry in anums:
    by_nums[entry['NUMERAL'].lower()].append(entry)

by_nums = dict(by_nums)
by_adjs = dict(by_adjs)

# by_num_freq = {k: len(v) for k, v in by_nums.items()}
# by_adj_freq = {k: len(v) for k, v in by_adjs.items()}

sorted_by_nums = sorted(by_nums.items(), key = lambda x: len(x[1]), reverse=True)
sorted_by_adjs = sorted(by_adjs.items(), key = lambda x: len(x[1]), reverse=True)

In [15]:
def get_frequency(word):
    try:
        return freq_table[word]
    except: 
        return 0

In [16]:
import random

random.seed(42)

sampled_adjs = []

for ordered_adj, ordered_num in tqdm(zip(sorted_by_adjs, sorted_by_nums)):
    to_sample = min(len(ordered_adj[1]), len(ordered_num[1]))
    if to_sample == 0:
        continue
    sampled_adj = random.sample(ordered_adj[1], to_sample)
    obj = (ordered_adj[0], sampled_adj)
    sampled_adjs.append(obj)

2129it [00:00, 110803.74it/s]


In [17]:
len(sampled_adjs)

2129

In [18]:
flattened_adjs = []
for sa in sampled_adjs:
    flattened_adjs.extend(sa[1])

len(flattened_adjs)

42111

In [19]:
flattened_nums = []
for sn in sorted_by_nums:
    flattened_nums.extend(sn[1])

len(flattened_nums)

42111

In [20]:
flattened_nums

[{'sentence_idx': 5359,
  'DT': 'a',
  'NUMERAL': 'hundred',
  'sentence': 'a hundred projects that will disappear in three months.'},
 {'sentence_idx': 8208,
  'DT': 'a',
  'NUMERAL': 'hundred',
  'sentence': 'Jerry is on his way to Tokyo to check them out, and probably every geisha within a hundred miles.'},
 {'sentence_idx': 9551,
  'DT': 'a',
  'NUMERAL': 'hundred',
  'sentence': 'I support that a hundred percent.'},
 {'sentence_idx': 13369,
  'DT': 'a',
  'NUMERAL': 'hundred',
  'sentence': 'Jerry is on his way to Tokyo to check them out, and probably every geisha within a hundred miles.'},
 {'sentence_idx': 13783,
  'DT': 'a',
  'NUMERAL': 'hundred',
  'sentence': "Erm the first thing I'm,I I would like t t to draw out is that at the end of the year we made a profit of a hundred sixty thousand pounds."},
 {'sentence_idx': 13791,
  'DT': 'A',
  'NUMERAL': 'hundred',
  'sentence': 'A hundred and ninety.'},
 {'sentence_idx': 13801,
  'DT': 'A',
  'NUMERAL': 'hundred',
  'sentence': 

In [21]:
adj_and_num_ids = [f['sentence_idx'] for f in flattened_adjs] + [f['sentence_idx'] for f in flattened_nums]

adjs_to_remove = [entry for entry in aadjs if entry['sentence_idx'] not in adj_and_num_ids]

In [24]:
len(adjs_to_remove)

568876

In [27]:
adjs_and_nums = [entry for entry in aadjs if entry['sentence_idx'] in adj_and_num_ids]

KeyboardInterrupt: 

In [25]:
adj_and_num_ids

[10668887,
 1536384,
 311290,
 4508480,
 3917794,
 3478885,
 2050852,
 1371143,
 11354072,
 8976860,
 1131128,
 9823550,
 6911535,
 388992,
 366230,
 1228590,
 3360993,
 3672108,
 8231332,
 10014696,
 324419,
 9263961,
 3039296,
 10868707,
 8970657,
 6864068,
 3405926,
 7341510,
 9805990,
 4566974,
 83904,
 2387555,
 6933596,
 5684478,
 4561161,
 2316618,
 3304691,
 5620320,
 1366545,
 1219858,
 6288060,
 1273372,
 5988051,
 5742114,
 10051511,
 4318650,
 541334,
 7485779,
 8800638,
 1782899,
 6270636,
 1005724,
 9112073,
 4828642,
 10445835,
 10274753,
 6024060,
 9575167,
 2967488,
 892553,
 577391,
 11048564,
 3569146,
 4762936,
 1015656,
 3673795,
 1343423,
 6291430,
 4563521,
 7402234,
 10597350,
 6064645,
 2458119,
 6140560,
 5928161,
 3193764,
 11208749,
 4373140,
 11460367,
 10823974,
 914030,
 10125240,
 10583089,
 2618666,
 8758961,
 3914538,
 2471783,
 7533804,
 6281111,
 4428315,
 10677987,
 11533597,
 9180875,
 3380596,
 11476494,
 5413774,
 725699,
 3599743,
 392415,
 5253

In [29]:
# import pandas as pd

# adjs_to_remove_df = pd.DataFramae(adjs_to_remove)

# adjs_to_remove_df.to_csv("../data/babylm-analysis/adjs_to_remove.csv", index=False)

In [30]:
adjs_to_remove_df

Unnamed: 0,sentence_idx,DT,ADJ,sentence
0,6,a,single,This is because with this method you remain on...
1,39,a,little,"So you kind of have to, a little bit, coddle t..."
2,46,an,old,"Watergate hearings, when he'd go, well, I'll j..."
3,50,a,little,And so what we fraudsters do is we act a littl...
4,75,a,few,Out of a few hundred million dollars worth of ...
...,...,...,...,...
568871,11632367,A,FIRST,THIS MUST BE A FIRST.
568872,11632408,A,PRETTY,THESE PHOTOGRAPHS GIVE A PRETTY GRAPHIC PICTURE.
568873,11632441,A,NICE,"HE'S A NICE GUY, DAVID."
568874,11632442,A,NICE,I KNOW HE'S A NICE GUY.


In [17]:
# import pandas as pd

# num_df = pd.DataFrame(flattened_nums)
# adj_df = pd.DataFrame(flattened_adjs)

# num_df.to_csv("../data/babylm-analysis/a_nums.csv", index=False)
# adj_df.to_csv("../data/babylm-analysis/a_adjs.csv", index=False)

Unnamed: 0,sentence_idx,DT,NUMERAL,sentence
0,5359,a,hundred,a hundred projects that will disappear in thre...
1,8208,a,hundred,Jerry is on his way to Tokyo to check them out...
2,9551,a,hundred,I support that a hundred percent.
3,13369,a,hundred,Jerry is on his way to Tokyo to check them out...
4,13783,a,hundred,"Erm the first thing I'm,I I would like t t to ..."
...,...,...,...,...
42106,11518920,a,452.1C,"In fact, if Chief Pope were to identify these ..."
42107,11561237,a,42.9,Dancing with a 42.9 degree temperature?
42108,11596406,a,1920s,Her new book included a 1920s essay about an I...
42109,11620232,A,289,A 289 with dual carbs and a pony interior.


In [28]:
adjnums

[{'sentence_idx': 75,
  'ADJ': 'few',
  'NUMERAL': 'hundred',
  'sentence': "Out of a few hundred million dollars worth of checks that were written, the auditors never saw one canceled cashier's check."},
 {'sentence_idx': 241,
  'ADJ': 'obvious',
  'NUMERAL': 'one',
  'sentence': 'Let me just state one super obvious one.'},
 {'sentence_idx': 247,
  'ADJ': 'last',
  'NUMERAL': 'two',
  'sentence': "The last two reasons I'm going to give you might sound a little flipping but, you know, I, I think there is more than a grain of truth to both of them."},
 {'sentence_idx': 286,
  'ADJ': 'final',
  'NUMERAL': 'two',
  'sentence': 'And you compute the final two partial products using the two and the one.'},
 {'sentence_idx': 341,
  'ADJ': 'first',
  'NUMERAL': 'two',
  'sentence': 'The first two digits of the four in x.'},
 {'sentence_idx': 342,
  'ADJ': 'other',
  'NUMERAL': 'two',
  'sentence': 'And then b would be the other two digits.'},
 {'sentence_idx': 384,
  'ADJ': 'first',
  'NUMERAL