In [124]:
import torch
import transformers
import numpy as np
from tqdm import tqdm

In [10]:
old_tokenizer = transformers.AutoTokenizer.from_pretrained("roberta-base")

In [11]:
def get_training_corpus():
    global c
    c = 0
    with open('../data/babylm_10M/train.txt') as f:
        for line in f:
            c += 1
            yield line.strip()

In [12]:
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 16_000)

In [13]:
new_tokenizer.save_pretrained("babylm_bpe_tokenizer_16k")

('babylm_bpe_tokenizer_16k/tokenizer_config.json',
 'babylm_bpe_tokenizer_16k/special_tokens_map.json',
 'babylm_bpe_tokenizer_16k/vocab.json',
 'babylm_bpe_tokenizer_16k/merges.txt',
 'babylm_bpe_tokenizer_16k/added_tokens.json',
 'babylm_bpe_tokenizer_16k/tokenizer.json')

In [14]:
new_tokenizer.push_to_hub("babylm_bpe_tokenizer_16k", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/babylm_bpe_tokenizer_16k into local empty directory.
To https://huggingface.co/omarmomen/babylm_bpe_tokenizer_16k
   4286ea9..6200549  main -> main



'https://huggingface.co/omarmomen/babylm_bpe_tokenizer_16k/commit/6200549f326db3c222731f1fa913a20618dc02d6'

In [22]:
c = 0
for line in tqdm(open('../data/babylm_10M/train.txt')):
    new_tokens = new_tokenizer.encode(line.strip(), add_special_tokens=False, truncation=True, max_length=500)
    if len(new_tokens) > 500:
        c = c + 1

1015494it [00:32, 30915.98it/s]


In [1]:
import sys
sys.path.append("..")
import data_penn
import data_ptb
import data_ptb_subword
import numpy as np
import torch
import seaborn as sns
import tree_utils
import collections
import nltk
import os
import matplotlib as mpl
from nltk.parse import DependencyGraph
import matplotlib.pyplot as plt
import benepar
import spacy
nlp = spacy.load('en_core_web_lg')
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3_large"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3_large"})
from tqdm import tqdm
device = 'cuda'

In [2]:
parser = benepar.Parser("benepar_en3_large")

In [3]:
model = torch.load("../trained_models/babylm_1111_in_parser_sf.pt")[0]

In [4]:
dataset = data_ptb_subword.SubWord_Corpus_Custom("omarmomen/babylm_bpe_tokenizer_16k", "../data/babylm_10M/")

Tokenizing data...


1015494it [00:43, 23463.12it/s]


Tokenizing data...


96105it [00:04, 22144.17it/s]


Tokenizing data...


95681it [00:04, 19587.26it/s]


In [7]:
def tokenize_tree(tree, tokenizer, flag):
    if isinstance(tree, list):
        tokenized_tree = []
        for child in tree:
            if isinstance(child, str):
                if flag:
                    subwords = tokenizer.tokenize(child)
                else:
                    if child in ["'s"]:
                        subwords = [child]
                    else:
                        subwords = tokenizer.tokenize(' '+child)
                flag = False
                subwords = subwords[0] if len(subwords) == 1 else subwords
                tokenized_tree.append(subwords)
            else:
                t, flag = tokenize_tree(child, tokenizer, flag) 
                tokenized_tree.append(t)
        return tokenized_tree, flag
    return tree, flag

In [5]:
NEG_POSS = ['n\'t', '\'s', '\'m', '\'re', '\'ve', '\'d', '\'ll', '%']
PUNCT = ['..', '...',  ',,', ',,,', '.', ',', '?', '!', ':', ';', '-', '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\', '|', '_', '@', '#', '$', '^', '&', '*', '~', '`', '+', '=', '"', "'"]

In [6]:
def tree2list(tree):
    if isinstance(tree, nltk.Tree):
        root = []
        for child in tree:
            c = tree2list(child)
            if c:
                root.append(c)
        if len(root) > 1:
            return root
        elif len(root) == 1:
            return root[0]
        if len(tree.leaves())>0:
            return tree.leaves()[0]
        else:
            return ' '
    return []

def recall_subword_constituents_custom(induced_tree_list, induced_tree_consts):
    
    def flatten_tree(tree):
        """Flatten a tree of strings represented as nested lists into a flat list of strings."""
        flat_list = []

        def flatten(subtree):
            for item in subtree:
                if isinstance(item, list):
                    flatten(item)  # Recursive call if the item is a list
                else:
                    flat_list.append(item)  # Add the string to the flat list

        flatten(tree)
        return flat_list
    
    flat_tree_list = flatten_tree(induced_tree_list)
    true_subword_constituents = set()
    for i, token in enumerate(flat_tree_list):
        if (i+1) < len(flat_tree_list):
            if token.startswith("Ġ") or i==0:
                if not flat_tree_list[i+1].startswith("Ġ"):
                    for j, next_token in enumerate(flat_tree_list[i+1:]):
                        if next_token.startswith("Ġ"):
                            break 
                    if (i+j+1) == len(flat_tree_list)-1:
                        flag = False
                        for sub_token in flat_tree_list[i+1:i+j+2]:
                            # only consider a subword constituent if it contains at least one letter
                            for char in sub_token:
                                if char.isalpha():
                                    flag = True
                                    break
                        if flag:
                            if (not flat_tree_list[i+j+1].isalpha()) and j>0:
                                true_subword_constituents.add((i,i+j+1))
                            else:
                                true_subword_constituents.add((i,i+j+2))
                    else:
                        flag = False
                        for sub_token in flat_tree_list[i+1:i+j+1]:
                            # only consider a subword constituent if it contains at least one letter
                            for char in sub_token:
                                if char.isalpha():
                                    flag = True
                                    break
                        if flag:
                            if not flat_tree_list[i+j].isalpha() and j>1:
                                true_subword_constituents.add((i,i+j))
                            else:
                                true_subword_constituents.add((i,i+j+1))
    total_subword_constituents = len(true_subword_constituents)
    correct_subword_constituents = len(induced_tree_consts.intersection(true_subword_constituents))
    
    if total_subword_constituents == 0 or len(induced_tree_consts) == 0:
        return 1.0, total_subword_constituents
    else:
        return (correct_subword_constituents/total_subword_constituents), total_subword_constituents


In [8]:
prec_list = []
reca_list = []
f1_list = []
subword_reca_list = []
n_subword_const_list = []
n_total_induced_const_list = []
n_total_ref_const_list = []

for i, sent in tqdm(enumerate(dataset.test_sens)):
    # get the induced tree
    x = torch.LongTensor([dataset.test[i]]).to(device)
    pos = torch.LongTensor([list(range(len(dataset.test[i])))]).to(device)
    tokens = dataset.tokenizer.tokenize(dataset.test_sens[i], add_special_tokens=False, truncation=True, max_length=500)
    with torch.no_grad():
        output, p_dict = model(x, pos)
    distance = p_dict['distance'][0].detach().cpu().numpy()
    height = p_dict['height'][0].detach().cpu().numpy()
    induced_tree_list = tree_utils.build_tree(distance, tokens)
    induced_tree_consts, m = tree_utils.get_brackets(induced_tree_list)
    
    # get the reference tree
    words = []
    spaces = []
    for token in tokens:
        if token.startswith("Ġ"):
            if token == "Ġ" or token == "ĠĠ":
                words.append(" ")
            else:
                words.append(token[1:])
            spaces.append(True)
        else:
            words.append(token)
            spaces.append(False)
    spaces = spaces[1:]
    spaces.append(False)
    try:
        input_sentence1 = benepar.InputSentence(
            words=words,
            space_after=spaces
        )
        ref_tree = parser.parse(input_sentence1)
    except Exception as e:
        print("Sentence cannot be parsed, skipping..")
        print(f"{i}: {sent}")
        print(e)
        continue
    ref_tree_list = tree2list(ref_tree)
    ref_tree_consts, n = tree_utils.get_brackets(ref_tree_list)
    if m != n:
        print(f"brackets don't match.. {m} != {n}")
        print(f"{i}: {sent}")
        print(induced_tree_list)
        print(ref_tree_list)
        raise ValueError
    
    overlap = induced_tree_consts.intersection(ref_tree_consts)
    
    prec = float(len(overlap)) / (len(induced_tree_consts) + 1e-8)
    reca = float(len(overlap)) / (len(ref_tree_consts) + 1e-8)
    if not ref_tree_consts:
        reca = 1.
        if not induced_tree_consts:
            prec = 1.
    f1 = 2 * prec * reca / (prec + reca + 1e-8)
    
    subword_reca, n_subword_const = recall_subword_constituents_custom(induced_tree_list, induced_tree_consts)
    
    # print(prec, reca, f1, subword_reca, n_subword_const)
    # print(induced_tree_list)
    # print(ref_tree_list)
    # print("=====================================")
    
    prec_list.append(prec)
    reca_list.append(reca)
    f1_list.append(f1)
    subword_reca_list.append(subword_reca)
    n_subword_const_list.append(n_subword_const)
    n_total_induced_const_list.append(len(induced_tree_consts))
    n_total_ref_const_list.append(len(ref_tree_consts))

print(f"precision: {np.mean(prec_list)}")
print(f"recall: {np.mean(reca_list)}")
print(f"f1: {np.mean(f1_list)}")
print(f"subword recall: {np.mean(subword_reca_list)}")
print(f"n_subword_const: {np.sum(n_subword_const_list)}")
print(f"n_total_induced_const: {np.sum(n_total_induced_const_list)}")
print(f"n_total_ref_const: {np.sum(n_total_ref_const_list)}")


0it [00:00, ?it/s]

  'with `validate_args=False` to turn off validation.')
28814it [38:44, 12.20it/s]

Sentence cannot be parsed, skipping..
28811: Kristen Stewart was on the set of "Adventureland" when Hardwicke visited her for an informal screen test that "captivated" the director. Hardwicke had trouble finding an actor otherworldly enough to play vampire Edward Cullen. Then she got a call about a guy in London. "I looked at a couple pictures and was like, ‘I’m not sure,’?" Hardwicke says. "He had been fired from his last job, he was unemployed, he was in debt." Pattinson flew to Los Angeles on his own dime to read with Stewart. Shiloh Fernandez, Jackson Rathbone, Ben Barnes, and Robert Pattinson were the final four up for the role of Edward. Hardwicke did not initially choose Robert Pattinson for the role of Edward Cullen, but after an audition at her home with Stewart, he was selected. Hardwicke said, "Kristen was like, ‘It’s got to be Rob!’ She felt connected to him from the first moment. That electricity, or love at first sight, or whatever it is." Hardwicke gave him the part, but

40165it [54:04, 11.19it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
40169it [54:05, 11.75it/s]

Sentence cannot be parsed, skipping..
40166: However, as is most often the case with film adaptations, differences do exist between the film and source material. Certain scenes from the book were cut from the film, such as a biology room scene where Bella's class does blood typing. Hardwicke explains, "Well [the book is] almost 500 pages—you do have to do the sweetened condensed milk version of that... We already have two scenes in biology: the first time they're in there and then the second time when they connect. For a film, when you condense, you don't want to keep going back to the same setting over and over. So that's not in there." The settings of certain conversations in the book were also changed to make the scenes more "visually dynamic" on-screen, such as Bella's revelation that she knows Edward is a vampire—this happens in a meadow in the film instead of in Edward's car as in the novel. A biology field trip scene is added to the film to condense the moments of Bella's frustr

44956it [59:52, 11.48it/s]

Sentence cannot be parsed, skipping..
44954: last question and i'm going to get into subjective i'd like to cover before the end it you talk about and seminar cannot star emotional came instead enacting and uh... did you use the nancy families and suffering yes experienced nightmares matter uh... yes and other people worse than it was for yourself and you were participant well you know abhorred for example if you follow me and my work you would see this very clearly in places like rwanda thoroughly sierra leon i'm working with people that have had their families some of those people have such rage that only lived for a moment by moment is the possibility of engines others have knowing had never had c_n_n_'s extended they had deep feelings not great so it is not the stimulus that determines how r u motion reaction that part is up to us i work with some women unfortunately a lot who've been raped and some of them feel shame deep shag some feel rage some field other things the same stimul

49051it [1:05:08, 13.64it/s]

Sentence cannot be parsed, skipping..
49049: In 2008 and 2009, Moore traveled to Detroit to portray in photographs “the idea that in an urban setting you could also have a landscape happening, the forces of nature intersecting with American urbanism, the process of decline also intersecting with the revival of nature.” In 2010, Moore released Detroit Disassembled (Damiani, 2010), with an introduction by Detroit-native and Poet Laureate Philip Levine, to coincide with an exhibition at the Akron Art Museum. He was originally invited to document the city by two young French photographers, Yves Marchand and Romain Meffre, who had been photographing Detroit’s abandoned spaces since 2005. While Moore’s Detroit series follows the themes of transformation and decaying space explored in previous bodies of work, his focus on the motor city generated controversy in the pages of "The New Republic" and the journal "Guernica". The photographs were decried as “ruin porn,” which Mike Rubin defined in 

51728it [1:08:26, 14.16it/s]

Sentence cannot be parsed, skipping..
51725: list where u what you say to yourself when you're less than perfect now for each judgment think of what might be in the stimulus for we've got to relate each of these to a specific context thirty years i must say the first thing is you have in your list mister barone when a dumb thing to do it what you might have done to stimulate then put out here at the news and here than ny stupid i'm saying that all judgments a tragic expressions of unmet need ask yourself if i when i say that to myself in that situation house too what we need m_i_a_ expressing through that judgment what we have mine isn't getting met and here's where you can use the list in my my book if you can't come up with it yourself first look through the list and your body will tell you when you didn't call ripken's uh... yeah yeah yeah that's what made need comes much closer to the truth than any judgment unit so do that for every item on the list what you tell yourself when you

65177it [1:25:15, 12.19it/s]

Sentence cannot be parsed, skipping..
65175: “Podkoleosin” [A character in Gogol’s comedy, The Wedding.] was perhaps an exaggeration, but he was by no means a non-existent character; on the contrary, how many intelligent people, after hearing of this Podkoleosin from Gogol, immediately began to find that scores of their friends were exactly like him! They knew, perhaps, before Gogol told them, that their friends were like Podkoleosin, but they did not know what name to give them. In real life, young fellows seldom jump out of the window just before their weddings, because such a feat, not to speak of its other aspects, must be a decidedly unpleasant mode of escape; and yet there are plenty of bridegrooms, intelligent fellows too, who would be ready to confess themselves Podkoleosins in the depths of their consciousness, just before marriage. Nor does every husband feel bound to repeat at every step, “Tu l’as voulu, Georges Dandin!” like another typical personage; and yet how many milli

66017it [1:26:20, 12.32it/s]

Sentence cannot be parsed, skipping..
66015: He was an elderly man, of large and majestic person, and strong, square features, betokening a steady soul; but steady as it was, his enemies had found means to shake it. His face was pale as death, and far more ghastly; the broad forehead was contracted in his agony, so that his eyebrows formed one grizzled line; his eyes were red and wild, and the foam hung white upon his quivering lip. His whole frame was agitated by a quick and continual tremor, which his pride strove to quell, even in those circumstances of overwhelming humiliation. But perhaps the bitterest pang of all was when his eyes met those of Robin; for he evidently knew him on the instant, as the youth stood witnessing the foul disgrace of a head grown gray in honor. They stared at each other in silence, and Robin's knees shook, and his hair bristled, with a mixture of pity and terror. Soon, however, a bewildering excitement began to seize upon his mind; the preceding adventure

67501it [1:28:14, 14.15it/s]

Sentence cannot be parsed, skipping..
67498: [female]well you have in the past, all i am saying is that [female]if you are headed that way, i mean ... [female] all i'm saying is .i dont want to be the one, i dont want to be the blame for you not going to work [female] i have nothing to do with it, [female] i am not making you stay here continuing arguing [female] i saying that if you need to go to work that's fine, i understand and respect that [female] it dont matter how mad i am at you or not [female] if you got to go to work then you got to go to work [female] it has nothing to do with anything [female] i just dont fight anymore [female] i dont want....... and blowing all up... [male]..you where the one... [female] not i dont, you where the one that started all of the fight this morning [female] i have no intrest in fighting with you at all [female] your asking me questions .. i didnt say...[female] im giving you answers [male] i didnt say.. i didnt say anything about who started it

95681it [2:03:07, 12.95it/s]

precision: 0.27150490622138335
recall: 0.5538122292072093
f1: 0.3211875279557198
subword recall: 0.6808046866840887
n_subword_const: 153749
n_total_induced_const: 1177663
n_total_ref_const: 658562





In [None]:
# 28817it [10:31, 44.46it/s]
# Sentence cannot be parsed, skipping..
# 28811: Kristen Stewart was on the set of "Adventureland" when Hardwicke visited her for an informal screen test that "captivated" the director. Hardwicke had trouble finding an actor otherworldly enough to play vampire Edward Cullen. Then she got a call about a guy in London. "I looked at a couple pictures and was like, ‘I’m not sure,’?" Hardwicke says. "He had been fired from his last job, he was unemployed, he was in debt." Pattinson flew to Los Angeles on his own dime to read with Stewart. Shiloh Fernandez, Jackson Rathbone, Ben Barnes, and Robert Pattinson were the final four up for the role of Edward. Hardwicke did not initially choose Robert Pattinson for the role of Edward Cullen, but after an audition at her home with Stewart, he was selected. Hardwicke said, "Kristen was like, ‘It’s got to be Rob!’ She felt connected to him from the first moment. That electricity, or love at first sight, or whatever it is." Hardwicke gave him the part, but he had to make a promise. "You’ve got to realize that Kristen is 17 years old," Hardwicke told him, "She’s underage. You’ve got to focus, dude, or you’re going to be arrested. I made him swear on a stack of Bibles." Pattinson was unfamiliar with the novel series prior to his screen test but read the books later on. Meyer allowed him to view a manuscript of the unfinished "Midnight Sun", which chronicles the events in "Twilight" from Edward's point of view. Fan reaction to Pattinson's casting as Edward was initially negative; Rachelle Lefèvre remarked that "[e]very woman had their own Edward [that] they had to let go of before they could open up to [him], which they did." Meyer was "excited" and "ecstatic" in response to the casting of the two main characters. She had expressed interest in having Emily Browning and Henry Cavill cast as Bella and Edward, respectively, prior to pre-production.
# Sentence of length 513 (in sub-word tokens) exceeds the maximum supported length of 512
# 40165it [14:40, 44.69it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
# 40175it [14:40, 45.34it/s]
# Sentence cannot be parsed, skipping..
# 40166: However, as is most often the case with film adaptations, differences do exist between the film and source material. Certain scenes from the book were cut from the film, such as a biology room scene where Bella's class does blood typing. Hardwicke explains, "Well [the book is] almost 500 pages—you do have to do the sweetened condensed milk version of that... We already have two scenes in biology: the first time they're in there and then the second time when they connect. For a film, when you condense, you don't want to keep going back to the same setting over and over. So that's not in there." The settings of certain conversations in the book were also changed to make the scenes more "visually dynamic" on-screen, such as Bella's revelation that she knows Edward is a vampire—this happens in a meadow in the film instead of in Edward's car as in the novel. A biology field trip scene is added to the film to condense the moments of Bella's frustration at trying to explain how Edward saved her from being crushed by a van. The villainous vampires are introduced earlier in the film than in the novel. Rosenberg said that "you don't really see James and the other villains until to the last quarter of the book, which really won't work for a movie. You need that ominous tension right off the bat. We needed to see them and that impending danger from the start. And so I had to create back story for them, what they were up to, to flesh them out a bit as characters." Rosenberg also combined some of the human high school students, with Lauren Mallory and Jessica Stanley in the novel becoming the character of Jessica in the film, and a "compilation of a couple of different human characters" becoming Eric Yorkie. About these variances from the book, Mooradian stated, "I think we did a really judicious job of distilling [the book]. Our greatest critic, Stephenie Meyer, loves the screenplay, and that tells me that we made all the right choices in terms of what to keep and what to lose. Invariably, you're going to lose bits and pieces that certain members of the audience are going to desperately want to see, but there's just a reality that we're not making 'Twilight: The Book' the movie."
# Sentence of length 528 (in sub-word tokens) exceeds the maximum supported length of 512
# 44960it [16:22, 45.13it/s]
# Sentence cannot be parsed, skipping..
# 44954: last question and i'm going to get into subjective i'd like to cover before the end it you talk about and seminar cannot star emotional came instead enacting and uh... did you use the nancy families and suffering yes experienced nightmares matter uh... yes and other people worse than it was for yourself and you were participant well you know abhorred for example if you follow me and my work you would see this very clearly in places like rwanda thoroughly sierra leon i'm working with people that have had their families some of those people have such rage that only lived for a moment by moment is the possibility of engines others have knowing had never had c_n_n_'s extended they had deep feelings not great so it is not the stimulus that determines how r u motion reaction that part is up to us i work with some women unfortunately a lot who've been raped and some of them feel shame deep shag some feel rage some field other things the same stimulus depends how people take it with me feel shame rage or other things working with the woman from rwanda meant she heard her three children being killed concei got underneath the sink hid underneath the sink in time children to make it to the hiding place in time they got killed she heard them your husband being you know brother she had to stay underneath it eleven days to save the life 'cause they stayed in the house after they killed and this woman has deep feelings but never once said she had the kind of thing that makes you want to get vengeance she's put all of her feelings and lots of into protecting preventing this happening to anybody else so the way she looked at it means here to want to prevent this happening to anybody else she came to my work yet because you want to have you with the rage toward her from other people in her trial who work here is we've heard it you don't want she will join their efforts killian faced in the last quite different reactions okay so i had a stimulus uh... to deal with him away had dealt with it and i'm going to change their house the worst thing of course would be no matter how you get used to deal with this to think it was something wrong with how you chose to do them not wanting us to get into one was right or wrong and just stand that no matter what happens to us the other persons responsible for what they do and i'm not saying the other person doesn't have a responsibility question about catholic and persons responsible for what they did what he did we're responsible for how we deal with that
# Sentence of length 547 (in sub-word tokens) exceeds the maximum supported length of 512
# 49054it [17:49, 40.84it/s]
# Sentence cannot be parsed, skipping..
# 49049: In 2008 and 2009, Moore traveled to Detroit to portray in photographs “the idea that in an urban setting you could also have a landscape happening, the forces of nature intersecting with American urbanism, the process of decline also intersecting with the revival of nature.” In 2010, Moore released Detroit Disassembled (Damiani, 2010), with an introduction by Detroit-native and Poet Laureate Philip Levine, to coincide with an exhibition at the Akron Art Museum. He was originally invited to document the city by two young French photographers, Yves Marchand and Romain Meffre, who had been photographing Detroit’s abandoned spaces since 2005. While Moore’s Detroit series follows the themes of transformation and decaying space explored in previous bodies of work, his focus on the motor city generated controversy in the pages of "The New Republic" and the journal "Guernica". The photographs were decried as “ruin porn,” which Mike Rubin defined in "The New York Times" as “urban decay as empty cliché, smacking of voyeurism and exploitation.” Curator Sarah Kennel writes in "The Memory of Time", an exhibition catalog from the National Gallery of Art, that, “in Moore’s photographs, ruination serves more explicitly as an allegory of modernity’s failure.” Other critics argue that whether or not Moore’s Detroit photographs fit the category of “ruin porn” is a matter of academic debate. Joseph Stanhope Cialdella argues in the journal "Environmental History" that Moore’s work instead conveys the “aesthetic of a postindustrial sublime” which “gives nature the authority to transform the image of Detroit into a novel, yet disturbing landscape that blurs the lines between wilderness and the city.” Dora Apel writes in "Beautiful Terrible Ruins" that Moore’s “pictures of Detroit tend to emphasize the relationship of nature and culture, with nature in the ascendancy.” Apel ultimately argues that the “ruin porn” images and debate fail to focus on the political and economic policies that are the root causes of the ruins.
# Sentence of length 535 (in sub-word tokens) exceeds the maximum supported length of 512
# 51732it [18:46, 46.88it/s]
# Sentence cannot be parsed, skipping..
# 51725: list where u what you say to yourself when you're less than perfect now for each judgment think of what might be in the stimulus for we've got to relate each of these to a specific context thirty years i must say the first thing is you have in your list mister barone when a dumb thing to do it what you might have done to stimulate then put out here at the news and here than ny stupid i'm saying that all judgments a tragic expressions of unmet need ask yourself if i when i say that to myself in that situation house too what we need m_i_a_ expressing through that judgment what we have mine isn't getting met and here's where you can use the list in my my book if you can't come up with it yourself first look through the list and your body will tell you when you didn't call ripken's uh... yeah yeah yeah that's what made need comes much closer to the truth than any judgment unit so do that for every item on the list what you tell yourself when you're angry at others again identified concretely what the other person might have done this stimulate then ask yourself this question when i judge people as idiot doing what you don't mind was not being met in that city again trying to get it without my list if you can't find it looked to my list the fine when it comes close the third with what others say to you to get your defense practice putting on the giraffe years imagine what you did the stimulated and in that situation guess what the other persons needs were that we're getting them so we feel it's just learning a new language learning for every time there are these jackal judgment through as quickly as possible to bring yourself back to life or more specifically connected in needs a lot requested and city i'd need another person you know expectations me yes well first of all never hear an expectation that's that's thoughts expectations are thoughts don't care don't even hear expectations here what that means what does that mean that the person is asking for you to me you know i don't live up to expectations but that is fun to meet me do you think every all of our ministry i don't think you have to do it this time will bill you know the people that could begin processing even if you could do it you may choose not to and that will be a problem the other person can hear at no it's the first few empathy for their feelings and that will lead them feeling at least at their feelings in these matters but then again that you have to know how to say no injuries kapiti
# Sentence of length 543 (in sub-word tokens) exceeds the maximum supported length of 512
# 65185it [23:36, 45.04it/s]
# Sentence cannot be parsed, skipping..
# 65175: “Podkoleosin” [A character in Gogol’s comedy, The Wedding.] was perhaps an exaggeration, but he was by no means a non-existent character; on the contrary, how many intelligent people, after hearing of this Podkoleosin from Gogol, immediately began to find that scores of their friends were exactly like him! They knew, perhaps, before Gogol told them, that their friends were like Podkoleosin, but they did not know what name to give them. In real life, young fellows seldom jump out of the window just before their weddings, because such a feat, not to speak of its other aspects, must be a decidedly unpleasant mode of escape; and yet there are plenty of bridegrooms, intelligent fellows too, who would be ready to confess themselves Podkoleosins in the depths of their consciousness, just before marriage. Nor does every husband feel bound to repeat at every step, “Tu l’as voulu, Georges Dandin!” like another typical personage; and yet how many millions and billions of Georges Dandins there are in real life who feel inclined to utter this soul-drawn cry after their honeymoon, if not the day after the wedding! Therefore, without entering into any more serious examination of the question, I will content myself with remarking that in real life typical characters are “watered down,” so to speak; and all these Dandins and Podkoleosins actually exist among us every day, but in a diluted form. I will just add, however, that Georges Dandin might have existed exactly as Molière presented him, and probably does exist now and then, though rarely; and so I will end this scientific examination, which is beginning to look like a newspaper criticism. But for all this, the question remains, what are the novelists to do with commonplace people, and how are they to be presented to the reader in such a form as to be in the least degree interesting? They cannot be left out altogether, for commonplace people meet one at every turn of life, and to leave them out would be to destroy the whole reality and probability of the story. To fill a novel with typical characters only, or with merely strange and uncommon people, would render the book unreal and improbable, and would very likely destroy the interest. In my opinion, the duty of the novelist is to seek out points of interest and instruction even in the characters of commonplace people.
# Sentence of length 535 (in sub-word tokens) exceeds the maximum supported length of 512
# 66021it [23:55, 44.04it/s]
# Sentence cannot be parsed, skipping..
# 66015: He was an elderly man, of large and majestic person, and strong, square features, betokening a steady soul; but steady as it was, his enemies had found means to shake it. His face was pale as death, and far more ghastly; the broad forehead was contracted in his agony, so that his eyebrows formed one grizzled line; his eyes were red and wild, and the foam hung white upon his quivering lip. His whole frame was agitated by a quick and continual tremor, which his pride strove to quell, even in those circumstances of overwhelming humiliation. But perhaps the bitterest pang of all was when his eyes met those of Robin; for he evidently knew him on the instant, as the youth stood witnessing the foul disgrace of a head grown gray in honor. They stared at each other in silence, and Robin's knees shook, and his hair bristled, with a mixture of pity and terror. Soon, however, a bewildering excitement began to seize upon his mind; the preceding adventures of the night, the unexpected appearance of the crowd, the torches, the confused din and the hush that followed, the spectre of his kinsman reviled by that great multitude, all this, and, more than all, a perception of tremendous ridicule in the whole scene, affected him with a sort of mental inebriety. At that moment a voice of sluggish merriment saluted Robin's ears; he turned instinctively, and just behind the corner of the church stood the lantern-bearer, rubbing his eyes, and drowsily enjoying the lad's amazement. Then he heard a peal of laughter like the ringing of silvery bells; a woman twitched his arm, a saucy eye met his, and he saw the lady of the scarlet petticoat. A sharp, dry cachinnation appealed to his memory, and, standing on tiptoe in the crowd, with his white apron over his head, he beheld the courteous little innkeeper. And lastly, there sailed over the heads of the multitude a great, broad laugh, broken in the midst by two sepulchral hems; thus, "Haw, haw, haw, hem, hem, haw, haw, haw, haw!"
# Sentence of length 559 (in sub-word tokens) exceeds the maximum supported length of 512
# 67503it [24:27, 46.11it/s]
# Sentence cannot be parsed, skipping..
# 67498: [female]well you have in the past, all i am saying is that [female]if you are headed that way, i mean ... [female] all i'm saying is .i dont want to be the one, i dont want to be the blame for you not going to work [female] i have nothing to do with it, [female] i am not making you stay here continuing arguing [female] i saying that if you need to go to work that's fine, i understand and respect that [female] it dont matter how mad i am at you or not [female] if you got to go to work then you got to go to work [female] it has nothing to do with anything [female] i just dont fight anymore [female] i dont want....... and blowing all up... [male]..you where the one... [female] not i dont, you where the one that started all of the fight this morning [female] i have no intrest in fighting with you at all [female] your asking me questions .. i didnt say...[female] im giving you answers [male] i didnt say.. i didnt say anything about who started it [male] i woke up angry because of how you treated me [female]i didnt say any...[male] how you treated me this morning [female] how did i treat you this morning [female] i put your hand up [female]thats all i did [male] ok.. that mad me fucking angry [male] thats making me fucking angry..[female] i see that [female]then i offered m... then you told me told me to get the fuck away [female] but when i walked away you muddered on and on about me running away... [female] and from then it just got out of control [male] no it didnt [female] and now your gonna threaten me about my expressions [female] that i cant control [male] ya you can.. [female] nore should i have to [male] ya you can  [female] you could even go to prision for sticking your tongue out.
# Sentence of length 560 (in sub-word tokens) exceeds the maximum supported length of 512
# 95681it [34:40, 45.99it/s]
# precision: 0.35623490426393867
# recall: 0.7058749692624108
# f1: 0.4266083854094621
# subword recall: 0.780482448866233

# n_subword_const: 153749
# n_total_induced_const: 1177662
# n_total_ref_const: 658562

# | End of training | test loss  3.36 | test ppl    28.90 | test bpc    4.853

In [13]:
print(f"precision: {np.mean(prec_list)}")
print(f"recall: {np.mean(reca_list)}")
print(f"f1: {np.mean(f1_list)}")
print(f"subword recall: {np.mean(subword_reca_list)}")
print(f"n_subword_const: {np.sum(n_subword_const_list)}")
print(f"n_total_induced_const: {np.sum(n_total_induced_const_list)}")
print(f"n_total_ref_const: {np.sum(n_total_ref_const_list)}")

precision: 0.35623490426393867
recall: 0.7058749692624108
f1: 0.4266083854094621
subword recall: 0.780482448866233
n_subword_const: 153749
n_total_induced_const: 1177662
n_total_ref_const: 658562


In [2]:
# SF (sf2)

# | End of training | test loss  3.38 | test ppl    29.40 | test bpc    4.878

# precision: 0.3585300738197754
# recall: 0.7093756827520172
# f1: 0.429312955006772
# subword recall: 0.7687519907001578
# n_subword_const: 153749
# n_total_induced_const: 1177662
# n_total_ref_const: 658562




In [None]:
# SF_IP

# | End of training | test loss  3.28 | test ppl    26.64 | test bpc    4.878

# precision: 0.27150490622138335
# recall: 0.5538122292072093
# f1: 0.3211875279557198
# subword recall: 0.6808046866840887
# n_subword_const: 153749
# n_total_induced_const: 1177663
# n_total_ref_const: 658562


# Scores:
# anaphor_agreement:      80.37%
# argument_structure:     66.66%
# binding:        67.28%
# control_raising:        65.58%
# determiner_noun_agreement:      91.20%
# ellipsis:       78.06%
# filler_gap:     71.43%
# irregular_forms:        92.37%
# island_effects: 49.18%
# npi_licensing:  61.78%
# quantifiers:    69.58%
# subject_verb_agreement: 77.83%
# hypernym:       48.95%
# qa_congruence_easy:     53.12%
# qa_congruence_tricky:   35.76%
# subject_aux_inversion:  79.41%
# turn_taking:    56.07%


In [None]:
# TF (tf)

# Done in paper


# Scores:
# anaphor_agreement:      84.30%
# argument_structure:     67.76%
# binding:        69.23%
# control_raising:        65.33%
# determiner_noun_agreement:      90.57%
# ellipsis:       79.50%
# filler_gap:     71.97%
# irregular_forms:        88.55%
# island_effects: 49.44%
# npi_licensing:  62.72%
# quantifiers:    70.74%
# subject_verb_agreement: 79.10%
# hypernym:       49.19%
# qa_congruence_easy:     57.81%
# qa_congruence_tricky:   26.67%
# subject_aux_inversion:  79.53%
# turn_taking:    60.00%