# Extract Sentences to Parse

In [41]:
def extract_sentences_from_gold_standard(file_path):
    """Read sentences and their gold-standard trees from a gold-standard file.

    A sentence block starts with a header line of the form ``<id>: <sentence>``,
    where ``<id>`` consists of digits optionally mixed with the letters ``a``
    and ``b`` (e.g. ``7`` or ``2a``). The gold tree is read from the line two
    positions below the header (one line is skipped in between).

    Args:
        file_path: Path of the UTF-8 encoded gold-standard file.

    Returns:
        A tuple ``(original_sentences, gold_trees)`` of two equally long lists:
        the lower-cased sentence texts and the stripped gold tree strings.

    Notes:
        Lines without a colon are never treated as headers, and a header that
        sits too close to the end of the file to have a tree line is skipped,
        so both returned lists always stay aligned.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    original_sentences = []
    gold_trees = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # partition() never fails: without a colon, `colon` is '' and the line
        # cannot be mistaken for a header (a bare number used to crash here).
        prefix, colon, remainder = line.partition(':')
        sentence_id = prefix.strip().replace('a', '').replace('b', '')
        is_header = bool(colon) and sentence_id.isdigit()
        # Only accept the block when its tree line actually exists.
        if is_header and i + 2 < len(lines):
            original_sentences.append(remainder.strip().lower())
            # The gold tree notation is two lines below the sentence line.
            gold_trees.append(lines[i + 2].strip())
            i += 3  # Jump past the header, the skipped line and the tree line
        else:
            i += 1  # Not a (complete) sentence block: move to the next line
    return original_sentences, gold_trees

# Path of the gold-standard file, relative to the notebook's working directory.
file_path = 'L95_10sentencesTaggedAndParsed_goldStandard_corrected.txt'

# Parse the file into two lists: the lower-cased sentences and the matching
# gold-standard tree strings. Both are reused by the cells further down.
extracted_sentences, gold_trees = extract_sentences_from_gold_standard(file_path)
# Dump both lists so the extraction can be checked in the cell output.
print(extracted_sentences)
print(gold_trees)


["my aunt's can opener can open a drum.", 'the old car broke down in the car park.', 'at least two men broke in and stole my tv.', 'kim and sandy both broke up with their partners.', 'the horse as well as the rabbits which we wanted to eat has escaped.', "it was my aunt's car which we sold at auction last year in february.", 'natural disasters – storms, flooding, hurricanes – occur infrequently but cause devastation that strains resources to breaking point.', 'letters delivered on time by old-fashioned means are increasingly rare, so it is as well that that is not the only option available.', "english also has many words of more or less unique function, including interjections (oh, ah), negatives (no, not), politeness markers (please, thank you), and the existential 'there' (there are horses but not unicorns) among others.", 'the penn treebank tagset was culled from the original 87-tag tagset for the brown corpus.', 'for example the original brown and c5 tagsets include a separate tag 

# Charniak-Johnson parser (also known as the Brown Reranking Parser)
Paper: https://aclanthology.org/P05-1022.pdf

In [2]:
from bllipparser import RerankingParser
# Load the 'WSJ+Gigaword-v2' model into a RerankingParser instance that the
# following cells use for parsing.
# NOTE(review): fetch_and_load presumably downloads the model on first use — confirm.
rrp = RerankingParser.fetch_and_load('WSJ+Gigaword-v2', verbose=False)



In [28]:
# Smoke-test call: the return value is discarded and, because this is not the
# last expression of the cell, nothing is displayed for it.
rrp.simple_parse("It's that easy.")
# as a tagger:
# rrp.tag("Time flies while you're having fun.")

# Parse every extracted sentence; the results keep the order of
# extracted_sentences, so charniak_trees[i] belongs to extracted_sentences[i].
charniak_trees = [rrp.simple_parse(sentence) for sentence in extracted_sentences]

# Berkeley Neural Parser

In [4]:
import benepar, spacy

# Fetch the 'benepar_en3' model and load the spaCy pipeline it is attached to.
benepar.download('benepar_en3')
nlp = spacy.load('en_core_web_md')
# The benepar component is registered differently depending on the spaCy major
# version: under 2.x a component object is passed, otherwise the component is
# added by name with the model given through the config.
if spacy.__version__.startswith('2'):
        nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [29]:
# Collect one parse string per extracted sentence, in the same order as
# extracted_sentences.
berkeley_trees = []
for sentence in extracted_sentences:
    doc = nlp(sentence)
    # Only the first sentence span of the document is used.
    # NOTE(review): if the pipeline splits an input into several sentences,
    # the remaining spans are silently dropped — confirm this never happens
    # for the inputs used here.
    sent = list(doc.sents)[0]
    # The parse string is read from the span's `_.parse_string` extension.
    parse_tree = sent._.parse_string
    berkeley_trees.append(parse_tree)

# print(berkeley_trees)



# Visualize trees

In [74]:
from nltk.tree import *
from IPython.display import display, Image, SVG
from cairosvg import svg2png

def printTree(parsed_tree: str, parser_name: str, index: int, saveFile = True):
    """Convert a bracketed parse string to an NLTK tree and optionally render it.

    Args:
        parsed_tree: Bracketed tree string accepted by ``Tree.fromstring``.
        parser_name: Name of the parser; used as the file name prefix.
        index: Sentence index; used as the file name suffix.
        saveFile: When true, write ``ParseTrees/<parser_name>_<index>.svg`` and
            the matching ``.png`` rendering. The ``ParseTrees`` directory is
            created if it does not exist yet.

    Returns:
        The NLTK ``Tree`` built from ``parsed_tree``.
    """
    import os  # local import so the function does not depend on another cell

    nltk_tree = Tree.fromstring(parsed_tree)  # convert into NLTK tree

    # nltk_tree.pformat_latex_qtree() # get latex code representation of the tree compatible with the LaTeX qtree package

    # print to console
    # nltk_tree.pretty_print(unicodelines=True, nodedist=4)

    if saveFile:
        # Writing used to fail with FileNotFoundError on a fresh checkout.
        os.makedirs("ParseTrees", exist_ok=True)

        # Render only when the result is actually written out.
        svg_format = nltk_tree._repr_svg_()
        # display(SVG(svg_format))

        # Sentences contain non-ASCII characters, so do not rely on the
        # platform's default encoding for the SVG text.
        with open(f"ParseTrees/{parser_name}_{index}.svg", 'w', encoding='utf-8') as f:
            f.write(svg_format)  # write the SVG content to the file
        svg2png(bytestring=svg_format,write_to=f"ParseTrees/{parser_name}_{index}.png")

    return nltk_tree

# LaTeX control characters and the text that typesets them literally.
_LATEX_ESCAPES = {
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
}

# Characters that the forest bracket syntax interprets as structure.
_FOREST_DELIMITERS = ',=[]'


def _escape_forest_text(text):
    """Return `text` made safe for use as the content of a forest node."""
    escaped = ''.join(_LATEX_ESCAPES.get(ch, ch) for ch in text)
    # Content holding forest delimiters has to be grouped in braces, otherwise
    # e.g. a comma token would be read as an option separator.
    if any(ch in _FOREST_DELIMITERS for ch in text):
        escaped = '{' + escaped + '}'
    return escaped


def tree_to_forest(tree):
    """
    Convert an NLTK Tree object into a LaTeX forest package representation.

    Special LaTeX characters in labels and leaves (such as the ``$`` in
    ``PRP$``) are escaped, and content containing forest delimiters
    (``,``, ``=``, ``[``, ``]``) is wrapped in braces.

    Args:
        tree: Either a leaf (``str``) or a tree node that offers ``label()``
            and iterates over its children.

    Returns:
        The bracketed forest string, e.g. ``[NP [DT [the]] [NN [dog]]]``.
    """
    # Base case: if the tree is a leaf, just return the escaped leaf
    if isinstance(tree, str):
        return f"[{_escape_forest_text(tree)}]"

    # Recursively convert each subtree
    escaped_label = _escape_forest_text(tree.label())
    result = '[' + ' '.join([escaped_label] + [tree_to_forest(t) for t in tree]) + ']'
    return result

def toLatexFigures(latex_trees, parser_names, indices, output_path="parse_trees.tex"):
    """Write one centred, full-page forest figure per tree to a LaTeX file.

    The three input sequences are consumed in parallel with ``zip``, so
    iteration stops at the shortest one. Only the page bodies are written: the
    file contains no preamble or ``document`` environment.

    Args:
        latex_trees: Forest bracket strings, one per figure.
        parser_names: Parser name shown in each figure header.
        indices: Zero-based sentence index per figure; the header shows
            ``index + 1``.
        output_path: File to (over)write. Defaults to ``parse_trees.tex``,
            the path that was previously hard-coded.
    """
    # The trees contain non-ASCII characters, so the encoding is fixed to
    # UTF-8 instead of depending on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        for latex_tree, parser_name, index in zip(latex_trees, parser_names, indices):
            f.write('\\thispagestyle{empty}\n')
            f.write('\\begin{center}\n')
            f.write(f'{{\\Large \\textbf{{Model: {parser_name} - Sentence {index+1}}}}}\n\n')  # Header
            f.write('\\vspace*{\\fill}\n')  # Center the tree vertically
            f.write('\\begin{forest}\n')
            f.write(latex_tree + '\n')
            f.write('\\end{forest}\n')
            f.write('\\vspace*{\\fill}\n')  # Center the tree vertically
            f.write('\\end{center}\n')
            f.write('\\newpage\n\n')  # Ensures a new page for the next tree

# Three parallel lists that are handed to toLatexFigures at the end: the forest
# string, the parser it came from, and the sentence index.
latex_trees = []
parser_names = []
indices = []

# zip() walks the three tree lists in lock-step and stops at the shortest one.
for index, (charniak_tree, berkeley_tree, gold_tree) in enumerate(zip(charniak_trees, berkeley_trees, gold_trees)):
    # printTree is called with saveFile=True, so besides returning the NLTK
    # tree each call also writes the SVG and PNG rendering of that tree.
    charniak_tree_nltk = printTree(charniak_tree, "Charniak", index, True)
    berkeley_tree_nltk = printTree(berkeley_tree, "Berkeley", index, True)
    gold_tree_nltk = printTree(gold_tree, "Gold", index, True)

    # Convert the NLTK tree objects to forest package strings
    charniak_forest = tree_to_forest(charniak_tree_nltk)
    berkeley_forest = tree_to_forest(berkeley_tree_nltk)
    gold_forest = tree_to_forest(gold_tree_nltk)

    # Append the LaTeX forest representations to the list
    latex_trees.append(charniak_forest)
    latex_trees.append(berkeley_forest)
    latex_trees.append(gold_forest)

    # Keep the name and index lists in the same Charniak/Berkeley/Gold order
    # as the three trees appended above.
    parser_names.extend(["Charniak", "Berkeley", "Gold"])
    indices.extend([index, index, index])

# Write all collected figures to the LaTeX output file.
toLatexFigures(latex_trees, parser_names, indices)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Quantitative Evaluation Using EVALB

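Normalise the trees from all three sources (gold standard, Charniak, Berkeley) into one common bracketing format and write them to files, so that EVALB can compare them.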

In [81]:
import re

def adjust_gold_trees(trees):
    """Normalise gold-standard tree strings so they match the parser output.

    For every tree the function
      1. removes the outer ``(TOP ...)`` wrapper when it is present,
      2. removes every ``:<number>`` annotation,
      3. replaces the listed lemma+suffix forms and capitalised words by the
         lower-cased surface forms, and
      4. rewrites ``'s+`` to ``'s`` and escaped parentheses to
         ``-LRB-`` / ``-RRB-``.

    Args:
        trees: Iterable of gold-standard tree strings.

    Returns:
        A new list with the adjusted tree strings, in input order.
    """
    # Built once instead of once per tree. The replacements are plain
    # substring substitutions applied in insertion order, so longer keys such
    # as "be+ed" and "be+s" must stay ahead of their prefix "be+".
    replacements = {
        "break+ed": "broke",
        "man+s": "men",
        "Kim": "kim",
        "rabbit+s": "rabbits",
        "be+ed": "was",
        "disaster+s": "disasters",
        "letter+s": "letters",
        "steal+ed": "stole",
        "Sandy": "sandy",
        "want+ed": "wanted",
        "sell+ed": "sold",
        "storm+s": "storms",
        "deliver+ed": "delivered",
        "have+s": "has",
        "Penn": "penn",
        "Brown": "brown",
        "partner+s":"partners",
        "escape+ed":"escaped",
        "February":"february",
        "flood+ing":"flooding",
        "fashion+ed":"fashioned",
        "word+s":"words",
        "Treebank":"treebank",
        "tagset+s":"tagsets",
        "hurricane+s":"hurricanes",
        "be+s":"is",
        "be+":"are",
        "include+ing":"including",
        "cull+ed":"culled",
        "form+s":"forms",
        "strain+s":"strains",
        "interjection+s":"interjections",
        "verb+s":"verbs",
        "resource+s":"resources",
        "not+":"not",
        "negative+s":"negatives",
        "Vdd":"vdd",
        "break+ing":"breaking",
        "marker+s":"markers",
        "do+ed":"did",
        "horse+s":"horses",
        "Vdg":"vdg",
        "unicorn+s":"unicorns",
        "other+s":"others"
    }

    adjusted_trees = []
    for tree in trees:
        # Remove the first "(TOP " and the last ")" — but only when the
        # wrapper is really there, so other input is not cut to pieces.
        tree = tree.strip()
        if tree.startswith("(TOP ") and tree.endswith(")"):
            tree = tree[5:-1].strip()

        # Remove all ":<number>" occurrences
        tree = re.sub(r':\d+', '', tree)

        # Replace specific word forms
        for old, new in replacements.items():
            tree = tree.replace(old, new)

        # Transform cases
        tree = tree.replace("'s+", "'s")
        tree = tree.replace("\\(", "-LRB-")
        tree = tree.replace("\\)", "-RRB-")

        adjusted_trees.append(tree)
    return adjusted_trees

# Bring the gold trees into the same notation as the parser output.
adjusted_gold_trees = adjust_gold_trees(gold_trees)
# Drop the first four characters and the last character of every Charniak tree.
# NOTE(review): presumably this removes an outer "(S1 ...)" wrapper — confirm
# against the actual parser output.
adjusted_charniak_trees = [tree[4:-1].strip() for tree in charniak_trees]

# write to file
def write_to_file(list_of_trees, file_path):
    """Write every tree string on its own line to `file_path`.

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced.

    Args:
        list_of_trees: Iterable of tree strings.
        file_path: Destination file path.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        out.writelines(tree + '\n' for tree in list_of_trees)

# Write one file per source, one tree per line. The Charniak and gold trees
# are written in their adjusted form; the Berkeley trees are written as parsed.
write_to_file(adjusted_charniak_trees, 'charniak_trees.txt')
write_to_file(berkeley_trees, 'berkeley_trees.txt')
write_to_file(adjusted_gold_trees, 'gold_trees.txt')



## Run Evalb (https://nlp.cs.nyu.edu/evalb/)

In [98]:
import subprocess

def run_evalb(evalb_path, gold_file, test_file, result_file, param_file='EVALB/COLLINS.prm'):
    """Run the EVALB executable and store its standard output in a file.

    Args:
        evalb_path: Path of the EVALB executable.
        gold_file: File with the gold-standard trees.
        test_file: File with the parser output to score.
        result_file: File that receives EVALB's standard output (overwritten).
        param_file: EVALB parameter file passed with ``-p``. Defaults to
            ``EVALB/COLLINS.prm``, the value that was previously hard-coded.

    Returns:
        The ``subprocess.CompletedProcess`` of the run, so callers can check
        ``returncode``. Standard error is not redirected and the exit status
        is not checked here.
    """
    # Command to run EVALB
    command = [evalb_path, '-p', param_file, gold_file, test_file]

    # Run EVALB and write output to result_file
    with open(result_file, 'w') as output_file:
        return subprocess.run(command, stdout=output_file, text=True)


# Score each parser's output file against the gold trees and print the report.
for parser_to_eval in ["charniak", "berkeley"]:
    test_file = f"{parser_to_eval}_trees.txt"
    evalb_path = 'EVALB/evalb'  
    # The part of test_file before the first "_" is the parser name, so the
    # report goes to e.g. "charniak_evalb.txt".
    result_file_path = f'{test_file.split("_")[0]}_evalb.txt'

    # Running EVALB
    run_evalb(evalb_path, "gold_trees.txt", test_file, result_file_path)

    # Optionally, read and print the results
    with open(result_file_path, 'r') as file:
        results = file.read()
        print(results)

  Sent.                        Matched  Bracket   Cross        Correct Tag
 ID  Len.  Stat. Recal  Prec.  Bracket gold test Bracket Words  Tags Accracy
   1   10    0   62.50  83.33     5      8    6      1      9     8    88.89
   2   10    0   55.56  83.33     5      9    6      0      9     9   100.00
   3   11    0   60.00  75.00     6     10    8      0     10     8    80.00
   4   10    0   45.45  83.33     5     11    6      0      9     8    88.89
   5   15    0   66.67  62.50    10     15   16      0     14    13    92.86
   6   16    0   63.16  75.00    12     19   16      1     15    14    93.33
   7   21    0   57.69  68.18    15     26   22      1     18    13    72.22
   8   27    1    0.00   0.00     0      0    0      0      0     0     0.00
   9   53    1    0.00   0.00     0      0    0      0      0     0     0.00
  10   18    1    0.00   0.00     0      0    0      0      0     0     0.00
  11   40    0   33.33  37.50    12     36   32     12     38    29    76.32
 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
8 : Length unmatch (24|23)
9 : Length unmatch (43|44)
10 : Length unmatch (16|15)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Inspection cell: show the first tree from each source in its raw,
# unadjusted form for a manual comparison.
print(charniak_trees[0])
print(berkeley_trees[0])
print(gold_trees[0])

In [None]:
# Inspection cell: compare the adjusted Charniak and gold trees at index 8.
# NOTE(review): presumably used to investigate the EVALB "Length unmatch"
# reported for sentence 9 — confirm.
print("Charniak:", adjusted_charniak_trees[8])
# print(berkeley_trees[6])
print("Gold:", adjusted_gold_trees[8])