# Code to convert consiteuncy parsing from gold standard into Latex format

In [5]:
import re

In [6]:
def to_latex_forest(parse_tree):
    # Function to convert a tree to LaTeX forest format
    def convert_tree(tree):
        tree = tree.strip()
        if tree.startswith("("):
            tree = tree[1:-1].strip()
            space_index = tree.find(' ')
            if space_index == -1:
                return f"[{tree}]"
            node_name = tree[:space_index]
            rest = tree[space_index:].strip()
            children = []
            while rest:
                if rest.startswith("("):
                    depth, i = 1, 1
                    while i < len(rest) and depth > 0:
                        if rest[i] == "(":
                            depth += 1
                        elif rest[i] == ")":
                            depth -= 1
                        i += 1
                    children.append(rest[:i])
                    rest = rest[i:].strip()
                else:
                    space_index = rest.find(' ')
                    if space_index == -1:
                        children.append(rest)
                        rest = ''
                    else:
                        children.append(rest[:space_index])
                        rest = rest[space_index:].strip()
            children_latex = " ".join(convert_tree(child) for child in children)
            return f"[{node_name} {children_latex}]"
        else:
            # Leaf node
            return f"[{tree.split()[0]}]"

    # Remove TOP and the corresponding punctuation
    tree = re.sub(r"^\(\s*TOP\s*|\s*\(\. \.\:\d+\)\s*\)$", "", parse_tree).strip()
    return "\\begin{forest}\nfor tree={s sep=3mm, inner sep=2, l=2mm, font=\\small}\n" + convert_tree(tree) + "\n\\end{forest}"

# Example usage
trees = [
    "(TOP  (S   (NP (PRP$ my:0) (NP (NN aunt:1) (NP (POS 's+:2) (NP (NN can:3) (NN opener:4)))))   (VP (MD can:5) (VP (VB open:6) (NP (DT a:7) (NN drum:8)))))  (. .:9))",
    "(TOP  (S (NP (DT the:0) (NP (ADJP (JJ old:1)) (NN car:2)))   (VP (V (VBD break+ed:3) (RP down:4))    (PP (IN in:5) (NP (DT the:6) (NP (NN car:7) (NN park:8))))))  (. .:9))"
]

for tree in trees:
    latex_tree = to_latex_forest(tree)
    print(latex_tree)


\begin{forest}
for tree={s sep=3mm, inner sep=2, l=2mm, font=\small}
[S [NP [PRP$ [my:0]] [NP [NN [aunt:1]] [NP [POS ['s+:2]] [NP [NN [can:3]] [NN [opener:4]]]]]] [VP [MD [can:5]] [VP [VB [open:6]] [NP [DT [a:7]] [NN [drum:8]]]]]]
\end{forest}
\begin{forest}
for tree={s sep=3mm, inner sep=2, l=2mm, font=\small}
[S [NP [DT [the:0]] [NP [ADJP [JJ [old:1]]] [NN [car:2]]]] [VP [V [VBD [break+ed:3]] [RP [down:4]]] [PP [IN [in:5]] [NP [DT [the:6]] [NP [NN [car:7]] [NN [park:8]]]]]]]
\end{forest}


In [7]:
def to_latex_forest(parse_tree):
    # Function to convert a tree to LaTeX forest format
    def convert_tree(tree, level=0):
        tree = re.sub(r":\d+", "", tree)  # Remove the ":<number>" from the nodes
        tree = tree.strip()
        indent = "  " * level  # Indentation for the current level
        if tree.startswith("("):
            tree = tree[1:-1].strip()
            space_index = tree.find(' ')
            if space_index == -1:
                return f"{indent}[{tree}]"
            node_name = tree[:space_index]
            rest = tree[space_index:].strip()
            children = []
            while rest:
                if rest.startswith("("):
                    depth, i = 1, 1
                    while i < len(rest) and depth > 0:
                        if rest[i] == "(":
                            depth += 1
                        elif rest[i] == ")":
                            depth -= 1
                        i += 1
                    children.append(rest[:i])
                    rest = rest[i:].strip()
                else:
                    space_index = rest.find(' ')
                    if space_index == -1:
                        children.append(rest)
                        rest = ''
                    else:
                        child = rest[:space_index]
                        child = re.sub(r":\d+", "", child)  # Also remove ":<number>" from leaves
                        children.append(child)
                        rest = rest[space_index:].strip()
            children_latex = "\n".join(convert_tree(child, level+1) for child in children)
            return f"{indent}[{node_name}\n{children_latex}\n{indent}]"
        else:
            # Leaf node, remove the POS tag as well
            return f"{indent}[{tree.split()[0].split('|')[0]}]"

    # Remove TOP and the corresponding punctuation
    tree = re.sub(r"^\(\s*TOP\s*|\s*\(\. \.\:\d+\)\s*\)$", "", parse_tree).strip()
    latex_tree = convert_tree(tree)
    
    # Format the output for better readability
    formatted_latex_tree = "\\begin{subfigure}{\\columnwidth}\n\\centering\n\\begin{forest}\nfor tree={s sep=3mm, inner sep=2, l=2mm, font=\\small}\n" + latex_tree + "\n\\end{forest}\n\\caption{Gold Standard Tree}\n\\end{subfigure}"
    
    return formatted_latex_tree

# Example usage
trees = [
    "(TOP  (S   (NP (PRP$ my:0) (NP (NN aunt:1) (NP (POS 's+:2) (NP (NN can:3) (NN opener:4)))))   (VP (MD can:5) (VP (VB open:6) (NP (DT a:7) (NN drum:8)))))  (. .:9))",
    "(TOP  (S (NP (DT the:0) (NP (ADJP (JJ old:1)) (NN car:2)))   (VP (V (VBD break+ed:3) (RP down:4))    (PP (IN in:5) (NP (DT the:6) (NP (NN car:7) (NN park:8))))))  (. .:9))"
]

for tree in trees:
    latex_tree = to_latex_forest(tree)
    print(latex_tree)


\begin{subfigure}{\columnwidth}
\centering
\begin{forest}
for tree={s sep=3mm, inner sep=2, l=2mm, font=\small}
[S
  [NP
    [PRP$
      [my]
    ]
    [NP
      [NN
        [aunt]
      ]
      [NP
        [POS
          ['s+]
        ]
        [NP
          [NN
            [can]
          ]
          [NN
            [opener]
          ]
        ]
      ]
    ]
  ]
  [VP
    [MD
      [can]
    ]
    [VP
      [VB
        [open]
      ]
      [NP
        [DT
          [a]
        ]
        [NN
          [drum]
        ]
      ]
    ]
  ]
]
\end{forest}
\caption{Gold Standard Tree}
\end{subfigure}
\begin{subfigure}{\columnwidth}
\centering
\begin{forest}
for tree={s sep=3mm, inner sep=2, l=2mm, font=\small}
[S
  [NP
    [DT
      [the]
    ]
    [NP
      [ADJP
        [JJ
          [old]
        ]
      ]
      [NN
        [car]
      ]
    ]
  ]
  [VP
    [V
      [VBD
        [break+ed]
      ]
      [RP
        [down]
      ]
    ]
    [PP
      [IN
        [in]
      ]
      [NP
 