## Dependency Linearization Playground

### Dependencies to LaTeX

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Location of the Penn Treebank development split (CoNLL-U).
deps_treebank = "/home/droca1/Treebanks/20ag/Penn-Treebank/ptb-dev.conllu"

# Read the file with the reader's projective filter switched on.
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

# Show one hand-picked tree: first its plain printout, then the LaTeX
# source produced for it.
sample_tree = trees[554]
print(sample_tree)
latex_source = D_Tree.to_latex(sample_tree)
print(latex_source)

<div style="text-align:center"><img src="./pics/notebooks/d_tree_1.png" /></div>

### Encode with 7-bits encoding

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# UD English-EWT training split, read with filter_projective=False.
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)

# NOTE(review): every other cell builds its encoder with separator="[_]";
# confirm that the plain "_" used here is intentional.
enc_7b = D_Brk7BitsEncoding(separator="_")

# Round-trip every tree (encode -> decode) and report each tree whose decoded
# version does not reach a LAS of exactly 1 against the original.
for i, sample_tree in enumerate(trees):
    lin_tree = enc_7b.encode(sample_tree)
    dec_tree = enc_7b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)

    if las != 1:
        print("Error at tree", i, "length", len(sample_tree))
        print(D_Tree.to_latex(sample_tree))
        print(lin_tree)
        # Reuse the score computed above instead of scoring a second time.
        print("LAS =", las)

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Encoder under inspection (7-bit bracketing labels, "[_]" as separator).
enc_7b = D_Brk7BitsEncoding(separator="[_]")

# Load the UD English-EWT training split and pick the tree to inspect.
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
sample_tree = trees[6114]

# Encode, show the linearized form, decode again and score the result
# against the original tree.
lin_tree = enc_7b.encode(sample_tree)
print(lin_tree)
dec_tree = enc_7b.decode(lin_tree)
las = dec_tree.las_score(sample_tree)
print("LAS =", las)

### Planar extraction for all UD Trees

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# Every sub-directory of ud_path is treated as one treebank.
ud_path = "/home/droca1/Treebanks/20ag/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]

# Column layout of the report; declared once so header and rows cannot drift.
result_columns = ["Corpus", "n_trees", "1-planar", "2-planar", "3-planar", "r_deps", "l_deps", "avg_dependants"]
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: growing a frame with pd.concat inside the loop copies all previous rows
# on every iteration and relies on concatenating with an empty frame.
result_rows = []

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_", "-")

    # Gather the trees of every .conllu file found directly in the folder.
    # The entries are already full paths, so they are passed to the reader as-is.
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_trees = []
    for conllu_file in conllu_files:
        trees = D_Tree.read_conllu_file(conllu_file, filter_projective=False)
        total_trees += trees

    # NOTE(review): the third value is reported under the "3-planar" column;
    # confirm that planarN really stands for 3-planar.
    planar1, planar2, planarN = D_Tree.get_planarity_percentage(total_trees)
    r_deps, l_deps = D_Tree.get_dependency_direction_percentage(total_trees)
    avg_dependants = D_Tree.get_avg_dependants(total_trees)

    result_rows.append([
        treebank_name,
        len(total_trees),
        str(planar1) + "%",
        str(planar2) + "%",
        str(planarN) + "%",
        r_deps,
        l_deps,
        avg_dependants,
    ])

results_df = pd.DataFrame(result_rows, columns=result_columns)

print(results_df.to_latex(index=False))

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
# Collect every .conllu file that sits directly inside the PTB folder.
ptb_path = "/home/droca1/Treebanks/20ag/PENN_TREEBANK/"
ptb_files = [
    os.path.join(ptb_path, entry)
    for entry in os.listdir(ptb_path)
    if entry.endswith(".conllu")
]

# Pool the trees of all files into a single list.
total_trees = []
for ptb_file in ptb_files:
    trees = D_Tree.read_conllu_file(ptb_file)
    total_trees.extend(trees)

# Print each tree for which the greedy two-planar split returns two
# non-empty parts.
for tree in total_trees:
    p1, p2 = D_Tree.two_planar_greedy(tree)
    if len(p2) == 0 or len(p1) == 0:
        continue
    print(tree)

In [34]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# Every sub-directory of ud_path is treated as one treebank.
ud_path = "/home/droca1/Treebanks/20ag/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]

# Column layout of the report; rows are collected in a list and converted to
# a DataFrame once at the end instead of calling pd.concat per treebank.
result_columns = ["Corpus", "BRK", "BRK-2P", "BRK-4B", "BRK-7B"]
result_rows = []

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = True)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = True)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

# (accumulator key, encoder, call remove_dummy() on the encoded tree before
# decoding). The order matches the report columns.
encoders = (
    ("brk",   ebrk,   True),
    ("brk2p", ebrk2p, True),
    ("brk4b", ebrk4b, False),
    ("brk7b", ebrk7b, False),
)

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_", "-")
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]

    # Sum of per-tree round-trip LAS scores, one accumulator per encoding.
    injective = {key: 0 for key, _, _ in encoders}
    total_trees = []
    for conllu_file in conllu_files:
        # conllu_file is already a full path, so it goes to the reader as-is.
        trees = D_Tree.read_conllu_file(conllu_file,
                                        filter_projective=False)
        total_trees += trees
        for t in trees:
            for key, enc, strip_dummy in encoders:
                # Each encoder works on its own deep copy; the untouched
                # original `t` is the reference for scoring.
                linearized = enc.encode(copy.deepcopy(t))
                if strip_dummy:
                    linearized.remove_dummy()
                decoded = enc.decode(linearized)
                injective[key] += decoded.las_score(t)

    # A folder without any tree would divide by zero below; skip it instead.
    if not total_trees:
        print("[WARN] No trees found in", ud_folder)
        continue

    n_trees = len(total_trees)
    result_rows.append([treebank_name] + [injective[key] / n_trees for key, _, _ in encoders])

results_df = pd.DataFrame(result_rows, columns=result_columns)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

\begin{tabular}{lrrrr}
\toprule
                  Corpus &        BRK &     BRK-2P &     BRK-4B &     BRK-7B \\
\midrule
          UD-Russian-GSD & 0.99755674 & 0.99997546 & 0.99613951 & 0.99997546 \\
          UD-Finnish-TDT & 0.99717628 & 0.99997437 & 0.99353297 & 0.99997437 \\
           UD-German-GSD & 0.99547451 & 0.99987743 & 0.99275148 & 0.99987743 \\
           PENN-TREEBANK & 0.99999617 &        1.0 & 0.99999232 &        1.0 \\
UD-Ancient-Greek-Perseus & 0.95806205 &  0.9923733 &  0.8893046 &  0.9923733 \\
          UD-Chinese-GSD & 0.99913695 & 0.99999375 & 0.99835233 & 0.99999375 \\
           UD-Hebrew-HTB & 0.99983322 &        1.0 & 0.99978495 &        1.0 \\
            UD-Tamil-TTB &  0.9994489 &        1.0 & 0.99849143 &        1.0 \\
           UD-Uyghur-UDT &  0.9943495 & 0.99997589 & 0.99064191 & 0.99997589 \\
            UD-Wolof-WTB & 0.99828107 & 0.99998305 & 0.99748056 & 0.99998305 \\
          UD-English-EWT & 0.99875497 & 0.99999259 & 0.99808308 & 0.99999259 \\

  print(results_df.to_latex(index=False, float_format="{:0.8}".format))


### Hexatag

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.const_tree import C_Tree
from nltk.tree import Tree

def pt(t, d=True):
    """Pretty-print one bracketed tree, or a sequence of them.

    Each tree is converted with ``str()`` and parsed by
    ``nltk.tree.Tree.fromstring`` before calling ``pretty_print()`` on it.

    Args:
        t: A single tree, or a list/tuple of trees (list subclasses are
            accepted as well). For a sequence, every element is first printed
            as-is and afterwards every element is pretty-printed.
        d: Debug switch; when false the function prints nothing.

    Returns:
        None.
    """
    if not d:
        return

    if isinstance(t, (list, tuple)):
        # Two passes on purpose: all raw printouts first, then all drawings.
        for i in t:
            print(i)
        for i in t:
            Tree.fromstring(str(i)).pretty_print()
    else:
        Tree.fromstring(str(t)).pretty_print()

# UD English-EWT test split, read with the projective filter switched on.
path = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-test.conllu"
trees = D_Tree.read_conllu_file(path, filter_projective=True)
sample = trees[0]

# Convert the first tree with D_Tree.to_bht, rebuild a tree from the result
# with D_Tree.from_bht, and print all three for visual comparison.
# (The bracketing encoder that used to be created here was never used by
# this cell, so it is no longer constructed.)
print(sample)
bht = D_Tree.to_bht(sample)
print(bht)
dec_tree = D_Tree.from_bht(bht)
print(dec_tree)


### Remove multiword-token (range ID) lines from CoNLL-U files

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import re

# Every sub-directory of ag20_path is treated as one treebank.
ag20_path = "/home/droca1/Treebanks/20ag/"
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Lines that start with digits followed by "-" (CoNLL-U range IDs such as
# "3-4", used for multiword tokens). Compiled once, outside the loops.
range_id_line = re.compile(r"^\d+-.*")

for ag20_folder in ag20_folders:
    print("[INFO] Processing",ag20_folder)
    # The entries are already full paths, so they can be opened directly.
    conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(".conllu"))]

    for conllu_file in conllu_files:
        print("[INFO] Cleaning",conllu_file)

        # CoNLL-U files are UTF-8; do not depend on the platform default.
        with open(conllu_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        kept_lines = [line for line in lines if not range_id_line.match(line)]

        # The file is overwritten in place. Skip the rewrite when nothing was
        # removed, so untouched files are never truncated.
        if len(kept_lines) == len(lines):
            continue

        with open(conllu_file, "w", encoding="utf-8") as f:
            f.writelines(kept_lines)

### Encode and generate MaChAmp config for training

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import json

# Template for a dataset entry with a single sequence-labeling task read from
# column 2. The "XXX" paths are placeholders to be filled in per treebank.
config_singletask = {
    "dependency": {
        "train_data_path": "XXX",
        "dev_data_path": "XXX",
        "word_idx": 0,
        "tasks": {
            "label": {"task_type": "seq", "column_idx": 2},
        },
    },
}

# Template for a dataset entry with two sequence-labeling tasks: "brk" read
# from column 2 and "reltype" read from column 3. Same placeholder paths.
config_multitask = {
    "dependency": {
        "train_data_path": "XXX",
        "dev_data_path": "XXX",
        "word_idx": 0,
        "tasks": {
            "brk": {"task_type": "seq", "column_idx": 2},
            "reltype": {"task_type": "seq", "column_idx": 3},
        },
    },
}

# Every sub-directory of ag20_path is treated as one treebank.
ag20_path = "/home/droca1/Treebanks/20ag/"
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Encoder used to produce the label files. Alternative kept for quick switching:
#encoder = D_Brk7BitsEncoding(separator="[_]")
encoder = D_Brk2PBasedEncoding(separator="[_]")

filter_projective = False
target_extension = ".labels"
config_name = "config.json"

for ag20_folder in ag20_folders:
    print("[INFO] Processing",ag20_folder)
    # The entries are already full paths, so they are used directly below.
    conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(".conllu"))]

    train_file = None
    dev_file = None

    # Encode every .conllu file into a sibling .labels file.
    for conllu_file in conllu_files:
        file_name = os.path.basename(conllu_file)
        # Swap only the extension; directory names are left untouched.
        file_stem = os.path.splitext(conllu_file)[0]
        output_file = file_stem + target_extension

        # Detect the split from the file name alone, so that a directory name
        # containing "train" or "dev" cannot cause a false match.
        if "train" in file_name:
            train_file = output_file
        elif "dev" in file_name:
            dev_file = output_file

        trees = D_Tree.read_conllu_file(conllu_file,
                                        filter_projective=filter_projective)

        with open(output_file, "w", encoding="utf-8") as f:
            for tree in trees:
                lin_tree = encoder.encode(tree)
                f.write(lin_tree.to_string(f_idx_dict=None,
                                           add_bos_eos=True,
                                           separate_columns=True) + "\n")

        # When trees were filtered, also save the filtered test split as
        # CoNLL-U next to the label file.
        if "test" in file_name and filter_projective:
            clean_file = file_stem + "-clean.conllu"
            with open(clean_file, "w", encoding="utf-8") as f:
                for tree in trees:
                    tree.remove_dummy()
                    f.write("# text = "+tree.get_sentence()+"\n")
                    f.write(str(tree))

    # Without both splits there is nothing valid to put in the config.
    if train_file is None or dev_file is None:
        print("[WARN] Skipping config for", ag20_folder, "(train or dev split not found)")
        continue

    # Build a fresh nested dict per treebank. A shallow .copy() of the
    # template would share the inner "dependency" dict, so every assignment
    # would also overwrite the template itself.
    # The user name inside the paths is rewritten ('droca1' -> 'diego.roca').
    current_config = {
        "dependency": {
            **config_multitask["dependency"],
            "train_data_path": train_file.replace('droca1', 'diego.roca'),
            "dev_data_path": dev_file.replace('droca1', 'diego.roca'),
        }
    }

    with open(os.path.join(ag20_folder, config_name), "w", encoding="utf-8") as f:
        json.dump(current_config, f, indent=4)