## Dependency Linearization Playground

### Dependencies to latex

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Penn Treebank dev split in CoNLL-U format.
deps_treebank = "/home/droca1/Treebanks/20ag/Penn-Treebank/ptb-dev.conllu"
# Read the file with the projectivity filter switched on.
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

# Show the tree at index 554 as text first, then as LaTeX source.
sample_tree = trees[554]
print(sample_tree)
latex_source = D_Tree.to_latex(sample_tree)
print(latex_source)

<div style="text-align:center"><img src="./pics/notebooks/d_tree_1.png" /></div>

### Encode with 7-bits encoding

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# UD English-EWT training split (the previous "ptb-dev" comment was stale).
# The file is read with filter_projective=False.
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
enc_7b = D_Brk7BitsEncoding(separator="_")

# Round-trip check: encode every tree, decode it again and report each tree
# whose LAS against the original is not exactly 1.
for i, sample_tree in enumerate(trees):
    lin_tree = enc_7b.encode(sample_tree)
    dec_tree = enc_7b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)

    if las != 1:
        print("Error at tree",i,"length",len(sample_tree))
        print(D_Tree.to_latex(sample_tree))
        print(lin_tree)
        # Reuse the score computed above instead of scoring the tree twice.
        print("LAS =",las)

In [1]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# UD English-EWT training split, read with filter_projective=False.
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)

# Round-trip the tree at index 6114 through the 7-bit encoder and report
# the LAS of the decoded tree against the original.
sample_tree = trees[6114]
enc_7b = D_Brk7BitsEncoding(separator="[_]")

lin_tree = enc_7b.encode(sample_tree)
print(lin_tree)

dec_tree = enc_7b.decode(lin_tree)
las = dec_tree.las_score(sample_tree)
print("LAS =",las)

updating head CONTACT with /0 from /0 and  to /0
updating head William with /0 from /0 and >0* to >0*/0
updating head McGinnis with /0 from /0 and >0* to >0*/0
updating head ( with \0 from \0 and  to \0
updating head Rev. with  from /0 and >0*/0 to >0*/0
updating head Bill with /0 from /0 and \0>0 to \0>0/0
updating head McGinnis with  from /0 and \0>0/0 to \0>0/0
updating head ) with  from /0 and \0>0/0 to \0>0/0
updating head Mt with \0 from \0 and  to \0
updating head Vernon with \0 from \0 and  to \0
updating head 1908 with  from \0 and \0 to \0
updating head Ave with  from /0 and >0*/0 to >0*/0
updating head # with \0 from \0 and  to \0
updating head Alexandria with /0 from /0 and \0>0 to \0>0/0
updating head , with /0 from /0 and >0 to >0/0
updating head VA with  from /0 and >0/0 to >0/0
updating head 22301 with  from /0 and \0>0/0 to \0>0/0
updating head 7037686710 with  from /0 and >0*/0 to >0*/0
updating head 2543 with /1 from /1 and <0* to <0*/1
-BOS-	-BOS-	-BOS-
-ROOT-	-ROOT

### Planar extraction for all UD Trees

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# Root directory; every sub-directory is treated as one treebank.
ud_path="/home/droca1/Treebanks/20ag/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
result_columns = ["treebank","n_trees","planarity","r_deps","l_deps","avg_dependants"]

# Collect one row per treebank and build the DataFrame once at the end.
# Growing a frame with pd.concat inside the loop is quadratic and, when the
# first operand is empty, triggers a FutureWarning in recent pandas versions.
result_rows = []
for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")

    # Full paths of all CoNLL-U files in this treebank folder.
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_trees = []
    for conllu_file in conllu_files:
        # conllu_file is already a full path, so no second join is needed.
        total_trees.extend(D_Tree.read_conllu_file(conllu_file, filter_projective=False))

    plan_perc = D_Tree.get_planarity_percentage(total_trees)
    r_deps, l_deps = D_Tree.get_dependency_direction_percentage(total_trees)
    avg_dependants = D_Tree.get_avg_dependants(total_trees)
    # Only the first element of plan_perc is reported in the "planarity" column.
    result_rows.append([treebank_name, len(total_trees), plan_perc[0], r_deps, l_deps, avg_dependants])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Sort once and reuse the result; the original sorted twice and discarded
# the first result.
sorted_results = results_df.sort_values(by=["planarity"], ascending=False)
print(sorted_results.to_latex(index=False))

### Hexatag

In [2]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.const_tree import C_Tree
from nltk.tree import Tree

def pt(t, d=True):
    """Pretty-print one tree or a list of trees with nltk.

    Each tree is converted with ``str()``, re-parsed by ``Tree.fromstring``
    and drawn with ``pretty_print()``.

    Args:
        t: a single tree object, or a ``list`` of them. For a list, every
           element is printed as-is first, then every element is drawn.
        d: switch for the whole function; when falsy nothing is printed.

    Returns:
        None.
    """
    if not d:
        return

    # Anything that is not exactly a list is handled as a single tree.
    if type(t) is not list:
        Tree.fromstring(str(t)).pretty_print()
        return

    for tree in t:
        print(tree)
    for tree in t:
        Tree.fromstring(str(tree)).pretty_print()

# UD English-EWT test split, read with the projectivity filter switched on.
path = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-test.conllu"
encoder = D_Brk4BitsEncoding(separator = "[_]")
trees = D_Tree.read_conllu_file(path, filter_projective=True)
sample = trees[0]

# LaTeX source of the dependency tree.
latex_source = D_Tree.to_latex(sample)
print(latex_source)

# Convert the dependency tree with D_Tree.to_bht and draw the result.
bht = D_Tree.to_bht(sample)

# Alternative rendering (disabled): print(C_Tree.to_latex(bht))
pt(bht, d=True)


\begin{dependency}[theme = simple]
\begin{deptext}[row sep=.25em, column sep=1.5em]
$i$ \& 0 \& 1 \& 2 \& 3 \& 4 \& 5 \& 6 \& 7 \\ 
$w_i$ \& -ROOT- \& What \& if \& Google \& Morphed \& Into \& GoogleOS \& ? \\ 
\end{deptext}
\depedge{2}{3}{root}
\depedge{6}{4}{mark}
\depedge{6}{5}{nsubj}
\depedge{3}{6}{advcl}
\depedge{8}{7}{case}
\depedge{6}{8}{obl}
\depedge{6}{9}{punct}
\end{dependency}

        L                                               
   _____|____                                            
  |          L                                          
  |      ____|______________                             
  |     |                   L                           
  |     |     ______________|_______                     
  |     |    |                      L                   
  |     |    |         _____________|_____               
  |     |    |        |                   R             
  |     |    |        |              _____|___           
  |     |    |        R         

### Remove multiword-token lines from CoNLL-U files

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import re

# Root directory; every sub-directory is treated as one treebank.
ag20_path="/home/droca1/Treebanks/20ag/"
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Lines that start with "<digits>-", e.g. an ID range such as "3-4".
multiword_line = re.compile(r"^\d+-.*")

for ag20_folder in ag20_folders:
    # Remove leftover "*-clean.conllu" files. This is done with os.remove
    # rather than a shell "rm" built by string formatting, so folder names
    # are never interpreted by a shell. Dot-files are skipped, as a shell
    # glob would skip them.
    for file_name in os.listdir(ag20_folder):
        stale_path = os.path.join(ag20_folder, file_name)
        if (file_name.endswith("-clean.conllu")
                and not file_name.startswith(".")
                and os.path.isfile(stale_path)):
            os.remove(stale_path)

    print("[INFO] Processing",ag20_folder)
    # Full paths of all CoNLL-U files in this treebank folder.
    conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(".conllu"))]

    for conllu_file in conllu_files:
        # NOTE(review): the file is rewritten IN PLACE. The original cell
        # also computed a "-clean.conllu" output path but never wrote to it;
        # confirm that overwriting the source treebank is intended.
        with open(conllu_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        kept_lines = [line for line in lines if not multiword_line.match(line)]

        with open(conllu_file, "w", encoding="utf-8") as f:
            f.writelines(kept_lines)

### Encode and generate machamp config for training

In [2]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import json

def _seq_task(column_idx):
    """Return a task entry of type "seq" that reads column `column_idx`."""
    return {
        "task_type":"seq",
        "column_idx":column_idx
    }


def _dataset_config(tasks):
    """Wrap `tasks` in the "dependency" dataset block shared by both templates.

    The "XXX" paths are placeholders to be filled in before the config is
    written out.
    """
    return {
        "dependency":{
            "train_data_path":"XXX",
            "dev_data_path":"XXX",
            "word_idx":0,
            "tasks":tasks
        }
    }


# Single-task template: one "label" task on column 2.
config_singletask = _dataset_config({"label":_seq_task(2)})

# Multi-task template: "brk" on column 2 and "reltype" on column 3.
config_multitask = _dataset_config({"brk":_seq_task(2), "reltype":_seq_task(3)})

ag20_path="/home/droca1/Treebanks/20ag/"
# Keep directories only: a stray regular file in the root would make the
# os.listdir() call inside the loop fail.
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Encoder used to linearize every tree. The original cell first built a
# D_Brk7BitsEncoding and immediately overwrote it; swap the class here to
# change the encoding.
encoder = D_Brk2PBasedEncoding(separator="[_]")

for ag20_folder in ag20_folders:
    print("[INFO] Processing",ag20_folder)
    # Full paths of all CoNLL-U files in this treebank folder.
    conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(".conllu"))]

    train_file = None
    dev_file = None

    # Encode each split and write it next to its source as a ".labels" file.
    for conllu_file in conllu_files:
        output_file = conllu_file.replace(".conllu", ".labels")

        # Decide the split from the file name only, so that a directory
        # component containing "train" or "dev" cannot cause a false match.
        output_name = os.path.basename(output_file)
        if "train" in output_name:
            train_file = output_file
        elif "dev" in output_name:
            dev_file = output_file

        trees = D_Tree.read_conllu_file(conllu_file, filter_projective=True)

        with open(output_file, "w", encoding="utf-8") as f:
            for tree in trees:
                lin_tree = encoder.encode(tree)
                f.write(lin_tree.to_string(f_idx_dict=None, add_bos_eos=True, separate_columns=True) +"\n")

    # Without both splits there is nothing valid to put in the config.
    # Previously this raised AttributeError on None.replace.
    if train_file is None or dev_file is None:
        print("[WARN] Missing train or dev split in", ag20_folder, "- config.json not written")
        continue

    # Build a fresh config instead of mutating the shared template: a shallow
    # .copy() shares the nested "dependency" dict, so assigning the paths
    # would also rewrite config_multitask itself.
    # The paths are rewritten from the local user name to the remote one.
    current_config = {
        **config_multitask,
        "dependency": {
            **config_multitask["dependency"],
            "train_data_path": train_file.replace('droca1', 'diego.roca'),
            "dev_data_path": dev_file.replace('droca1', 'diego.roca'),
        },
    }

    config_name = "config.json"
    with open(os.path.join(ag20_folder, config_name), "w") as f:
        json.dump(current_config, f, indent=4)

[INFO] Processing /home/droca1/Treebanks/20ag/UD_Russian-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Finnish-TDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_German-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/PENN_TREEBANK
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Ancient_Greek-Perseus
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Chinese-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Hebrew-HTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Tamil-TTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Uyghur-UDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Wolof-WTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_English-EWT
