## Dependency Linearization Playground

### Dependency tree to LaTeX

In [3]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Read the PTB development split (filter_projective=True is passed to the
# reader; presumably this keeps only projective trees -- confirm in D_Tree).
deps_treebank = "/home/droca1/Treebanks/20ag/Penn-Treebank/ptb-dev.conllu"
trees = D_Tree.read_conllu_file(
    deps_treebank,
    filter_projective=True,
)

# Pick one tree to play with; later cells reuse `sample_tree` and `trees`.
sample_tree = trees[554]

# Show the tree in its default text form, then as LaTeX source.
print(sample_tree)
latex_source = D_Tree.to_latex(sample_tree)
print(latex_source)

0	-ROOT-	_	-ROOT-	_	_	0	-NOREL-	_	_
1	Retailer	_	NN	NN	[None]	2	nsubj	_	_
2	Sees	_	VBZ	VBZ	[None]	0	root	_	_
3	Pitfalls	_	NNP	NNP	[None]	2	dobj	_	_
4	In	_	NNP	NNP	[None]	2	prep	_	_
5	Environmental	_	NNP	NNP	[None]	6	amod	_	_
6	Push	_	VB	VB	[None]	4	pobj	_	_


\begin{dependency}[theme = simple]
\begin{deptext}[row sep=.25em, column sep=1.5em]
$i$ \& 0 \& 1 \& 2 \& 3 \& 4 \& 5 \& 6 \\ 
$w_i$ \& -ROOT- \& Retailer \& Sees \& Pitfalls \& In \& Environmental \& Push \\ 
\end{deptext}
\depedge{4}{3}{nsubj}
\depedge{2}{4}{root}
\depedge{4}{5}{dobj}
\depedge{4}{6}{prep}
\depedge{8}{7}{amod}
\depedge{6}{8}{pobj}
\end{dependency}



<div style="text-align:center"><img src="./pics/notebooks/d_tree_1.png" /></div>

### Encode with the 4-bit encoding

In [None]:
# Round-trip the sample tree through the 4-bits encoding.
enc_4b = D_Brk4BitsEncoding(separator="_")

# Linearize the tree and display the resulting labels.
lin_tree = enc_4b.encode(sample_tree)
print(lin_tree)

# Decode the labels back into a tree and score it against the original.
dec_tree = enc_4b.decode(lin_tree)
las = dec_tree.las_score(sample_tree)
print("LAS =", las)

In [None]:
# Sanity check: encode and decode every loaded tree and report the ones whose
# decoded version does not reach a LAS of 1 against the original.
#
# The loop variable is named `tree` (not `sample_tree`) so that running this
# cell does not overwrite the `sample_tree` selected in the first cell.
enc_4b = D_Brk4BitsEncoding(separator="_")
for i, tree in enumerate(trees):
    lin_tree = enc_4b.encode(tree)
    dec_tree = enc_4b.decode(lin_tree)
    las = dec_tree.las_score(tree)

    if las != 1:
        print("Error at tree",i,"length",len(tree))
        # Reuse the score computed above instead of scoring a second time.
        print("LAS =",las)

### Planarity statistics for all UD treebanks

In [2]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# Every sub-directory of `ud_path` is treated as one treebank.
ud_path="/home/droca1/Treebanks/20ag/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
result_columns = ["treebank","n_trees","planarity","r_deps","l_deps","avg_dependants"]

# One row per treebank. Rows are collected in a plain list and turned into a
# DataFrame once after the loop, instead of re-concatenating a growing frame
# on every iteration.
result_rows = []
for ud_folder in ud_folders:
    treebank_name = os.path.basename(ud_folder).replace("_","-")

    # Pool the trees of every .conllu file in the folder (all splits together).
    # The trees go straight into `total_trees`, so the notebook-level `trees`
    # used by the earlier cells is not overwritten here.
    total_trees = []
    for file_name in os.listdir(ud_folder):
        if file_name.endswith(".conllu"):
            conllu_file = os.path.join(ud_folder, file_name)
            total_trees.extend(D_Tree.read_conllu_file(conllu_file, filter_projective=False))

    plan_perc = D_Tree.get_planarity_percentage(total_trees)
    r_deps, l_deps = D_Tree.get_dependency_direction_percentage(total_trees)
    avg_dependants = D_Tree.get_avg_dependants(total_trees)
    # Only the first element of `plan_perc` is reported in the table.
    result_rows.append([treebank_name, len(total_trees), plan_perc[0], r_deps, l_deps, avg_dependants])

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Sort once and keep the result; `results_df` itself stays in folder order.
results_sorted = results_df.sort_values(by=["planarity"], ascending=False)
print(results_sorted.to_latex(index=False))

\begin{tabular}{llrrrr}
\toprule
                treebank & n\_trees &  planarity &   r\_deps &   l\_deps &  avg\_dependants \\
\midrule
           PENN-TREEBANK &   43948 &   0.998953 & 0.487397 & 0.512603 &        2.295559 \\
           UD-Hebrew-HTB &    6143 &   0.987140 & 0.463491 & 0.536509 &        2.301833 \\
            UD-Tamil-TTB &     600 &   0.983333 & 0.685628 & 0.314372 &        2.262076 \\
          UD-English-EWT &   16622 &   0.972747 & 0.572834 & 0.427166 &        2.530780 \\
            UD-Wolof-WTB &    2107 &   0.970100 & 0.482173 & 0.517827 &        2.519338 \\
          UD-Chinese-GSD &    4997 &   0.969382 & 0.616777 & 0.383223 &        2.426972 \\
          UD-Russian-GSD &    5030 &   0.938767 & 0.490234 & 0.509766 &        2.263321 \\
          UD-Finnish-TDT &   15136 &   0.938689 & 0.528854 & 0.471146 &        2.365725 \\
           UD-Uyghur-UDT &    3456 &   0.918403 & 0.667189 & 0.332811 &        2.216945 \\
           UD-German-GSD &   15590 &   0.906

  print(results_df.sort_values(by=["planarity"], ascending=False).to_latex(index=False))


### Remove multiword-token lines from CoNLL-U files

In [5]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import re

# Every sub-directory of `ag20_path` is treated as one treebank.
ag20_path="/home/droca1/Treebanks/20ag/"
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Lines to drop: those that start with digits immediately followed by "-"
# (e.g. an ID such as "3-4"). Compiled once instead of once per line.
range_id_line = re.compile(r"^\d+-.*")

for ag20_folder in ag20_folders:
    # Remove "-clean.conllu" files in this folder. This is done with
    # os.remove rather than a shell `rm`, so the folder path is never
    # interpolated into a shell command and a folder without such files
    # produces no error output.
    for file_name in os.listdir(ag20_folder):
        stale_path = os.path.join(ag20_folder, file_name)
        if file_name.endswith("-clean.conllu") and os.path.isfile(stale_path):
            os.remove(stale_path)

    print("[INFO] Processing",ag20_folder)
    # get all conllu files
    conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(".conllu"))]

    # NOTE: each file is cleaned IN PLACE -- the original .conllu file is
    # overwritten, no "-clean" copy is produced.
    for conllu_file in conllu_files:
        # The encoding is given explicitly so the result does not depend on
        # the locale of the machine running the notebook.
        with open(conllu_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        kept_lines = [line for line in lines if not range_id_line.match(line)]

        # Nothing to remove: leave the file untouched (re-runs are no-ops).
        if len(kept_lines) == len(lines):
            continue

        with open(conllu_file, "w", encoding="utf-8") as f:
            f.writelines(kept_lines)

[INFO] Processing /home/droca1/Treebanks/20ag/UD_Russian-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Finnish-TDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_German-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Ancient_Greek-Perseus
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Chinese-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/Penn-Treebank
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Hebrew-HTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Tamil-TTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Uyghur-UDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Wolof-WTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_English-EWT


### Encode treebanks and generate the MaChAmp config for training

In [4]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import json

# Dataset-config templates. The "XXX" paths are placeholders that are filled
# in per treebank below; the templates themselves are never modified.
# `config_singletask` is not used by the loop below (only the multitask
# template is written out).
config_singletask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "label":{
                "task_type":"seq",
                "column_idx":2
            }
        }
    }
}

config_multitask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "brk":{
                "task_type":"seq",
                "column_idx":2
            },
            "reltype":{
                "task_type":"seq",
                "column_idx":3
            }
        }
    }
}

ag20_path="/home/droca1/Treebanks/20ag/"
# Keep directories only: a stray file next to the treebank folders would
# otherwise make os.listdir() fail inside the loop.
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]

# Encoder used to linearize the trees. D_Brk7BitsEncoding(separator="[_]")
# is constructed the same way and can be swapped in here.
encoder = D_Brk2PBasedEncoding(separator="[_]")

conllu_suffix = ".conllu"
labels_suffix = ".labels"

for ag20_folder in ag20_folders:
    print("[INFO] Processing",ag20_folder)

    train_file = None
    dev_file = None

    # Encode every .conllu file of the folder into a sibling .labels file.
    for file_name in os.listdir(ag20_folder):
        if not file_name.endswith(conllu_suffix):
            continue

        deps_treebank = os.path.join(ag20_folder, file_name)
        # Swap only the trailing extension of the file name, so a ".conllu"
        # substring elsewhere in the path is left alone.
        output_file = os.path.join(ag20_folder, file_name[:-len(conllu_suffix)] + labels_suffix)

        # The split is decided from the file name alone; a "train"/"dev"
        # substring in a directory name must not influence it.
        if "train" in file_name:
            train_file = output_file
        elif "dev" in file_name:
            dev_file = output_file

        trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)

        with open(output_file, "w", encoding="utf-8") as f:
            for tree in trees:
                lin_tree = encoder.encode(tree)
                f.write(lin_tree.to_string(f_idx_dict=None, add_bos_eos=True, separate_columns=True) +"\n")

    # Without both splits there is nothing sensible to put in the config;
    # report it and move on instead of failing on None.replace(...).
    if train_file is None or dev_file is None:
        print("[WARN] Skipping config for",ag20_folder,"(train or dev file not found)")
        continue

    # Build a fresh config from the template. A new "dependency" dict is
    # created on every iteration, so `config_multitask` keeps its "XXX"
    # placeholders (a shallow .copy() would share and mutate the nested dict).
    # The user name in the paths is rewritten -- presumably because training
    # runs under a different account; confirm before changing.
    current_config = {
        "dependency": {
            **config_multitask["dependency"],
            "train_data_path": train_file.replace('droca1', 'diego.roca'),
            "dev_data_path": dev_file.replace('droca1', 'diego.roca'),
        }
    }

    config_name = "config.json"
    with open(os.path.join(ag20_folder, config_name), "w", encoding="utf-8") as f:
        json.dump(current_config, f, indent=4)

[INFO] Processing /home/droca1/Treebanks/20ag/UD_Russian-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Finnish-TDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_German-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/PENN_TREEBANK
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Ancient_Greek-Perseus
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Chinese-GSD
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Hebrew-HTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Tamil-TTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Uyghur-UDT
[INFO] Processing /home/droca1/Treebanks/20ag/UD_Wolof-WTB
[INFO] Processing /home/droca1/Treebanks/20ag/UD_English-EWT
