## Dependency Linearization Playground

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.linearized_tree import LinearizedTree

# Path of the dependency treebank loaded by this cell.
deps_treebank = "./ptb-dev.conllx"
# NOTE(review): filter_projective=True presumably keeps only projective
# trees -- confirm against D_Tree.read_conllu_file.
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

# Use the first tree as the running example; `trees` and `sample_tree` are
# notebook-level names that the following cells read.
sample_tree = trees[0]
print(sample_tree)
# Print whatever D_Tree.to_latex returns for the sample tree.
print(D_Tree.to_latex(sample_tree))

<div style="text-align:center"><img src="./pics/notebooks/d_tree_1.png" /></div>

### Encode with the 4-bit encoding

In [None]:
# Round-trip the sample tree: encode it, print the linearized form, decode it
# again and score the decoded tree against the original.
enc_4b = D_Brk4BitsEncoding(separator="_")
lin_tree = enc_4b.encode(sample_tree)
print(lin_tree)
dec_tree = enc_4b.decode(lin_tree)
# NOTE(review): the "Test for all trees" cell treats a score of exactly 1 as a
# successful round trip -- confirm the scale returned by las_score.
print("LAS =",dec_tree.las_score(sample_tree))

### Test for all trees

In [None]:
# Round-trip every loaded tree through the encoder and report each tree whose
# decoded form does not score exactly 1 against the original.
enc_4b = D_Brk4BitsEncoding(separator="_")
# NOTE: the loop variable deliberately keeps the name `sample_tree`, so after
# this cell the notebook-level `sample_tree` refers to the last tree in `trees`.
for i, sample_tree in enumerate(trees):
    lin_tree = enc_4b.encode(sample_tree)
    dec_tree = enc_4b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)

    if las != 1:
        print("Error at tree",i,"length",len(sample_tree))
        # Reuse the score computed above instead of scoring the pair again.
        print("LAS =",las)

### Planar extraction

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# Build a per-treebank table (name, number of trees, planarity figure) over
# every sub-folder of ud_path and print it as LaTeX, sorted by planarity.

# read all folders
ud_path="/home/poli/Treebanks/gold/universal_dependencies/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
result_columns = ["treebank","n_trees","planarity"]
# Rows are collected in a plain list and turned into a DataFrame once, which
# avoids repeatedly concatenating onto an (initially empty) DataFrame.
result_rows = []

for ud_folder in ud_folders:
    treebank_name = os.path.basename(ud_folder)

    # get all conllu files in ud_folder (already joined with the folder path)
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_trees = []
    for conllu_file in conllu_files:
        # conllu_file is already a full path; joining it with ud_folder again
        # would duplicate the prefix whenever ud_path is relative.
        deps_treebank = conllu_file
        trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
        total_trees += trees

    # NOTE(review): only element 0 of the returned value is used; a folder
    # without .conllu files passes an empty list here -- confirm that
    # get_planarity_percentage accepts it.
    plan_perc = D_Tree.get_planarity_percentage(total_trees)
    result_rows.append([treebank_name, len(total_trees), plan_perc[0]])

results_df = pd.DataFrame(result_rows, columns=result_columns)
# sort_values returns a new frame; keep it so the ordering actually applies.
results_df = results_df.sort_values(by=["planarity"], ascending=False)
# Wrap each treebank name in $...$ for the LaTeX output.
results_df["treebank"] = results_df["treebank"].apply(lambda x: "${}$".format(x))
print(results_df.to_latex(index=False))

In [6]:
import json

def _seq_task(column_idx):
    """Return a task entry of type "seq" that points at *column_idx*."""
    return {"task_type": "seq", "column_idx": column_idx}


def _dependency_config(tasks):
    """Wrap *tasks* in the "dependency" config skeleton.

    Both data paths are the "XXX" placeholder and "word_idx" is 0.
    """
    return {
        "dependency": {
            "train_data_path": "XXX",
            "dev_data_path": "XXX",
            "word_idx": 0,
            "tasks": tasks,
        }
    }


# Template with a single "label" task read from column 2.
config_singletask = _dependency_config({"label": _seq_task(2)})

# Template with two tasks: "brk" from column 2 and "reltype" from column 3.
config_multitask = _dependency_config(
    {"brk": _seq_task(2), "reltype": _seq_task(3)}
)

In [7]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import copy

# For every treebank folder under ag20_path, and for both the multitask and
# the single-task setting: encode each .conllu file into a .labels file next
# to it and write a JSON config pointing at the train/dev label files.
ag20_path="/home/poli/Treebanks/20ag/"
ag20_folders = [os.path.join(ag20_path, f) for f in os.listdir(ag20_path) if os.path.isdir(os.path.join(ag20_path, f))]
mtl = [True, False]
encoder = D_Brk4BitsEncoding(separator="[_]")
conllu_suffix = ".conllu"

for t in mtl:
    # Everything that depends only on the mode is chosen once per mode.
    target_extension = "_st.labels" if not t else "_mtl.labels"
    config_name = "config_st.json" if not t else "config_mtl.json"
    # Use the template that matches the mode; previously the single-task
    # template was written for both modes and config_multitask was unused.
    config_template = config_multitask if t else config_singletask

    for ag20_folder in ag20_folders:
        print("[INFO] Processing",ag20_folder)
        # get all conllu files (already joined with the folder path)
        treebank_name = os.path.basename(ag20_folder)
        conllu_files = [os.path.join(ag20_folder, f) for f in os.listdir(ag20_folder) if (f.endswith(conllu_suffix))]

        # Stay None (JSON null) when the folder has no train/dev file.
        train_file = None
        dev_file = None

        # encode
        for conllu_file in conllu_files:
            deps_treebank = conllu_file
            # Swap only the trailing ".conllu" so a matching substring earlier
            # in the path is left alone.
            output_file = conllu_file[:-len(conllu_suffix)] + target_extension

            # Classify by file name only; a directory called e.g. "dev" must
            # not turn every file below it into the dev split.
            output_name = os.path.basename(output_file)
            if "train" in output_name:
                train_file = output_file
            elif "dev" in output_name:
                dev_file = output_file

            trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)

            # Explicit UTF-8: the treebanks cover non-Latin scripts and the
            # platform default encoding is not guaranteed to handle them.
            with open(output_file, "w", encoding="utf-8") as f:
                for tree in trees:
                    lin_tree = encoder.encode(tree)
                    # NOTE(review): the third argument is the multitask flag;
                    # presumably it splits the label into separate columns --
                    # confirm against LinearizedTree.to_string.
                    f.write(lin_tree.to_string(None, True, t) +"\n")

        # Deep copy so the nested "dependency" dict of the shared template is
        # not modified by the path assignments below.
        current_config = copy.deepcopy(config_template)
        current_config["dependency"]["train_data_path"] = train_file
        current_config["dependency"]["dev_data_path"] = dev_file

        with open(os.path.join(ag20_folder, config_name), "w", encoding="utf-8") as f:
            json.dump(current_config, f, indent=4)

[INFO] Processing /home/poli/Treebanks/20ag/UD_Finnish-TDT
[INFO] Processing /home/poli/Treebanks/20ag/PTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_Wolof-WTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_Ancient_Greek-Perseus
[INFO] Processing /home/poli/Treebanks/20ag/UD_English-EWT
[INFO] Processing /home/poli/Treebanks/20ag/UD_Chinese-GSD
[INFO] Processing /home/poli/Treebanks/20ag/UD_Russian-GSD
[INFO] Processing /home/poli/Treebanks/20ag/UD_Tamil-TTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_German-GSD
[INFO] Processing /home/poli/Treebanks/20ag/UD_Hebrew-HTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_Uyghur-UDT
[INFO] Processing /home/poli/Treebanks/20ag/UD_Finnish-TDT
[INFO] Processing /home/poli/Treebanks/20ag/PTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_Wolof-WTB
[INFO] Processing /home/poli/Treebanks/20ag/UD_Ancient_Greek-Perseus
[INFO] Processing /home/poli/Treebanks/20ag/UD_English-EWT
[INFO] Processing /home/poli/Treebanks/20ag/UD_Chinese-GSD
[INFO] P