# Dependency Linearization Playground

## Dependencies to latex

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# UD English-EWT training split, loaded with projective filtering disabled
deps_treebank = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)

# Index of the tree rendered below
smallest_non_projective = 3992
tree = trees[smallest_non_projective]

# Encode the tree with the 7-bit encoding and collapse every per-label bit
# sequence into a single string such as "0101100"
encoder = D_Brk7BitsEncoding()
bits = D_Brk7BitsEncoding.labels_to_bits(encoder.encode(tree).labels)
bracket_bits = ["".join(map(str, b)) for b in bits]

print(D_Tree.to_latex(
    tree,
    include_col=False,
    planar_separate=True,
    planar_colors=['black', 'red'],
    additional_labels=bracket_bits,
))

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# UD English-EWT training split, loaded with projective filtering enabled
deps_treebank = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

# Pick a tree of length 8, skipping the first `n_skips` matches.
# Index 0 is kept when the treebank runs out of matches.
target_tree = 0
n_skips = 2 # just in case the tree is not good
for i, tree in enumerate(trees):
    if len(tree) != 8:
        continue
    if n_skips > 0:
        n_skips -= 1
        continue
    target_tree = i
    break

tree = trees[target_tree]

encoder = D_Brk4BitsEncoding()

# String form of the `xi` field of every label
brackets = [str(label.xi) for label in encoder.encode(tree).labels]

# Per-label bit sequences collapsed into strings
bits = D_Brk4BitsEncoding.labels_to_bits(encoder.encode(tree).labels)
bracket_bits = ["".join(map(str, b)) for b in bits]

print(D_Tree.to_latex(
    tree,
    include_col=False,
    planar_separate=True,
    planar_colors=['black', 'red'],
    additional_labels=bracket_bits,
))

## Encode with 7-bits encoding

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Round-trip check over the UD English-EWT training split: every tree is
# encoded with the 7-bit encoding, decoded back, and scored against the
# original. Trees whose score is not exactly 1 are reported.
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
enc_7b = D_Brk7BitsEncoding(separator="_")
for i, sample_tree in enumerate(trees):
    lin_tree = enc_7b.encode(sample_tree)
    dec_tree = enc_7b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)

    if las != 1:
        print("Error at tree",i,"length",len(sample_tree))
        print(D_Tree.to_latex(sample_tree))
        print(lin_tree)
        # reuse the score computed above instead of scoring a second time
        print("LAS =",las)

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# Round-trip one tree (index 6114) through the 7-bit encoding and show its LAS
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
enc_7b = D_Brk7BitsEncoding(separator="[_]")
sample_tree = trees[6114]

# linearized (encoded) form
lin_tree = enc_7b.encode(sample_tree)
print(lin_tree)

# decoded form, scored against the original tree
dec_tree = enc_7b.decode(lin_tree)
las = dec_tree.las_score(sample_tree)
print("LAS =", las)

## Planar extraction for all UD Trees

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# For every treebank folder under `ud_path`, gather all trees of its .conllu
# files and tabulate the statistics reported by D_Tree (projectivity,
# planarity, dependency direction, average dependants). The table is printed
# as LaTeX.
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
columns = ["Corpus","n_trees","1-planar","proj","r_deps","l_deps","avg_dependants"]
rows = []

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")

    # full paths of all conllu files in ud_folder
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_trees = []
    for conllu_file in conllu_files:
        total_trees += D_Tree.read_conllu_file(conllu_file, filter_projective=False)

    proj                        = D_Tree.get_projectivity_percentage(total_trees)
    planar1, planar2, planarN   = D_Tree.get_planarity_percentage(total_trees)
    r_deps, l_deps              = D_Tree.get_dependency_direction_percentage(total_trees)
    avg_dependants              = D_Tree.get_avg_dependants(total_trees)

    # Keyed by column name so a value cannot land under the wrong header
    # (the positional row used previously had "1-planar" and "proj" swapped).
    rows.append({
        "Corpus": treebank_name,
        "n_trees": len(total_trees),
        "1-planar": planar1,
        "proj": proj,
        "r_deps": r_deps,
        "l_deps": l_deps,
        "avg_dependants": avg_dependants,
    })

# Build the frame once instead of concatenating onto an empty frame per folder
results_df = pd.DataFrame(rows, columns=columns)

print(results_df.to_latex(index=False, float_format="{:0.10}".format))

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
# Gather the trees of every .conllu file found in the Penn Treebank folder
ptb_path="/home/droca1/Treebanks/20ag/PENN_TREEBANK/"
ptb_files = [os.path.join(ptb_path, name) for name in os.listdir(ptb_path) if name.endswith(".conllu")]

total_trees = []
for ptb_file in ptb_files:
    total_trees.extend(D_Tree.read_conllu_file(ptb_file))

# Print the trees for which the greedy split returns two non-empty parts
for tree in total_trees:
    p1, p2 = D_Tree.two_planar_greedy(tree)
    if len(p1) == 0 or len(p2) == 0:
        continue
    print(tree)

## Label count

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# For every treebank folder under `ud_path`, count the distinct labels
# (string form of each label's `xi` field) that each bracketing encoding
# produces, excluding the "-NONE-" placeholder. The table is printed as LaTeX.
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
columns = ["Corpus","BRK", "BRK2P", "BRK4B", "BRK7B"]

# key -> encoder; insertion order matches the order of the table columns
encoders = {
    "brk":   D_BrkBasedEncoding(separator="[_]",   displacement = True),
    "brk2p": D_Brk2PBasedEncoding(separator="[_]", displacement = True),
    "brk4b": D_Brk4BitsEncoding(separator="[_]"),
    "brk7b": D_Brk7BitsEncoding(separator="[_]"),
}

rows = []
for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    # full paths of all conllu files in ud_folder
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]

    # Distinct labels per encoding, collected directly in sets so the full
    # label list of a treebank never has to be held in memory.
    total_labels = {key: set() for key in encoders}

    for conllu_file in conllu_files:
        trees = D_Tree.read_conllu_file(conllu_file,
                                        filter_projective=False)
        for t in trees:
            for key, enc in encoders.items():
                # Each encoder gets its own deep copy of the tree
                # (presumably because encode() may modify its input).
                encoded = enc.encode(copy.deepcopy(t))
                total_labels[key].update(str(lbl.xi) for lbl in encoded.labels)

    # debug output: the 4-bit label inventory before the placeholder is dropped
    print(total_labels["brk4b"])

    # Remove the placeholder from every inventory. discard() is a no-op when
    # it is absent; the previous code tested for "NONE" on the BRK set, so
    # "-NONE-" was never removed from it.
    for labels in total_labels.values():
        labels.discard("-NONE-")

    rows.append([treebank_name] + [len(total_labels[key]) for key in encoders])

# Build the frame once instead of concatenating onto an empty frame per folder
results_df = pd.DataFrame(rows, columns=columns)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

## Coverage

### Number of dependency arcs 

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# For every treebank folder under `ud_path`, round-trip each tree through the
# four encodings and report, per encoding, the LAS of the decoded tree against
# the original, averaged over the trees of the treebank.
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
columns = ["Corpus","BRK", "BRK-2P","BRK-4B","BRK-7B"]

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = False)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = True)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

# (key, encoder, strip_dummy): remove_dummy() is called on the linearized tree
# before decoding only for the two bracket-based encoders.
encoders = [
    ("brk",   ebrk,   True),
    ("brk2p", ebrk2p, True),
    ("brk4b", ebrk4b, False),
    ("brk7b", ebrk7b, False),
]

rows = []
for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    # full paths of all conllu files in ud_folder
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]

    las_sum = {key: 0.0 for key, _, _ in encoders}
    n_trees = 0
    for conllu_file in conllu_files:
        trees = D_Tree.read_conllu_file(conllu_file,
                                        filter_projective=False)
        n_trees += len(trees)
        for t in trees:
            for key, enc, strip_dummy in encoders:
                # every encoder works on its own deep copy of the tree
                lin_tree = enc.encode(copy.deepcopy(t))
                if strip_dummy:
                    lin_tree.remove_dummy()
                las_sum[key] += enc.decode(lin_tree).las_score(t)

    # A folder without trees would otherwise raise ZeroDivisionError below
    if n_trees == 0:
        print("[WARN] No trees found in", ud_folder, "- skipping")
        continue

    rows.append([treebank_name] + [las_sum[key] / n_trees for key, _, _ in encoders])

# Build the frame once instead of concatenating onto an empty frame per folder
results_df = pd.DataFrame(rows, columns=columns)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

### Number of Trees

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# For every treebank folder under `ud_path`, round-trip each tree through the
# four encodings and report, per encoding, the fraction of trees whose decoded
# version scores a LAS of exactly 1 against the original.
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
columns = ["Corpus","BRK","BRK-2P","BRK-4B","BRK-7B"]

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = False)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = False)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

# (key, encoder, strip_dummy): remove_dummy() is called on the linearized tree
# before decoding only for the two bracket-based encoders.
encoders = [
    ("brk",   ebrk,   True),
    ("brk2p", ebrk2p, True),
    ("brk4b", ebrk4b, False),
    ("brk7b", ebrk7b, False),
]

rows = []
for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    # full paths of all conllu files in ud_folder
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]

    exact = {key: 0.0 for key, _, _ in encoders}
    n_trees = 0
    for conllu_file in conllu_files:
        trees = D_Tree.read_conllu_file(conllu_file,filter_projective = False)
        n_trees += len(trees)
        for t in trees:
            for key, enc, strip_dummy in encoders:
                # every encoder works on its own deep copy of the tree
                lin_tree = enc.encode(copy.deepcopy(t))
                if strip_dummy:
                    lin_tree.remove_dummy()
                if enc.decode(lin_tree).las_score(t) == 1:
                    exact[key] += 1

    # A folder without trees would otherwise raise ZeroDivisionError below
    if n_trees == 0:
        print("[WARN] No trees found in", ud_folder, "- skipping")
        continue

    rows.append([treebank_name] + [exact[key] / n_trees for key, _, _ in encoders])

# Build the frame once instead of concatenating onto an empty frame per folder
results_df = pd.DataFrame(rows, columns=columns)

print(results_df.to_latex(index=False, float_format="{:0.5}".format))

### Different encodings coverage

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# Treebank under inspection; the PTB training file can be swapped in instead:
# conllu_file = "/home/poli/Treebanks/20ag/PTB/ptb-train.conllu"
conllu_file = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"

ebrk   = D_BrkBasedEncoding(separator="[_]", displacement = False)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")

total_trees = []
trees = D_Tree.read_conllu_file(conllu_file, filter_projective=False)

# Report the index of every tree that BRK round-trips with LAS == 1
# while BRK-4B does not.
for i, t in enumerate(trees):
    brk_lin = ebrk.encode(copy.deepcopy(t))
    brk_lin.remove_dummy()
    brk_dec = ebrk.decode(brk_lin)

    b4_dec = ebrk4b.decode(ebrk4b.encode(copy.deepcopy(t)))

    if t.las_score(brk_dec) != 1:
        continue
    if t.las_score(b4_dec) == 1:
        continue

    print("Tree with BRK but not with BRK-4B: ",i)
    t.remove_dummy()

## Hexatag

1) Transform the dependency trees into binary head constituent trees (BHT)
2) Transform the BHT into hexatags shaped as [<arrow>_<reltype>] where arrow is the corresponding hexatag arrow and reltype is the relationship type for the word whose index is being parsed
3) Implement decoding operation

In [1]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.const_tree import C_Tree
from nltk.tree import Tree

def pt(t, d=True):
    """Print one tree, or a list of trees, and pretty-print them with NLTK.

    Args:
        t: A single tree or a list of trees. Each tree is rendered by parsing
            ``str(tree)`` with ``nltk.tree.Tree.fromstring``, so its string
            form is presumably expected to be a bracketed tree.
        d: Debug switch; when falsy the function does nothing.

    Returns:
        None. All output goes to stdout.
    """
    if not d:
        return

    # isinstance (rather than an exact type comparison) also accepts list
    # subclasses, which previously fell through to the single-tree branch.
    if isinstance(t, list):
        # first the raw string form of every tree, then the drawings
        for item in t:
            print(item)
        for item in t:
            Tree.fromstring(str(item)).pretty_print()
        return

    Tree.fromstring(str(t)).pretty_print()

# First projective tree of the UD English-EWT test split
path = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-test.conllu"
encoder = D_Brk4BitsEncoding(separator = "[_]")
trees = D_Tree.read_conllu_file(path, filter_projective=True)
sample = trees[0]
print(sample)

# Round trip: dependency tree -> BHT -> dependency tree.
# For debugging, print(bht) or Tree.fromstring(str(bht)).pretty_print()
# shows the intermediate tree.
bht = D_Tree.to_bht(sample)
dec_tree = D_Tree.from_bht(bht)
print(dec_tree)


0	-ROOT-	_	-ROOT-	_	_	0	-NOREL-	_	_
1	What	what	PRON	WP	PronType=Int	0	root	0:root	_
2	if	if	SCONJ	IN	_	4	mark	4:mark	_
3	Google	Google	PROPN	NNP	Number=Sing	4	nsubj	4:nsubj	_
4	Morphed	morph	VERB	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	1	advcl	1:advcl:if	_
5	Into	into	ADP	IN	_	6	case	6:case	_
6	GoogleOS	GoogleOS	PROPN	NNP	Number=Sing	4	obl	4:obl:into	SpaceAfter=No
7	?	?	PUNCT	.	_	4	punct	4:punct	_


1	-NONE-	_	_	_	_	0	-NONE-	_	_
2	-NONE-	_	_	_	_	4	-NONE-	_	_
3	-NONE-	_	_	_	_	4	-NONE-	_	_
4	-NONE-	_	_	_	_	1	-NONE-	_	_
5	-NONE-	_	_	_	_	6	-NONE-	_	_
6	-NONE-	_	_	_	_	4	-NONE-	_	_
7	-NONE-	_	_	_	_	4	-NONE-	_	_




## Encode and generate machamp config for training

Clean multi-expression lines

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import re

# Remove, in place, every line whose first column is a range ID such as "3-4"
# (the multiword-token lines of CoNLL-U) from the test files of each
# sub-folder of `treebank_path`.
treebank_path="/home/droca1/Treebanks/UD_Spanish-AnCora"
treebank_folders = [os.path.join(treebank_path, f) for f in os.listdir(treebank_path) if os.path.isdir(os.path.join(treebank_path, f))]

# compiled once; matches lines that start with digits followed by a hyphen
range_id_line = re.compile(r"^\d+-.*")

for tb_f in treebank_folders:
    print("[INFO] Processing",tb_f)
    # full paths of the conllu files whose name contains 'test'
    conllu_files = [os.path.join(tb_f, f) for f in os.listdir(tb_f) if (f.endswith(".conllu") and 'test' in f)]

    for conllu_file in conllu_files:
        print("[INFO] Cleaning",conllu_file)

        # CoNLL-U files are UTF-8; do not depend on the platform default
        with open(conllu_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # the file is fully read above, so it can be overwritten safely
        with open(conllu_file, "w", encoding="utf-8") as f:
            f.writelines(line for line in lines if not range_id_line.match(line))

Perform 80% split of treebanks without dev set

In [None]:
# perform holdout of 20%
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
# Split the backed-up training file into train and dev: the first 80% of the
# trees, in file order, become the new train file and the rest the dev file.
treebank_path = "/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-train.backup"
trees = D_Tree.read_conllu_file(treebank_path, filter_projective=False)

# computed once so both slices are guaranteed to use the same boundary
split_idx = int(len(trees)*0.8)
trees_train = trees[:split_idx]
trees_dev = trees[split_idx:]

# CoNLL-U files are UTF-8; do not depend on the platform default encoding
with open("/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-train.conllu", "w", encoding="utf-8") as f:
    for t in trees_train:
        t.remove_dummy()
        f.write(str(t))
with open("/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-dev.conllu", "w", encoding="utf-8") as f:
    for t in trees_dev:
        t.remove_dummy()
        f.write(str(t))


Encode and generate machamp configuration

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import json

# Dataset configuration template with a single sequence task ("label")
# read from column 2. The "XXX" paths are placeholders.
# NOTE(review): this template is not referenced by the code visible in this cell.
config_singletask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "label":{
                "task_type":"seq",
                "column_idx":2
            }
        }
    }
}

# Dataset configuration template with two sequence tasks: "brk" read from
# column 2 and "reltype" read from column 3. The "XXX" paths are placeholders
# that are filled in with the generated train/dev label files.
# NOTE(review): presumably the column indices refer to the columns of the
# generated .labels files; confirm against the encoder's output format.
config_multitask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "brk":{
                "task_type":"seq",
                "column_idx":2
            },
            "reltype":{
                "task_type":"seq",
                "column_idx":3
            }
        }
    }
}

# For every encoding and every treebank folder: write one "<name>_<Encoder>.labels"
# file per .conllu file, optionally a "-clean.conllu" copy of the test split,
# and a "config_<Encoder>.json" pointing at the train/dev label files.
treebank_path = "/home/poli/Treebanks/d21/"
treebank_folders = [os.path.join(treebank_path, f) for f in os.listdir(treebank_path) if os.path.isdir(os.path.join(treebank_path, f))]

brk_bs = D_BrkBasedEncoding(separator="[_]", displacement=False)
brk_2p = D_Brk2PBasedEncoding(separator="[_]", displacement=False)
brk_4b = D_Brk4BitsEncoding(separator="[_]")
brk_7b = D_Brk7BitsEncoding(separator="[_]")

encodings = [brk_bs, brk_2p, brk_4b, brk_7b]
filter_projective = False

for encoder in encodings:
    encoder_name = encoder.__class__.__name__
    print("[INFO] Encoding with", encoder_name)
    target_extension = "_"+encoder_name+".labels"

    for tb_f in treebank_folders:
        print("[INFO] Processing",tb_f)
        # Input file names of this treebank. The "-clean.conllu" files written
        # further below are outputs of this cell and must not be re-encoded
        # on the next encoder pass.
        conllu_names = [f for f in os.listdir(tb_f)
                        if f.endswith(".conllu") and not f.endswith("-clean.conllu")]

        train_file = ""
        dev_file = ""

        # encode
        for conllu_name in conllu_names:
            deps_treebank = os.path.join(tb_f, conllu_name)
            output_file = os.path.join(tb_f, conllu_name.replace(".conllu", target_extension))

            # Decide the split from the file name only, so that directory
            # names containing "train" or "dev" cannot misclassify a file.
            if "train" in conllu_name:
                train_file = output_file
            elif "dev" in conllu_name:
                dev_file = output_file

            trees = D_Tree.read_conllu_file(deps_treebank,
                                            filter_projective=filter_projective)

            with open(output_file, "w", encoding="utf-8") as f:
                for tree in trees:
                    lin_tree = encoder.encode(tree)
                    f.write(lin_tree.to_string(f_idx_dict=None,
                                               add_bos_eos=True,
                                               separate_columns=True) +"\n")

            # save a clean test
            if 'test' in conllu_name and filter_projective:
                clean_file = output_file.replace(".labels", "-clean.conllu")
                with open(clean_file, "w", encoding="utf-8") as f:
                    for tree in trees:
                        tree.remove_dummy()
                        f.write("# text = "+tree.get_sentence()+"\n")
                        f.write(str(tree))

        # Build a fresh config per treebank. dict.copy() is shallow, so
        # assigning into current_config["dependency"] used to overwrite the
        # paths inside the shared config_multitask template itself.
        # NOTE(review): replace() rewrites every 'poli' occurrence in the
        # path, not only the home-directory component.
        current_config = {
            "dependency": {
                **config_multitask["dependency"],
                "train_data_path": train_file.replace('poli', 'diego.roca'),
                "dev_data_path": dev_file.replace('poli', 'diego.roca'),
            }
        }

        config_name = "config_"+encoder_name+".json"
        with open(os.path.join(tb_f, config_name), "w", encoding="utf-8") as f:
            json.dump(current_config, f, indent=4)