## Dependency Linearization Playground

### Dependencies to latex

In [2]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# ptb-dev path
deps_treebank = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
smallest_non_projective = 3992
tree = trees[smallest_non_projective]

encoder = D_Brk7BitsEncoding()
bits = D_Brk7BitsEncoding.labels_to_bits(encoder.encode(tree).labels)
bracket_bits = []
for b in bits:
    b_str = [str(i) for i in b]
    bracket_bits.append("".join(b_str))

print(D_Tree.to_latex(tree, include_col=False, planar_separate=True, planar_colors=['black', 'red'], additional_labels=bracket_bits))

\begin{dependency}[theme = simple]
\begin{deptext}[row sep=.25em, column sep=1.5em]
0 \& 1 \& 2 \& 3 \& 4 \& 5 \& 6 \& 7 \\ 
-ROOT- \& What \& do \& I \& need \& to \& do \& ? \\ 
\texttt{0000100} \& \texttt{0110000} \& \texttt{0010000} \& \texttt{0000000} \& \texttt{1011100} \& \texttt{0010000} \& \texttt{1001010} \& \texttt{1010000} \\ 
\end{deptext}
\depedge[edge style={red}]{7}{2}{obj}
\depedge[edge style={black}]{5}{3}{aux}
\depedge[edge style={black}]{5}{4}{nsubj}
\depedge[edge style={black}]{1}{5}{root}
\depedge[edge style={black}]{7}{6}{mark}
\depedge[edge style={black}]{5}{7}{xcomp}
\depedge[edge style={black}]{5}{8}{punct}
\end{dependency}



In [3]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# ptb-dev path
deps_treebank = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

target_tree = 0
n_skips = 2 # just in case the tree is not good
for i,tree in enumerate(trees):
    if len(tree) == 8:
        if n_skips>0:
            n_skips-=1
            continue
        
        target_tree = i
        break

tree = trees[target_tree]

encoder = D_Brk4BitsEncoding()

brackets = [str(i.xi) for i in  encoder.encode(tree).labels]

bits = D_Brk4BitsEncoding.labels_to_bits(encoder.encode(tree).labels)
bracket_bits = []
for b in bits:
    b_str = [str(i) for i in b]
    bracket_bits.append("".join(b_str))

print(D_Tree.to_latex(tree, include_col=False, planar_separate=True, planar_colors=['black', 'red'], additional_labels=bracket_bits))

\begin{dependency}[theme = simple]
\begin{deptext}[row sep=.25em, column sep=1.5em]
0 \& 1 \& 2 \& 3 \& 4 \& 5 \& 6 \& 7 \\ 
-ROOT- \& It \& should \& continue \& to \& be \& defanged \& . \\ 
\texttt{0001} \& \texttt{0100} \& \texttt{0000} \& \texttt{1111} \& \texttt{0100} \& \texttt{0000} \& \texttt{1010} \& \texttt{1100} \\ 
\end{deptext}
\depedge[edge style={black}]{4}{2}{nsubj}
\depedge[edge style={black}]{4}{3}{aux}
\depedge[edge style={black}]{1}{4}{root}
\depedge[edge style={black}]{7}{5}{mark}
\depedge[edge style={black}]{7}{6}{aux:pass}
\depedge[edge style={black}]{4}{7}{xcomp}
\depedge[edge style={black}]{4}{8}{punct}
\end{dependency}



### Encode with 4-bits encoding

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

# ptb-dev path
deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
enc_7b = D_Brk7BitsEncoding(separator="_")
for i, sample_tree in enumerate(trees):
    lin_tree = enc_7b.encode(sample_tree)
    dec_tree = enc_7b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)
    
    if las != 1:
        print("Error at tree",i,"length",len(sample_tree))
        print(D_Tree.to_latex(sample_tree))
        print(lin_tree)
        print("LAS =",dec_tree.las_score(sample_tree))

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree

deps_treebank = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
sample_tree = trees[6114]
enc_7b = D_Brk7BitsEncoding(separator="[_]")
lin_tree = enc_7b.encode(sample_tree)
print(lin_tree)
dec_tree = enc_7b.decode(lin_tree)
print("LAS =",dec_tree.las_score(sample_tree))

### Planar extraction for all UD Trees

In [5]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import pandas as pd

# read all folders
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
results_df = pd.DataFrame(columns=["Corpus","n_trees","1-planar","r_deps","l_deps","avg_dependants"])

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    
    # get all conllu files in ud_folder
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_trees = []
    for conllu_file in conllu_files:
        deps_treebank = os.path.join(ud_folder, conllu_file)
        trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=False)
        total_trees += trees
    
    planar1,planar2,planarN = D_Tree.get_planarity_percentage(total_trees)
    r_deps, l_deps = D_Tree.get_dependency_direction_percentage(total_trees)
    avg_dependants = D_Tree.get_avg_dependants(total_trees)
    results_df = pd.concat([results_df, pd.DataFrame([[treebank_name, len(total_trees), str(planar1)+"%", r_deps, l_deps, avg_dependants]], 
                                                     columns=["Corpus","n_trees","1-planar","r_deps","l_deps","avg_dependants"])], ignore_index=True)

print(results_df.to_latex(index=False))

\begin{tabular}{lllrrr}
\toprule
Corpus & n_trees & 1-planar & r_deps & l_deps & avg_dependants \\
\midrule
UD-Galician-TreeGal & 1000 & 0.888% & 0.530257 & 0.469743 & 2.530101 \\
UD-Lithuanian-HSE & 263 & 0.8593155893536122% & 0.584018 & 0.415982 & 2.321700 \\
UD-Belarusian-HSE & 25231 & 0.9492291229043637% & 0.469295 & 0.530705 & 2.232214 \\
UD-Old-East-Slavic-RNC & 1070 & 0.6626168224299065% & 0.582177 & 0.417823 & 2.433108 \\
UD-Marathi-UFAL & 466 & 0.9592274678111588% & 0.508184 & 0.491816 & 2.362304 \\
UD-Welsh-CCG & 2338 & 0.9824636441402909% & 0.439404 & 0.560596 & 2.324992 \\
\bottomrule
\end{tabular}



In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
ptb_path="/home/droca1/Treebanks/20ag/PENN_TREEBANK/"
ptb_files = [os.path.join(ptb_path, f) for f in os.listdir(ptb_path) if f.endswith(".conllu")]
total_trees = []

for ptb_file in ptb_files:
    trees = D_Tree.read_conllu_file(ptb_file)
    total_trees += trees

for tree in total_trees:
    p1,p2 = D_Tree.two_planar_greedy(tree)
    if len(p2) != 0 and len(p1) != 0:
        print(tree)

### Label count

In [4]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# read all folders
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
results_df = pd.DataFrame(columns=["Corpus","BRK", "BRK2P", "BRK4B", "BRK7B"])

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = True)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = True)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    total_labels = {"brk":[], "brk2p":[], "brk4b":[], "brk7b":[]}
    
    total_trees = []
    for conllu_file in conllu_files:
        deps_treebank = os.path.join(ud_folder, conllu_file)
        trees = D_Tree.read_conllu_file(deps_treebank, 
                                        filter_projective=False)
        total_trees += trees
        
        for t in trees:
            t_brk = ebrk.encode(copy.deepcopy(t))
            total_labels["brk"] += [str(lbl) for lbl in t_brk.labels]
            t_brk7b = ebrk7b.encode(copy.deepcopy(t))
            total_labels["brk7b"] += [str(lbl) for lbl in t_brk7b.labels]
            t_brk_2p = ebrk2p.encode(copy.deepcopy(t))
            total_labels["brk2p"] += [str(lbl) for lbl in t_brk_2p.labels]
            t_brk4b = ebrk4b.encode(copy.deepcopy(t))
            total_labels["brk4b"] += [str(lbl) for lbl in t_brk4b.labels]

    
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    
    total_labels["brk"] = set(total_labels["brk"])
    total_labels["brk2p"] = set(total_labels["brk2p"])
    total_labels["brk4b"] = set(total_labels["brk4b"])
    total_labels["brk7b"] = set(total_labels["brk7b"])
    
    # remove none
    if "-NONE-" in total_labels["brk"]:
        total_labels["brk"].remove("-NONE-")
    if "-NONE-" in total_labels["brk2p"]:
        total_labels["brk2p"].remove("-NONE-")
    if "-NONE-" in total_labels["brk4b"]:
        total_labels["brk4b"].remove("-NONE-")
    if "-NONE-" in total_labels["brk7b"]:
        total_labels["brk7b"].remove("-NONE-")
    
    total_labels["brk"] = len(total_labels["brk"])
    total_labels["brk2p"] = len(total_labels["brk2p"])
    total_labels["brk4b"] = len(total_labels["brk4b"])
    total_labels["brk7b"] = len(total_labels["brk7b"])

    results_df = pd.concat([results_df, pd.DataFrame([[treebank_name, total_labels["brk"], total_labels["brk2p"], total_labels["brk4b"], total_labels["brk7b"]]],
                                                        columns=["Corpus","BRK", "BRK2P", "BRK4B", "BRK7B"])], ignore_index=True)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

\begin{tabular}{lllll}
\toprule
Corpus & BRK & BRK2P & BRK4B & BRK7B \\
\midrule
UD-Galician-TreeGal & 512 & 601 & 270 & 376 \\
UD-Lithuanian-HSE & 398 & 432 & 256 & 306 \\
UD-Belarusian-HSE & 1136 & 1479 & 477 & 926 \\
UD-Old-East-Slavic-RNC & 910 & 1181 & 378 & 715 \\
UD-Marathi-UFAL & 275 & 291 & 197 & 223 \\
UD-Welsh-CCG & 474 & 514 & 265 & 312 \\
\bottomrule
\end{tabular}



### Coverage

Extract dependency coverage for bracket encodings (dependency coverage can be undestood as the attachment score)

In [3]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# read all folders
ud_path="/home/poli/Treebanks/ag20/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
results_df = pd.DataFrame(columns=["Corpus","BRK", "BRKD", "BRK-2P","BRK-4B","BRK-7B"])

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = False)
ebrkd   = D_BrkBasedEncoding(separator="[_]",   displacement = True)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = True)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    injective = {"brk":0.0, "brkd":0.0, "brk2p":0.0, "brk4b":0.0, "brk7b":0.0}
    
    total_trees = []
    for conllu_file in conllu_files:
        deps_treebank = os.path.join(ud_folder, conllu_file)
        trees = D_Tree.read_conllu_file(deps_treebank, 
                                        filter_projective=False)
        total_trees += trees
        for t in trees:
            t_brk = ebrk.encode(copy.deepcopy(t))
            t_brk.remove_dummy()
            t_brk_dec = ebrk.decode(t_brk)
            injective["brk"] += t_brk_dec.las_score(t)

            t_brkd = ebrkd.encode(copy.deepcopy(t))
            t_brkd.remove_dummy()
            t_brkd_dec = ebrkd.decode(t_brkd)
            injective["brkd"] += t_brkd_dec.las_score(t)
            
            t_brk2p = ebrk2p.encode(copy.deepcopy(t))
            t_brk2p.remove_dummy()
            t_brk2p_dec = ebrk2p.decode(t_brk2p)
            injective["brk2p"] += t_brk2p_dec.las_score(t)
            
            t_brk4b = ebrk4b.encode(copy.deepcopy(t))
            t_brk4b_dec = ebrk4b.decode(t_brk4b)
            injective["brk4b"] += t_brk4b_dec.las_score(t)
            
            t_brk7b = ebrk7b.encode(copy.deepcopy(t))
            t_brk7b_dec = ebrk7b.decode(t_brk7b)
            injective["brk7b"] += t_brk7b_dec.las_score(t)
    
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    results_df = pd.concat([results_df, pd.DataFrame([[treebank_name, injective["brk"]/len(total_trees), injective["brkd"]/len(total_trees), injective["brk2p"]/len(total_trees), injective["brk4b"]/len(total_trees), injective["brk7b"]/len(total_trees)]],
                                                        columns=["Corpus","BRK","BRKD", "BRK-2P","BRK-4B","BRK-7B"])], ignore_index=True)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

\begin{tabular}{lrrrr}
\toprule
Corpus & BRK & BRK-2P & BRK-4B & BRK-7B \\
\midrule
UD-Galician-TreeGal & 0.99653929 & 0.99993996 & 0.9952312 & 0.99993996 \\
UD-Lithuanian-HSE & 0.99489936 & 0.99985376 & 0.9882175 & 0.99985376 \\
UD-Belarusian-HSE & 0.9976523 & 0.99997075 & 0.99460695 & 0.99997075 \\
UD-Old-East-Slavic-RNC & 0.98842905 & 0.99942381 & 0.97469781 & 0.99942381 \\
UD-Marathi-UFAL & 0.99812002 & 1.0 & 0.99325199 & 1.0 \\
UD-Welsh-CCG & 0.99922433 & 1.0 & 0.99931802 & 1.0 \\
\bottomrule
\end{tabular}



Extract full trees coverage for bracket encodings (trees coverage can be understood as the number of trees decoded w/las=1)

In [5]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# read all folders
ud_path="/home/poli/Treebanks/d21/"
ud_folders = [os.path.join(ud_path, f) for f in os.listdir(ud_path) if os.path.isdir(os.path.join(ud_path, f))]
results_df = pd.DataFrame(columns=["Corpus","Total Trees","BRK","BRK-2P","BRK-4B","BRK-7B"])

ebrk   = D_BrkBasedEncoding(separator="[_]",   displacement = True)
ebrk2p = D_Brk2PBasedEncoding(separator="[_]", displacement = True)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
ebrk7b = D_Brk7BitsEncoding(separator="[_]")

for ud_folder in ud_folders:
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    conllu_files = [os.path.join(ud_folder, f) for f in os.listdir(ud_folder) if f.endswith(".conllu")]
    injective = {"brk":0.0, "brk2p":0.0, "brk4b":0.0, "brk7b":0.0}
    
    total_trees = []
    for conllu_file in conllu_files:
        deps_treebank = os.path.join(ud_folder, conllu_file)
        trees = D_Tree.read_conllu_file(deps_treebank, 
                                        filter_projective=False)
        total_trees += trees
        for t in trees:
            t_brk = ebrk.encode(copy.deepcopy(t))
            t_brk.remove_dummy()
            t_brk_dec = ebrk.decode(t_brk)
            injective["brk"] += 1 if t_brk_dec.las_score(t)==1 else 0
            
            t_brk2p = ebrk2p.encode(copy.deepcopy(t))
            t_brk2p.remove_dummy()
            t_brk2p_dec = ebrk2p.decode(t_brk2p)
            injective["brk2p"] += 1 if t_brk2p_dec.las_score(t)==1 else 0
            
            t_brk4b = ebrk4b.encode(copy.deepcopy(t))
            t_brk4b_dec = ebrk4b.decode(t_brk4b)
            injective["brk4b"] += 1 if t_brk4b_dec.las_score(t)==1 else 0
            
            t_brk7b = ebrk7b.encode(copy.deepcopy(t))
            t_brk7b_dec = ebrk7b.decode(t_brk7b)
            injective["brk7b"] += 1 if t_brk7b_dec.las_score(t)==1 else 0
    
    treebank_name = (ud_folder.split("/")[-1]).replace("_","-")
    results_df = pd.concat([results_df, pd.DataFrame([[treebank_name,len(total_trees),injective["brk"]/len(total_trees), injective["brk2p"]/len(total_trees), injective["brk4b"]/len(total_trees), injective["brk7b"]/len(total_trees)]],
                                                        columns=["Corpus","Total Trees","BRK","BRK-2P","BRK-4B","BRK-7B"])], ignore_index=True)

print(results_df.to_latex(index=False, float_format="{:0.8}".format))

\begin{tabular}{llrrrr}
\toprule
Corpus & Total Trees & BRK & BRK-2P & BRK-4B & BRK-7B \\
\midrule
UD-Galician-TreeGal & 1000 & 0.945 & 0.945 & 0.945 & 0.945 \\
UD-Lithuanian-HSE & 263 & 0.94676806 & 0.94676806 & 0.94676806 & 0.94676806 \\
UD-Belarusian-HSE & 25231 & 0.97915263 & 0.97915263 & 0.97915263 & 0.97915263 \\
UD-Old-East-Slavic-RNC & 1070 & 0.81962617 & 0.81962617 & 0.81962617 & 0.81962617 \\
UD-Marathi-UFAL & 466 & 0.99141631 & 0.99141631 & 0.99141631 & 0.99141631 \\
UD-Welsh-CCG & 2338 & 0.99059025 & 0.99059025 & 0.99059025 & 0.99059025 \\
\bottomrule
\end{tabular}



Find trees covered by 4bits but not brk

In [9]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import copy
import numpy as np
import pandas as pd

# conllu_file = "/home/poli/Treebanks/20ag/PTB/ptb-train.conllu"
conllu_file = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"

ebrk   = D_BrkBasedEncoding(separator="[_]", displacement = False)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")

total_trees = []
trees = D_Tree.read_conllu_file(conllu_file, filter_projective=False)

for i,t in enumerate(trees):
    t_brk = ebrk.encode(copy.deepcopy(t))
    t_brk.remove_dummy()
    t_brk_dec = ebrk.decode(t_brk)
    
    t_brk_4b = ebrk4b.encode(copy.deepcopy(t))
    t_brk_4b_dec = ebrk4b.decode(t_brk_4b)
    
    if t.las_score(t_brk_dec) == 1 and t.las_score(t_brk_4b_dec) != 1:
        print("Tree with BRK but not with BRK-4B: ",i)
        t.remove_dummy()
        for gold_node, pred_node_4b, pred_node_brk in zip(t.nodes, t_brk_4b_dec.nodes, t_brk_dec.nodes):
            print(gold_node.id, gold_node.head,"||", pred_node_4b.id, pred_node_4b.head, "||", pred_node_brk.id, pred_node_brk.head)
            print("=====================================")

Tree with BRK but not with BRK-4B:  1311
1 17 || 1 17 || 1 17
2 9 || 2 9 || 2 9
3 9 || 3 9 || 3 9
4 5 || 4 5 || 4 5
5 3 || 5 3 || 5 3
6 7 || 6 7 || 6 7
7 3 || 7 3 || 7 3
8 9 || 8 9 || 8 9
9 1 || 9 1 || 9 1
10 17 || 10 17 || 10 17
11 16 || 11 16 || 11 16
12 14 || 12 14 || 12 14
13 14 || 13 14 || 13 14
14 11 || 14 11 || 14 11
15 16 || 15 16 || 15 16
16 17 || 16 17 || 16 17
17 0 || 17 16 || 17 0
18 19 || 18 19 || 18 19
19 16 || 19 0 || 19 16
20 21 || 20 21 || 20 21
21 19 || 21 19 || 21 19
22 23 || 22 23 || 22 23
23 19 || 23 19 || 23 19
24 23 || 24 23 || 24 23
25 26 || 25 26 || 25 26
26 23 || 26 23 || 26 23
27 30 || 27 30 || 27 30
28 30 || 28 30 || 28 30
29 30 || 29 30 || 29 30
30 26 || 30 26 || 30 26
31 19 || 31 19 || 31 19


In [8]:
conllu_file = "/home/poli/Treebanks/20ag/UD_English-EWT/en_ewt-ud-train.conllu"
trees_idxs = [1311]

ebrk   = D_BrkBasedEncoding(separator="[_]", displacement = False)
ebrk4b = D_Brk4BitsEncoding(separator="[_]")
trees = D_Tree.read_conllu_file(conllu_file, filter_projective = False)

for i in trees_idxs:
    t = trees[i]

    t_brk = ebrk.encode(copy.deepcopy(t))
    t_brk.remove_dummy()
    t_brk_dec = ebrk.decode(t_brk)

    t_brk_4b = ebrk4b.encode(copy.deepcopy(t))
    t_brk_4b_dec = ebrk4b.decode(t_brk_4b)
    t_brk_4b_dec.add_dummy_root()

    print(t)
    print(D_Tree.to_latex(t, include_col=True, additional_labels=['/']+[lbl.xi for lbl in t_brk_4b.labels]))
    print(D_Tree.to_latex(t, include_col=True, additional_labels=[lbl.xi for lbl in t_brk.labels]))
    print(D_Tree.to_latex(t_brk_4b_dec, include_col=True, additional_labels=[lbl.xi for lbl in t_brk_4b.labels]))

0	-ROOT-	_	-ROOT-	_	_	0	-NOREL-	_	_
1	Now	now	ADV	RB	_	17	advmod	17:advmod	_
2	that	that	SCONJ	IN	_	9	mark	9:mark	_
3	Afghanistan	Afghanistan	PROPN	NNP	Number=Sing	9	nsubj	9:nsubj	SpaceAfter=No
4	,	,	PUNCT	,	_	5	punct	5:punct	_
5	Iraq	Iraq	PROPN	NNP	Number=Sing	3	conj	3:conj:and|9:nsubj	_
6	and	and	CCONJ	CC	_	7	cc	7:cc	_
7	Libya	Libya	PROPN	NNP	Number=Sing	3	conj	3:conj:and|9:nsubj	_
8	are	be	AUX	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	9	cop	9:cop	_
9	out	out	ADV	RB	_	1	ccomp	1:ccomp	SpaceAfter=No
10	,	,	PUNCT	,	_	17	punct	17:punct	_
11	two	two	NUM	CD	NumType=Card	16	nummod	16:nummod	_
12	and	and	CCONJ	CC	_	14	cc	14:cc	_
13	a	a	DET	DT	Definite=Ind|PronType=Art	14	det	14:det	_
14	half	half	NOUN	NN	Number=Sing|NumType=Frac	11	conj	11:conj:and	_
15	terrorist	terrorist	ADJ	JJ	Degree=Pos	16	amod	16:amod	_
16	states	state	NOUN	NNS	Number=Plur	17	nsubj	17:nsubj	_
17	remain	remain	VERB	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	SpaceAfter=No
18	:	

### Hexatag

In [None]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.const_tree import C_Tree
from nltk.tree import Tree

def pt(t, d=True):
    if d:
        if type(t) is list:
            for i in t:
                print(i)
            for i in t:
                Tree.fromstring(str(i)).pretty_print()
        else:
            Tree.fromstring(str(t)).pretty_print()

path = "/home/droca1/Treebanks/20ag/UD_English-EWT/en_ewt-ud-test.conllu"
encoder = D_Brk4BitsEncoding(separator = "[_]")
trees = D_Tree.read_conllu_file(path, filter_projective=True)
sample = trees[0]

print(sample)
bht = D_Tree.to_bht(sample)
print(bht)
dec_tree = D_Tree.from_bht(bht)
print(dec_tree)


### Encode and generate machamp config for training

Clean multi-expression lines

In [1]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
import re

treebank_path="/home/poli/Treebanks/d21/"
treebank_folders = [os.path.join(treebank_path, f) for f in os.listdir(treebank_path) if os.path.isdir(os.path.join(treebank_path, f))]
mtl = [True, False]
encoder = D_Brk4BitsEncoding(separator="[_]")


for tb_f in treebank_folders:
    print("[INFO] Processing",tb_f)
    # get all conllu files
    treebank_name = (tb_f.split("/")[-1])
    conllu_files = [os.path.join(tb_f, f) for f in os.listdir(tb_f) if (f.endswith(".conllu") and 'test' in f)]
    
    train_file = None
    dev_file = None

    # encode
    for conllu_file in conllu_files:
        print("[INFO] Cleaning",conllu_file)
        deps_treebank = os.path.join(tb_f, conllu_file)
        output_file = os.path.join(tb_f, conllu_file)
        
        with open(deps_treebank, "r") as f:
            lines = f.readlines()
        
        with open(deps_treebank, "w") as f:
            for line in lines:
                if re.match(r"^\d+-.*", line):
                    continue
                f.write(line)

[INFO] Processing /home/poli/Treebanks/d21/UD_Galician-TreeGal
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-test.conllu
[INFO] Processing /home/poli/Treebanks/d21/UD_Lithuanian-HSE
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Lithuanian-HSE/lt_hse-ud-test.conllu
[INFO] Processing /home/poli/Treebanks/d21/UD_Belarusian-HSE
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Belarusian-HSE/be_hse-ud-test.conllu
[INFO] Processing /home/poli/Treebanks/d21/UD_Old_East_Slavic-RNC
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Old_East_Slavic-RNC/orv_rnc-ud-test.conllu
[INFO] Processing /home/poli/Treebanks/d21/UD_Marathi-UFAL
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Marathi-UFAL/mr_ufal-ud-test.conllu
[INFO] Processing /home/poli/Treebanks/d21/UD_Welsh-CCG
[INFO] Cleaning /home/poli/Treebanks/d21/UD_Welsh-CCG/cy_ccg-ud-test.conllu


Perform 80% split of treebanks without dev set

In [5]:
# perform holdout of 20%
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os
treebank_path = "/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-train.backup"
trees = D_Tree.read_conllu_file(treebank_path, filter_projective=False)

trees_train = trees[:int(len(trees)*0.8)]
trees_dev = trees[int(len(trees)*0.8):]

with open("/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-train.conllu", "w") as f:
        for t in trees_train:
                t.remove_dummy()
                f.write(str(t))
with open("/home/poli/Treebanks/d21/UD_Galician-TreeGal/gl_treegal-ud-dev.conllu", "w") as f:
        for t in trees_dev:
                t.remove_dummy()
                f.write(str(t))


Encode and generate machamp configuration

In [2]:
from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
import os

import json

config_singletask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "label":{
                "task_type":"seq",
                "column_idx":2
            }
        }
    }
}

config_multitask = {
    "dependency":{
        "train_data_path":"XXX",
        "dev_data_path":"XXX",
        "word_idx":0,
        "tasks":{
            "brk":{
                "task_type":"seq",
                "column_idx":2
            },
            "reltype":{
                "task_type":"seq",
                "column_idx":3
            }
        }
    }
}

treebank_path = "/home/poli/Treebanks/d21/"
treebank_folders = [os.path.join(treebank_path, f) for f in os.listdir(treebank_path) if os.path.isdir(os.path.join(treebank_path, f))]
mtl = [True, False]

brk_bs = D_BrkBasedEncoding(separator="[_]", displacement=False)
brk_2p = D_Brk2PBasedEncoding(separator="[_]", displacement=False)
brk_4b = D_Brk4BitsEncoding(separator="[_]")
brk_7b = D_Brk7BitsEncoding(separator="[_]")

encodings = [brk_bs, brk_2p, brk_4b, brk_7b]
filter_projective = False

for encoder in encodings:
    print("[INFO] Encoding with", encoder.__class__.__name__)

    for tb_f in treebank_folders:
        print("[INFO] Processing",tb_f)
        # get all conllu files
        treebank_name = (tb_f.split("/")[-1])
        conllu_files = [os.path.join(tb_f, f) for f in os.listdir(tb_f) if (f.endswith(".conllu"))]
        
        train_file = ""
        dev_file = ""

        # encode
        for conllu_file in conllu_files:
            deps_treebank = os.path.join(tb_f, conllu_file)
            output_file = os.path.join(tb_f, conllu_file)
            
            target_extension = "_"+encoder.__class__.__name__+".labels"
            output_file = output_file.replace(".conllu", target_extension)  
            
            if "train" in output_file:
                train_file = output_file
            elif "dev" in output_file:
                dev_file = output_file
            
            trees = D_Tree.read_conllu_file(deps_treebank, 
                                            filter_projective=filter_projective)

            with open(output_file, "w") as f:
                for tree in trees:
                    lin_tree = encoder.encode(tree)
                    f.write(lin_tree.to_string(f_idx_dict=None, 
                                            add_bos_eos=True, 
                                            separate_columns=True) +"\n")
            
            # save a clean test
            if 'test' in conllu_file and filter_projective:
                output_file = output_file.replace(".labels", "-clean.conllu")
                with open(output_file, "w") as f:
                    for tree in trees:
                        tree.remove_dummy()
                        f.write("# text = "+tree.get_sentence()+"\n")
                        f.write(str(tree))
        
        current_config = config_multitask.copy()
        current_config["dependency"]["train_data_path"] = train_file.replace('poli', 'diego.roca')
        current_config["dependency"]["dev_data_path"] = dev_file.replace('poli', 'diego.roca')

        config_name = "config_"+encoder.__class__.__name__+".json"
        with open(os.path.join(tb_f, config_name), "w") as f:
            json.dump(current_config, f, indent=4)

[INFO] Encoding with D_BrkBasedEncoding
[INFO] Processing /home/poli/Treebanks/d21/UD_Galician-TreeGal
[INFO] Processing /home/poli/Treebanks/d21/UD_Lithuanian-HSE
[INFO] Processing /home/poli/Treebanks/d21/UD_Belarusian-HSE
[INFO] Processing /home/poli/Treebanks/d21/UD_Old_East_Slavic-RNC
[INFO] Processing /home/poli/Treebanks/d21/UD_Marathi-UFAL
[INFO] Processing /home/poli/Treebanks/d21/UD_Welsh-CCG
[INFO] Encoding with D_Brk2PBasedEncoding
[INFO] Processing /home/poli/Treebanks/d21/UD_Galician-TreeGal
[INFO] Processing /home/poli/Treebanks/d21/UD_Lithuanian-HSE
[INFO] Processing /home/poli/Treebanks/d21/UD_Belarusian-HSE
[INFO] Processing /home/poli/Treebanks/d21/UD_Old_East_Slavic-RNC
[INFO] Processing /home/poli/Treebanks/d21/UD_Marathi-UFAL
[INFO] Processing /home/poli/Treebanks/d21/UD_Welsh-CCG
[INFO] Encoding with D_Brk4BitsEncoding
[INFO] Processing /home/poli/Treebanks/d21/UD_Galician-TreeGal
[INFO] Processing /home/poli/Treebanks/d21/UD_Lithuanian-HSE
[INFO] Processing /hom