## Dependency Linearization Playground

In [1]:
import os

from codelin.encs.enc_deps import *
from codelin.models.deps_tree import D_Tree
from codelin.models.linearized_tree import LinearizedTree

# Location of the CoNLL-U treebank to load. The default is the original
# author's local checkout; set the CODELIN_DEPS_TREEBANK environment variable
# to point at your own copy instead of editing this cell.
deps_treebank = os.environ.get(
    "CODELIN_DEPS_TREEBANK",
    "/home/poli/Treebanks/deps/UD_ENGLISH_EWT/test.conllu",
)
# filter_projective=True presumably keeps only projective trees -- confirm
# against D_Tree.read_conllu_file.
trees = D_Tree.read_conllu_file(deps_treebank, filter_projective=True)

# Show the first tree twice: its plain-text (column) form, then the LaTeX
# `dependency` environment source produced by D_Tree.to_latex.
sample_tree = trees[0]
print(sample_tree)
print(D_Tree.to_latex(sample_tree))

0	-ROOT-	_	-ROOT-	_	_	0	-NOREL-	_	_
1	What	what	PRON	WP	['PronType=Int']	0	root	0:root	_
2	if	if	SCONJ	IN	[None]	4	mark	4:mark	_
3	Google	Google	PROPN	NNP	['Number=Sing']	4	nsubj	4:nsubj	_
4	Morphed	morph	VERB	VBD	['Mood=Ind', 'Number=Sing', 'Person=3', 'Tense=Past', 'VerbForm=Fin']	1	advcl	1:advcl:if	_
5	Into	into	ADP	IN	[None]	6	case	6:case	_
6	GoogleOS	GoogleOS	PROPN	NNP	['Number=Sing']	4	obl	4:obl:into	SpaceAfter=No
7	?	?	PUNCT	.	[None]	4	punct	4:punct	_


\begin{dependency}[theme = simple]
\begin{deptext}[row sep=.25em, column sep=1.5em]
$w_i$ \& -ROOT- \& What \& if \& Google \& Morphed \& Into \& GoogleOS \& ? \\ 
\end{deptext}
\depedge{2}{3}{root}
\depedge{6}{4}{mark}
\depedge{6}{5}{nsubj}
\depedge{3}{6}{advcl}
\depedge{8}{7}{case}
\depedge{6}{8}{obl}
\depedge{6}{9}{punct}
\end{dependency}



<div style="text-align:center"><img src="./pics/notebooks/d_tree_1.png" /></div>

### Encode with the 4-bit encoding

In [2]:
# Round-trip the sample tree: encode it with the 4-bits encoder, decode the
# result back into a tree, and score the decoded tree against the original.
# `enc_4b` is reused by the single-tree inspection cell further down.
enc_4b = D_Brk4BitsEncoding(separator="_")
lin_tree = enc_4b.encode(sample_tree)
dec_tree = enc_4b.decode(lin_tree)
# A score of 1.0 is what the all-trees check below treats as a clean round trip.
print("LAS =",dec_tree.las_score(sample_tree))

LAS = 1.0


### Inspect a single tree, then test all trees

In [3]:
# Inspect one specific tree end to end: original tree, its linearized labels
# (framed by two rule lines), the decoded tree, and the resulting score.
# Relies on `enc_4b` created in the previous encoding cell.
tree = trees[1816]
D_Tree.short_print(tree)

lin_tree = enc_4b.encode(tree)
print(
    "=====================================",
    lin_tree,
    "=====================================",
    sep="\n",
)

dec_tree = enc_4b.decode(lin_tree)
D_Tree.short_print(dec_tree)
print("LAS =", dec_tree.las_score(tree))

0 		 -ROOT- 		 0 		 -NOREL-
1 		 You 		 3 		 nsubj
2 		 'll 		 3 		 aux
3 		 have 		 0 		 root
4 		 to 		 5 		 mark
5 		 drive 		 3 		 xcomp
6 		 10 		 7 		 nummod
7 		 miles 		 9 		 nmod:npmod
8 		 down 		 9 		 case
9 		 75 		 5 		 obl
10 		 to 		 11 		 case
11 		 Allen 		 5 		 obl
12 		 . 		 3 		 punct
-BOS-	-BOS-	-BOS-
-ROOT-	-ROOT-	/_-NOREL-
You	PRON	<*_nsubj
'll	AUX	<_aux
have	VERB	\>*/_root
to	PART	<*_mark
drive	VERB	\>/_xcomp
10	NUM	<*_nummod
miles	NOUN	\<*_nmod:npmod
down	ADP	<_case
75	NUM	\>_obl
to	ADP	<*_case
Allen	PROPN	\>*_obl
.	PUNCT	>*_punct
-EOS-	-EOS-	-EOS-

1 		 You 		 3 		 nsubj
2 		 'll 		 3 		 aux
3 		 have 		 0 		 root
4 		 to 		 5 		 mark
5 		 drive 		 3 		 xcomp
6 		 10 		 7 		 nummod
7 		 miles 		 9 		 nmod:npmod
8 		 down 		 9 		 case
9 		 75 		 5 		 obl
10 		 to 		 11 		 case
11 		 Allen 		 5 		 obl
12 		 . 		 3 		 punct
LAS = 1.0


In [5]:
# Round-trip every loaded tree through the 4-bits encoding and report each
# tree whose decoded form does not score exactly 1 against the original.
# The encoder is re-created here so the cell does not depend on the earlier
# encoding cell having been run.
# NOTE: the loop reuses the name `sample_tree`, so after this cell it refers
# to the last tree in `trees`, not to trees[0].
enc_4b = D_Brk4BitsEncoding(separator="_")
n_failed = 0
for i, sample_tree in enumerate(trees):
    lin_tree = enc_4b.encode(sample_tree)
    dec_tree = enc_4b.decode(lin_tree)
    las = dec_tree.las_score(sample_tree)

    if las != 1:
        n_failed += 1
        print("Error at tree",i,"length",len(sample_tree))
        # Reuse the score computed above instead of scoring the tree again.
        print("LAS =",las)

# Always print a summary so a fully passing run is distinguishable from a
# cell that produced no output because it never ran.
print("Round-trip check:", len(trees) - n_failed, "of", len(trees), "trees decoded with LAS = 1")