In [3]:
from nltk.tree import Tree

In [4]:
# Running example: bracketed constituency parse of
# "The owls are not what they seem ."
# Adjacent string literals are joined by Python into one single-line string.
tree_string = (
    '(S '
    '(NP (DT The) (NNS owls)) '
    '(VP (VBP are) (RB not) '
    '(SBAR (WHNP (WP what)) (S (NP (PRP they)) (VP (VBP seem))))) '
    '(PUNCT .))'
)

# Parse the bracketed string with NLTK and draw it as ASCII art.
tree = Tree.fromstring(tree_string)
tree.pretty_print()

                  S                          
      ____________|_______________________    
     |                VP                  |  
     |         _______|____               |   
     |        |   |       SBAR            |  
     |        |   |    ____|____          |   
     |        |   |   |         S         |  
     |        |   |   |     ____|___      |   
     NP       |   |  WHNP  NP       VP    |  
  ___|___     |   |   |    |        |     |   
 DT     NNS  VBP  RB  WP  PRP      VBP  PUNCT
 |       |    |   |   |    |        |     |   
The     owls are not what they     seem   .  



The naive absolute encoding stores, for each word w<sub>i</sub>, the number of ancestors it has in common with the next word w<sub>i+1</sub>. This number is part of the label associated with w<sub>i</sub>.

In [5]:
from src.encs.enc_const.naive_absolute import C_NaiveAbsoluteEncoding
from src.models.const_tree import ConstituentTree

# Encode the example tree with the naive absolute encoding, then report the
# two values read from each word's label: n_commons and last_common.
encoder = C_NaiveAbsoluteEncoding(separator="_", unary_joiner="+")
constituent_tree = ConstituentTree.from_string(tree_string)
w, p, l, f = encoder.encode(constituent_tree)

report = "Word '{}' has {} common ancestors with his next word. The last common ancestor is {}"
for i, word in enumerate(w):
    # Labels are looked up by position, one label per word.
    label = l[i]
    print(report.format(word, label.n_commons, label.last_common))


Word 'The' has 2 common ancestors with his next word. The last common ancestor is NP
Word 'owls' has 1 common ancestors with his next word. The last common ancestor is S
Word 'are' has 2 common ancestors with his next word. The last common ancestor is VP
Word 'not' has 2 common ancestors with his next word. The last common ancestor is VP
Word 'what' has 3 common ancestors with his next word. The last common ancestor is SBAR
Word 'they' has 4 common ancestors with his next word. The last common ancestor is S
Word 'seem' has 1 common ancestors with his next word. The last common ancestor is S
Word '.' has 1 common ancestors with his next word. The last common ancestor is S


We want the label of word w<sub>i</sub> to encode the number of common ancestors between the previous word w<sub>i-1</sub> and w<sub>i</sub>.

```
Word 'The' has 1 common ancestors with his previous word. The last common ancestor is S.
Word 'owls' has 2 common ancestors with his previous word. The last common ancestor is NP.
Word 'are' has 1 common ancestors with his previous word. The last common ancestor is S.
Word 'not' has 2 common ancestors with his previous word. The last common ancestor is VP.
Word 'what' has 2 common ancestors with his previous word. The last common ancestor is VP.
Word 'they' has 3 common ancestors with his previous word. The last common ancestor is SBAR.
Word 'seem' has 4 common ancestors with his previous word. The last common ancestor is S.
Word '.' has 1 common ancestors with his previous word. The last common ancestor is S.
(...)
```


Since the whole linearized tree is available during decoding, we could alternatively reverse the order of its rows and decode backwards.

In [8]:
from src.utils.constants import C_STRAT_MAX
from src.encs.enc_const.naive_incremental import C_NaiveIncrementalEncoding
tree_string = '(S (NP (DT The) (NNS owls)) (VP (VBP are) (RB not) (SBAR (WHNP (WP what)) (S (NP (PRP they)) (VP (VBP seem))))) (PUNCT .))'
tree = Tree.fromstring(tree_string)

print("\n>> Original Tree")
tree.pretty_print()

incr_enc = C_NaiveIncrementalEncoding(separator="_", unary_joiner="+")
constituent_tree = ConstituentTree.from_string(tree_string)

w, p, l, f = incr_enc.encode(constituent_tree)
linearized_tree = [(wi, pi, li) for wi, pi, li, fi in zip(w, p, l, f)]

print("\n>> Linearized Tree\n")
for line in linearized_tree:
    w, p, l = line
    nc, lc = l.n_commons, l.last_common
    print("Word '{}' has {} common ancestors with his next word. The last common ancestor is {}".format(w, nc, lc))
decoded_tree = incr_enc.decode(linearized_tree)
decoded_tree.postprocess_tree(conflict_strat=C_STRAT_MAX, clean_nulls=True)
tree = Tree.fromstring(str(decoded_tree))

print("\n>> Decoded Tree")
tree.pretty_print()


>> Original Tree
                  S                          
      ____________|_______________________    
     |                VP                  |  
     |         _______|____               |   
     |        |   |       SBAR            |  
     |        |   |    ____|____          |   
     |        |   |   |         S         |  
     |        |   |   |     ____|___      |   
     NP       |   |  WHNP  NP       VP    |  
  ___|___     |   |   |    |        |     |   
 DT     NNS  VBP  RB  WP  PRP      VBP  PUNCT
 |       |    |   |   |    |        |     |   
The     owls are not what they     seem   .  


>> Linearized Tree



AttributeError: 'ConstituentLabel' object has no attribute 'split'