In [1]:
from nltk.tree import Tree

In [2]:
tree_string = '(S (NP (DT The) (NNS owls)) (VP (VBP are) (RB not) (SBAR (WHNP (WP what)) (S (NP (PRP they)) (VP (VBP seem))))) (PUNCT .))'
tree = Tree.fromstring(tree_string)
tree.pretty_print()

                  S                          
      ____________|_______________________    
     |                VP                  |  
     |         _______|____               |   
     |        |   |       SBAR            |  
     |        |   |    ____|____          |   
     |        |   |   |         S         |  
     |        |   |   |     ____|___      |   
     NP       |   |  WHNP  NP       VP    |  
  ___|___     |   |   |    |        |     |   
 DT     NNS  VBP  RB  WP  PRP      VBP  PUNCT
 |       |    |   |   |    |        |     |   
The     owls are not what they     seem   .  



Naive absolute encodes the number of commons between words w<sub>i</sub> and w<sub>i+1</sub>. This number is in the label associated to w<sub>i</sub>.

In [3]:
from src.encs.enc_const.naive_absolute import C_NaiveAbsoluteEncoding
from src.models.const_tree import ConstituentTree

encoder = C_NaiveAbsoluteEncoding(separator="_", unary_joiner="+")
constituent_tree = ConstituentTree.from_string(tree_string)
w, p, l, f = encoder.encode(constituent_tree)

for i in range(len(w)):
    n, lc = l[i].n_commons, l[i].last_common
    print("Word '{}' has {} common ancestors with his next word. The last common ancestor is {}".format(w[i], n, lc))


Word 'The' has 2 common ancestors with his next word. The last common ancestor is NP
Word 'owls' has 1 common ancestors with his next word. The last common ancestor is S
Word 'are' has 2 common ancestors with his next word. The last common ancestor is VP
Word 'not' has 2 common ancestors with his next word. The last common ancestor is VP
Word 'what' has 3 common ancestors with his next word. The last common ancestor is SBAR
Word 'they' has 4 common ancestors with his next word. The last common ancestor is S
Word 'seem' has 1 common ancestors with his next word. The last common ancestor is S
Word '.' has 1 common ancestors with his next word. The last common ancestor is S


We want to encode in the word w<sub>i</sub> the number of commons between word w<sub>i-1</sub> and w<sub>i</sub>.

```
Word 'The' has 1 common ancestors with his previous word. The last common ancestor is S.
Word 'owls' has 2 common ancestors with his previous word. The last common ancestor is NP.
Word 'are' has 1 common ancestors with his previous word. The last common ancestor is S.
Word 'not' has 2 common ancestors with his previous word. The last common ancestor is VP.
Word 'what' has 2 common ancestors with his previous word. The last common ancestor is VP.
Word 'they' has 3 common ancestors with his previous word. The last common ancestor is SBAR.
Word 'seem' has 4 common ancestors with his previous word. The last common ancestor is S.
Word '.' has 1 common ancestors with his previous word. The last common ancestor is S.
(...)
```


In [5]:
from src.encs.abstract_encoding import ACEncoding
from src.utils.constants import C_ABSOLUTE_ENCODING, C_ROOT_LABEL, C_CONFLICT_SEPARATOR, C_NONE_LABEL, C_DUMMY_START
from src.models.const_label import ConstituentLabel
from src.models.const_tree import ConstituentTree

import re

class C_NaiveIncremental(ACEncoding):
    def __init__(self, separator, unary_joiner):
        self.separator = separator
        self.unary_joiner = unary_joiner

    def get_unary_chain(self, postag):
        unary_chain = None
        leaf_unary_chain = postag.split(self.unary_joiner)

        if len(leaf_unary_chain)>1:
            unary_list = []
            for element in leaf_unary_chain[:-1]:
                unary_list.append(element.split("##")[0])

            unary_chain = self.unary_joiner.join(unary_list)
            postag = leaf_unary_chain[len(leaf_unary_chain)-1]
        
        return unary_chain, postag
    
    def get_features(self, node, feature_marker="##", feature_splitter="|"):
        postag_split = node.split(feature_marker)
        feats = None

        if len(postag_split) > 1:
            postag = re.sub(r'[0-9]+', '', postag_split[0])
            feats = postag_split[1].split(feature_splitter)
        else:
            postag = re.sub(r'[0-9]+', '', node)
        return postag, feats
    
    def clean_last_common(self, node, feature_marker="##"):
        node = re.sub(r'[0-9]+', '', node)
        last_common = node.split(feature_marker)[0]
        return last_common

    def encode(self, constituent_tree):
        leaf_paths = constituent_tree.path_to_leaves(collapse_unary=True, unary_joiner=self.unary_joiner, dummy=C_DUMMY_START)
        #print(leaf_paths)
        labels=[]
        words=[]
        postags=[]
        additional_feats=[]

        # reverse the paths
        leaf_paths.reverse()

        for i in range(1, len(leaf_paths)):
            path_a = leaf_paths[i-1]
            path_b = leaf_paths[i]
            
            last_common = ""
            n_commons   = 0

            # reverse the paths
            print("comparing {} with {}".format(path_a, path_b))
            for a,b in zip(path_a, path_b):
                if (a!=b):
                    # Remove the digits and aditional feats in the last common node
                    last_common = self.clean_last_common(last_common)

                    # Get word and POS tag
                    word   = path_a[-1]
                    postag = path_a[-2]
                    
                    # Build the Leaf Unary Chain
                    unary_chain, postag = self.get_unary_chain(postag)
                    
                    # Clean the POS Tag and extract additional features
                    postag, feats = self.get_features(postag)

                    # Append the data
                    labels.append(ConstituentLabel(n_commons+1, last_common, unary_chain, C_ABSOLUTE_ENCODING, self.separator, self.unary_joiner))
                    words.append(word)
                    postags.append(postag)
                    additional_feats.append(feats)

                    break
                
                # Store Last Common and increase n_commons 
                # Note: When increasing n_commons use the number from split the collapsed chains
                n_commons  += len(a.split(self.unary_joiner))
                last_common = a
        
        return words, postags, labels, additional_feats

An approach to this could be to get the path to leaves and reverse them, efectivelly using the same algorithm as naive absolute / naive relative but now w<sub>i-1</sub> will be w<sub>i+1</sub>

In [6]:
constituent_tree = ConstituentTree.from_string(tree_string)
path_to_leaves = constituent_tree.path_to_leaves(dummy=C_DUMMY_START)
for p in path_to_leaves:
    print(p)


['S0', '-START-']
['S1', 'NP1', 'DT1', 'The']
['S1', 'NP2', 'NNS2', 'owls']
['S2', 'VP2', 'VBP2', 'are']
['S2', 'VP3', 'RB3', 'not']
['S2', 'VP4', 'SBAR4', 'WHNP+WP4', 'what']
['S2', 'VP4', 'SBAR5', 'S5', 'NP+PRP5', 'they']
['S2', 'VP4', 'SBAR5', 'S6', 'VP+VBP6', 'seem']
['S3', 'PUNCT3', '.']


In [7]:
incr_enc = C_NaiveIncremental(separator="_", unary_joiner="+")
constituent_tree = ConstituentTree.from_string(tree_string)
w, p, l, f = incr_enc.encode(constituent_tree)

# reverse the results
w.reverse()
p.reverse()
l.reverse()
f.reverse()

for i in range(len(w)):
    n, lc = l[i].n_commons, l[i].last_common
    print("Word '{}' has {} common ancestors with his previous word. The last common ancestor is {}".format(w[i], n, lc))

comparing ['S3', 'PUNCT3', '.'] with ['S2', 'VP4', 'SBAR5', 'S6', 'VP+VBP6', 'seem']
comparing ['S2', 'VP4', 'SBAR5', 'S6', 'VP+VBP6', 'seem'] with ['S2', 'VP4', 'SBAR5', 'S5', 'NP+PRP5', 'they']
comparing ['S2', 'VP4', 'SBAR5', 'S5', 'NP+PRP5', 'they'] with ['S2', 'VP4', 'SBAR4', 'WHNP+WP4', 'what']
comparing ['S2', 'VP4', 'SBAR4', 'WHNP+WP4', 'what'] with ['S2', 'VP3', 'RB3', 'not']
comparing ['S2', 'VP3', 'RB3', 'not'] with ['S2', 'VP2', 'VBP2', 'are']
comparing ['S2', 'VP2', 'VBP2', 'are'] with ['S1', 'NP2', 'NNS2', 'owls']
comparing ['S1', 'NP2', 'NNS2', 'owls'] with ['S1', 'NP1', 'DT1', 'The']
comparing ['S1', 'NP1', 'DT1', 'The'] with ['S0', '-START-']
Word 'The' has 1 common ancestors with his previous word. The last common ancestor is 
Word 'owls' has 2 common ancestors with his previous word. The last common ancestor is S
Word 'are' has 1 common ancestors with his previous word. The last common ancestor is 
Word 'not' has 2 common ancestors with his previous word. The last co