In [1]:
from codelin.models.const_tree import C_Tree
from codelin.encs.constituent import *
from codelin.utils.constants import *
from codelin.models.linearized_tree import LinearizedTree
from codelin.models.const_label import C_Label
from nltk.tree import Tree

  from .autonotebook import tqdm as notebook_tqdm


## Development

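Example tree in one-line bracket notation, followed by the same tree expanded one node per line:

```
(0 (A a) (1 (2 (3 (B b) (4 (C c) (5 (6 (D d) (E e)) (F f)))) (G g)) (H h)))
```

```
(0
    (A a)
    (1
        (2
            (3
                (B b)
                (4
                    (C c)
                    (5
                        (6
                            (D d)
                            (E e)
                        )
                        (F f)
                    )
                )
            )
            (G g)
        )
        (H h)
    )
)
```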

In [2]:
# Sample bracketed constituent tree used to eyeball the preprocessing steps.
tree_string = "(S (CC But) (NP (PRP we)) (VP (MD can) (VP (VB think) (PP (IN of) (NP (NP (JJ many) (NNS reasons)) (SBAR (S (VP (TO to) (VP (VB stay) (ADJP (RP out)) (PP (IN for) (UCP (NP (DT the) (JJ foreseeable) (NN future)) (CC and) (ADVP (RB well) (IN beyond)))))))))))) (. .))"
tree = C_Tree.from_string(tree_string)
# Round-trip through nltk only for its ASCII-art rendering of the original tree.
Tree.fromstring(str(tree)).pretty_print()
# Same two preprocessing steps that encode() applies, in the same order
# (collapse unary chains first, then right-binarize).
# NOTE(review): `tree` is rebound here, so re-running this cell's tail alone
# would start from the already-collapsed tree.
tree = tree.collapse_unary()
tree = C_Tree.to_binary_right(tree)
# Render the collapsed + binarized tree for comparison with the first drawing.
Tree.fromstring(str(tree)).pretty_print()

                                            S                                                           
  __________________________________________|_________________________________________________________   
 |   |        VP                                                                                      | 
 |   |    ____|____                                                                                   |  
 |   |   |         VP                                                                                 | 
 |   |   |     ____|___                                                                               |  
 |   |   |    |        PP                                                                             | 
 |   |   |    |     ___|__________                                                                    |  
 |   |   |    |    |              NP                                                                  | 
 |   |   |    |    |         _____|_________       

In [3]:
import re
import copy

def get_features(node, feature_marker="##", feature_splitter="|"):
    '''
    Splits a part-of-speech label into its tag and its optional features.

    The text before the first 'feature_marker' is the tag; every run of
    digits is stripped from it. The text between the first and the second
    'feature_marker' (or the end of the string) is split on
    'feature_splitter' to obtain the feature list.

    Returns a tuple (postag, feats) where feats is None when 'node'
    does not contain 'feature_marker'.
    '''
    pieces = node.split(feature_marker)
    has_feats = len(pieces) > 1

    # Without a marker the whole label is the tag.
    raw_tag = pieces[0] if has_feats else node
    feats = pieces[1].split(feature_splitter) if has_feats else None

    return re.sub(r'[0-9]+', '', raw_tag), feats

def combine(tree, new_child):
    '''
    Fills the open slot of 'tree' with 'new_child' and returns 'tree'.

    The open slot is searched by walking down through r_child() until a
    node reporting has_none_child() is found; on that node the first of
    its two leading children whose label equals C_NONE_LABEL is replaced.
    A 'new_child' given as a plain string is wrapped in a C_Tree first.
    The tree is modified in place.
    '''
    # trees should have only 2 child nodes
    if type(new_child) is str:
        new_child = C_Tree(new_child)

    # Descend along the right spine until a node with an empty slot shows up.
    node = tree
    while True:
        if node.has_none_child():
            break
        node = node.r_child()

    # Left slot has priority over the right one; at most one slot is filled.
    for slot in (0, 1):
        if node.children[slot].label == C_NONE_LABEL:
            node.children[slot] = new_child
            break

    return tree

def build_unary_chain(word, postag, unary_chain, unary_joiner):
    '''
    Builds the subtree that sits on top of a single word.

    The innermost part is always (postag word). When 'unary_chain' is
    non-empty it is split on 'unary_joiner' and each piece becomes one
    more single-child node wrapped around the subtree, so the first
    piece of the chain ends up as the outermost node.
    '''
    # Wrapping happens inside-out, hence the reversed order of the pieces.
    wrappers = unary_chain.split(unary_joiner)[::-1] if unary_chain else []

    subtree = C_Tree(postag, C_Tree(word))
    for wrapper_label in wrappers:
        subtree = C_Tree(wrapper_label, subtree)
    return subtree

def encode(constituent_tree):
    '''
    Linearizes a constituent tree into one label per word.

    The tree is unary-collapsed, right-binarized and traversed with
    C_Tree.inorder. Every terminal contributes a lowercase tag ("l" when
    its anchor node is a right child, "r" when it is a left child or the
    root) and every other visited node that is neither a unary chain nor
    a preterminal contributes an uppercase tag ("L" / "R", same rule).
    Consecutive tags are then concatenated in pairs and each pair is
    packed, together with a non-terminal label and the unary chain seen
    just before the word, into a C_Label.

    Returns a LinearizedTree built from the words, the part-of-speech
    tags (digits removed by get_features), the features and the labels.

    Side effect: prints a separator line and the input tree.
    '''
    # Debug trace of the tree being encoded.
    print("-----------------")
    print(constituent_tree)
    nodes = []
    labels = []
    words = []
    postags = []
    unary_chains = []
    non_terminals = []
    features = []
    # It is needed to collapse unary before binary
    constituent_tree = constituent_tree.collapse_unary()
    constituent_tree = C_Tree.to_binary_right(constituent_tree)
    # Collect every node in the order C_Tree.inorder visits them.
    C_Tree.inorder(constituent_tree,  lambda x: nodes.append(x))
    # Extract info from the tree
    # last_uc / last_pos carry the most recent unary-chain and preterminal
    # labels forward until the next terminal consumes and resets them.
    last_uc  = ""
    last_pos = ""
    for n in nodes:
        label_string = ""
        # Unary-chain and preterminal nodes emit no tag of their own: they are
        # only remembered and the loop moves on without appending to 'labels'.
        if n.is_unary_chain():
            last_uc = n.label
            continue

        if n.is_preterminal():
            last_pos = n.label
            continue

        if n.is_terminal():
            # 'pn' is the node whose left/right position decides the tag:
            # start at the word and climb over its preterminal parent...
            # get the parent if the parent is a pos tag
            if n.parent is not None and n.parent.is_preterminal():
                pn = n.parent
            else:
                pn = n

            # ...and then over a unary-chain parent, when there is one.
            # get the parent if the parent is a unary chain
            if pn.parent is not None and pn.parent.is_unary_chain():
                pn = pn.parent
            else:
                pn = pn

            # check if it is a right or left child
            # Note the letters: a right child is tagged "l", a left child
            # (or a parentless node) is tagged "r".
            if pn.is_right_child():
                label_string+="l"
            elif pn.is_left_child() or pn.parent is None:
                label_string+="r"
            
            # Record everything known about this word, then clear the
            # carried-over labels so they do not leak to the next word.
            unary_chains.append(last_uc)
            postag, feats = get_features(last_pos)
            postags.append(postag)
            words.append(n.label)
            features.append(feats)
            
            last_pos = ""
            last_uc  = ""
        
        else:
            # Any remaining node is treated as an internal (non-terminal) node:
            # right child -> "L", left child or root -> "R".
            if n.is_right_child():
                label_string+="L"
            elif n.is_left_child() or n.parent is None:
                label_string+="R"
            non_terminals.append(n.label)
        labels.append(label_string)
    
    # Merge labels in tuples
    # labels[0]+labels[1], labels[2]+labels[3], ...; an unpaired final tag
    # is kept on its own.
    # NOTE(review): this presumably relies on the traversal alternating
    # terminal / non-terminal tags - confirm against C_Tree.inorder.
    labels_merged = []
    for i in range(0, len(labels), 2):
        if i == len(labels)-1:
            labels_merged.append(labels[i])
        else:
            labels_merged.append(labels[i]+labels[i+1])
    
    # Add a final non terminal if needed
    # At most one placeholder is appended; it uses the literal "NONE".
    # NOTE(review): combine() compares against C_NONE_LABEL instead - check
    # whether the two are meant to be the same value.
    if len(non_terminals)<len(words):
        non_terminals.append("NONE")
    
    # Create the labels and linearized tree
    # One C_Label per word: merged tag pair, non-terminal label, unary chain,
    # then C_TETRA_ENCODING, "_" and "+".
    # NOTE(review): "_" and "+" look like the separator and the unary joiner
    # (decode() rebuilds chains with "+") - confirm against C_Label.
    c_labels = []
    for i in range(len(words)):
        l_i = C_Label(labels_merged[i], non_terminals[i], unary_chains[i], C_TETRA_ENCODING, "_", "+")
        c_labels.append(l_i)
    lin_tree = LinearizedTree(words, postags, features, c_labels, None)
    return lin_tree


def decode(lc):
    '''
    Rebuilds a (binarized) tree from the linearized tree 'lc'.

    Rows of 'lc' are consumed in order. For each row the first character
    of label.n_commons handles the word itself and, unless every word has
    been consumed already, the second character handles an internal node
    labelled label.last_common:

      "r"  push the word subtree on the stack
      "l"  plug the word subtree into the open slot of the stack top
      "R"  wrap the stack top as left child of a new node with an empty
           right slot
      "L"  pop the stack top, wrap it the same way and plug the result
           into the open slot of the new stack top

    Returns the tree at the bottom of the stack.
    '''
    pending = []
    remaining_words = copy.deepcopy(lc.words)

    for _word, postag, _feats, label in lc.iterrows():
        actions, node_label, chain = label.n_commons, label.last_common, label.unary_chain

        # --- leaf action -------------------------------------------------
        leaf_action = actions[0]
        if leaf_action == "r":
            leaf_tree = build_unary_chain(remaining_words.pop(0), postag, chain, "+")
            pending.append(leaf_tree)
        elif leaf_action == "l":
            leaf_tree = build_unary_chain(remaining_words.pop(0), postag, chain, "+")
            pending[-1] = combine(pending[-1], leaf_tree)

        # Once the last word is placed there is no internal node left to read.
        if not remaining_words:
            break

        # --- internal node action ----------------------------------------
        node_action = actions[1]
        if node_action == "R":
            pending[-1] = C_Tree(node_label, [pending[-1], C_Tree.empty_tree()])
        elif node_action == "L":
            finished = pending.pop()
            wrapped = C_Tree(node_label, [finished, C_Tree.empty_tree()])
            pending[-1] = combine(pending[-1], wrapped)

    return pending[0]


In [4]:
    
# Round-trip check for the notebook-local encode()/decode() pair.
# Alternative inputs kept for manual testing; uncomment one to replace the
# active tree_string below.
# tree_string = "(S (CC But) (SBAR (IN while) (S (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)) (VP (VBD did) (RB n't) (VP (VB fall) (ADVP (RB apart)) (NP (NNP Friday)) (SBAR (IN as) (S (NP (DT the) (NNP Dow) (NNP Jones) (NNP Industrial) (NNP Average)) (VP (VBD plunged) (NP (NP (CD 190.58) (NNS points)) (PRN (: --) (NP (NP (JJS most)) (PP (IN of) (NP (PRP it))) (PP (IN in) (NP (DT the) (JJ final) (NN hour)))) (: --)))))))))) (NP (PRP it)) (ADVP (RB barely)) (VP (VBD managed) (S (VP (TO to) (VP (VB stay) (NP (NP (DT this) (NN side)) (PP (IN of) (NP (NN chaos)))))))) (. .))"
# tree_string = "(S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .))"
# tree_string = "(S (S (NP (JJ Big) (NN investment) (NNS banks)) (VP (VBD refused) (S (VP (TO to) (VP (VB step) (ADVP (RB up) (PP (TO to) (NP (DT the) (NN plate)))) (S (VP (TO to) (VP (VB support) (NP (DT the) (JJ beleaguered) (NN floor) (NNS traders)) (PP (IN by) (S (VP (VBG buying) (NP (NP (JJ big) (NNS blocks)) (PP (IN of) (NP (NN stock))))))))))))))) (, ,) (NP (NNS traders)) (VP (VBP say)) (. .))"
# tree_string = "(S (NP (NP (NNP Seven) (NNP Big) (NNP Board) (NNS stocks)) (: --) (NP (NP (NNP UAL)) (, ,) (NP (NNP AMR)) (, ,) (NP (NNP BankAmerica)) (, ,) (NP (NNP Walt) (NNP Disney)) (, ,) (NP (NNP Capital) (NNP Cities\/ABC)) (, ,) (NP (NNP Philip) (NNP Morris)) (CC and) (NP (NNP Pacific) (NNP Telesis) (NNP Group))) (: --)) (VP (VP (VBD stopped) (S (VP (NN trading)))) (CC and) (VP (ADVP (RB never)) (VBD resumed))) (. .))"
# tree_string = "(NE USA)"
tree_string = "(S (CC But) (NP (PRP we)) (VP (MD can) (VP (VB think) (PP (IN of) (NP (NP (JJ many) (NNS reasons)) (SBAR (S (VP (TO to) (VP (VB stay) (ADJP (RP out)) (PP (IN for) (UCP (NP (DT the) (JJ foreseeable) (NN future)) (CC and) (ADVP (RB well) (IN beyond)))))))))))) (. .))"
constituent_tree = C_Tree.from_string(tree_string)
# Draw the gold tree before encoding.
Tree.fromstring(str(constituent_tree)).pretty_print()
# encode() collapses unary chains and binarizes internally; decode() returns
# the tree still in that binarized, collapsed form.
lt = encode(constituent_tree)
tree = decode(lt)
# Undo the two preprocessing steps in reverse order: binarization first,
# then unary collapsing ("+" is the joiner used when rebuilding chains in decode()).
tree = C_Tree.restore_from_binary(tree)
tree = tree.uncollapse_unary("+")

# Draw the reconstructed tree for a visual comparison with the gold tree.
nltk_t = Tree.fromstring(str(tree))
nltk_t.pretty_print()

# True when the reconstructed tree serializes back to the exact input string.
print(str(tree)==tree_string)


                                            S                                                           
  __________________________________________|_________________________________________________________   
 |   |        VP                                                                                      | 
 |   |    ____|____                                                                                   |  
 |   |   |         VP                                                                                 | 
 |   |   |     ____|___                                                                               |  
 |   |   |    |        PP                                                                             | 
 |   |   |    |     ___|__________                                                                    |  
 |   |   |    |    |              NP                                                                  | 
 |   |   |    |    |         _____|_________       

## Testing from file

In [7]:
# Round-trip check for the library encoder (C_Tetratag) on a single tree.
tetraencoder = C_Tetratag(separator="_", unary_joiner="+", reverse=False)
# Alternative inputs kept for manual testing; uncomment one to replace the
# active tree_string below.
# tree_string = "(S (S (NP (JJ Big) (NN investment) (NNS banks)) (VP (VBD refused) (S (VP (TO to) (VP (VB step) (ADVP (RB up) (PP (TO to) (NP (DT the) (NN plate)))) (S (VP (TO to) (VP (VB support) (NP (DT the) (JJ beleaguered) (NN floor) (NNS traders)) (PP (IN by) (S (VP (VBG buying) (NP (NP (JJ big) (NNS blocks)) (PP (IN of) (NP (NN stock))))))))))))))) (, ,) (NP (NNS traders)) (VP (VBP say)) (. .))"
# tree_string = "(SINV (S (ADVP (RB Once) (RB again)) (-LRB- -LCB-) (NP (DT the) (NNS specialists)) (-RRB- -RCB-) (VP (VBD were) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB handle) (NP (NP (DT the) (NNS imbalances)) (PP (IN on) (NP (NP (DT the) (NN floor)) (PP (IN of) (NP (DT the) (NNP New) (NNP York) (NNP Stock) (NNP Exchange)))))))))))) (, ,) ('' '') (VP (VBD said)) (NP (NP (NNP Christopher) (NNP Pedersen)) (, ,) (NP (NP (JJ senior) (NN vice) (NN president)) (PP (IN at) (NP (NNP Twenty-First) (NNP Securities) (NNP Corp))))) (. .))"
# tree_string = "(S (NP (NP (NNP Seven) (NNP Big) (NNP Board) (NNS stocks)) (: --) (NP (NP (NNP UAL)) (, ,) (NP (NNP AMR)) (, ,) (NP (NNP BankAmerica)) (, ,) (NP (NNP Walt) (NNP Disney)) (, ,) (NP (NNP Capital) (NNP Cities\/ABC)) (, ,) (NP (NNP Philip) (NNP Morris)) (CC and) (NP (NNP Pacific) (NNP Telesis) (NNP Group))) (: --)) (VP (VP (VBD stopped) (S (VP (NN trading)))) (CC and) (VP (ADVP (RB never)) (VBD resumed))) (. .))"
# tree_string = "(S (S (NP (JJ Big) (NN investment) (NNS banks)) (VP (VBD refused) (S (VP (TO to) (VP (VB step) (ADVP (RB up) (PP (TO to) (NP (DT the) (NN plate)))) (S (VP (TO to) (VP (VB support) (NP (DT the) (JJ beleaguered) (NN floor) (NNS traders)) (PP (IN by) (S (VP (VBG buying) (NP (NP (JJ big) (NNS blocks)) (PP (IN of) (NP (NN stock))))))))))))))) (, ,) (NP (NNS traders)) (VP (VBP say)) (. .))"
# tree_string = "(FRAG (SBAR (IN As) (IN in) (: :) (`` ``) (SQ (NP (PRP You)) (VP (VBD went) (NP (VBG ballooning))) (. ?) (. ?) (. !) (. !))))"

# Known problem case: the escaped asterisk in the final (X (NN \*)) leaf.
# The backslash is written as "\\" because "\*" is not a valid Python escape
# sequence (it only works by accident and triggers a warning); the resulting
# string is unchanged: a literal backslash followed by "*".
tree_string = "(NP (NP (NN Year)) (VP (VBN ended) (NP (NNP Dec.) (CD 31) (, ,) (CD 1988))) (X (NN \\*)))"

gold_tree = C_Tree.from_string(tree_string)
# First drawing: right-binarized without collapsing unary chains.
# NOTE(review): if to_binary_right modifies its argument in place, gold_tree
# is no longer the pristine input for the steps below - confirm.
bin_tree = C_Tree.to_binary_right(gold_tree)
Tree.fromstring(str(bin_tree)).pretty_print()
# Second drawing: unary chains collapsed first, then right-binarized
# (the order used by the notebook-local encode()).
bin_tree = gold_tree.collapse_unary()
bin_tree = C_Tree.to_binary_right(bin_tree)
Tree.fromstring(str(bin_tree)).pretty_print()


# Encode with the library implementation and show the linearized labels.
lin_tree = tetraencoder.encode(gold_tree)
print(lin_tree)
print("====================================")
# Decode back and compare against the gold tree, textually and visually.
decoded_tree = tetraencoder.decode(lin_tree)
print("[G]", gold_tree)
print("[D]", decoded_tree)
Tree.fromstring(str(decoded_tree)).pretty_print()
# True when the decoded tree serializes identically to the gold tree.
print(str(decoded_tree) == str(gold_tree))

                    FRAG                               
                     |                                  
                    SBAR                               
  ___________________|___________________               
 |   |   |   |                           SQ            
 |   |   |   |    _______________________|___________   
 |   |   |   |   |        VP             |   |   |   | 
 |   |   |   |   |    ____|______        |   |   |   |  
 |   |   |   |   NP  |           NP      |   |   |   | 
 |   |   |   |   |   |           |       |   |   |   |  
 IN  IN  :   `` PRP VBD         VBG      .   .   .   . 
 |   |   |   |   |   |           |       |   |   |   |  
 As  in  :   `` You went     ballooning  ?   ?   !   ! 

    FRAG+SBAR                                                                                      
  ______|_________                                                                                  
 |            FRAG+SBAR*                                        

In [6]:
# Decode the most recent `lin_tree` again and print its bracketed form.
# NOTE(review): this cell relies on `tetraencoder` and `lin_tree` left in the
# kernel by the cell above, and its execution count (6) is lower than that
# cell's (7), so the saved output may belong to a different tree_string.
# Re-run top to bottom before trusting it.
dec_tree = tetraencoder.decode(lin_tree)
print(dec_tree)

(FRAG (SBAR (IN As) (IN in) (: :) (`` ``) (SQ (NP (PRP You)) (VP (VBD went) (NP (VBG ballooning))) (. ?) (. ?) (. !) (. !))))
