In [1]:
from stanza.server import CoreNLPClient

In [2]:
class ParseTreeNode:
    """A node of a parse tree with links to both its children and its parent.

    Attributes set at construction:
        value: the payload passed to the constructor.
        children: list of child nodes, in insertion order (starts empty).
        height: starts at 0; this class never updates it, callers maintain it.
        parent: the node this one was attached to via ``add_child``, else None.
        leaf_order_idx / leaf_list: placeholders (-1 and an empty list) that
            are not filled in by this class.
    """

    def __init__(self,val):
        """Create a detached node holding ``val``."""
        self.value = val
        self.children = list()
        self.height = 0
        self.parent = None
        # Attributes for generating S and hierarchical positional embeddings.
        self.leaf_order_idx = -1  # 0-indexed once assigned; -1 means "not set"
        self.leaf_list = list()

    def add_child(self,child):
        """Attach ``child`` as the last child of this node and point it back here.

        Note that ``height`` is left untouched by this method.
        """
        child.parent = self
        self.children.append(child)
    
def process_tree(root):
    """Recursively convert a parse tree into a tree of ``ParseTreeNode`` objects.

    ``root`` may be any object exposing ``.value`` and an iterable ``.child``
    of sub-nodes with the same shape (in this notebook it is the ``parseTree``
    taken from a CoreNLP annotation).

    Returns the converted node. Its ``height`` is 0 when it has no children,
    otherwise one more than the largest height among its children.
    """
    node = ParseTreeNode(root.value)
    for sub_node in map(process_tree, root.child):
        # add_child also sets sub_node.parent, so no separate assignment is needed.
        node.add_child(sub_node)
        candidate_height = sub_node.height + 1
        if candidate_height > node.height:
            node.height = candidate_height
    return node

In [None]:
import os
import pickle

data_base_path = '../Data/parallel_data/'
trees_dir = os.path.join(data_base_path, 'trees')
# Make sure the output directory exists before any parsing work is done.
os.makedirs(trees_dir, exist_ok=True)

with CoreNLPClient(
        annotators=['tokenize','parse'],#['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'],
        timeout=30000,
        memory='16G') as client:
    # Sorted so the processing order (and therefore which file's trees are left
    # in the global `trees` afterwards) is the same on every run.
    for file in sorted(os.listdir(data_base_path)):
        # Only names containing 'en' are parsed; names containing 'loss_fn' are excluded.
        if 'en' not in file or 'loss_fn' in file:
            continue
        in_path = os.path.join(data_base_path, file)
        # Skip sub-directories (or other non-files) whose name happens to match.
        if not os.path.isfile(in_path):
            continue
        trees = []
        print('FILE::',file)
        with open(in_path,'r') as f:
            for line in f:
                line = line.replace('_','') #else it breaks _FILE to _ and FILE
                # Only the first sentence of the annotation is kept, one tree per input line.
                # NOTE(review): a line that yields no sentences would presumably fail
                # on the [0] index below -- confirm the inputs contain no blank lines.
                line_parseTree = client.annotate(line.strip()).sentence[0].parseTree
                parseTree = process_tree(line_parseTree)
                trees.append(parseTree)
        # Open the pickle only after the whole file parsed successfully, so a failure
        # part-way through does not leave a truncated/empty pickle behind.
        # file[:-12] drops the last 12 characters of the input name
        # (presumably a fixed-length suffix -- confirm against the data files).
        out_path = os.path.join(trees_dir, file[:-12]+'tree.pickle')
        with open(out_path,'wb') as g:
            pickle.dump(trees,g)

In [4]:
def printTree(tree,tabs=1):
    """Print ``tree`` depth-first (node before its children), one node per line.

    Each node line consists of ``tabs`` tab characters followed by the node's
    ``value`` and ``height``; children are printed one tab deeper. For every
    node that has a parent, the parent's ``value`` is printed on the following
    line without indentation.
    """
    indent = '\t' * tabs
    print(indent, tree.value, tree.height)

    owner = tree.parent
    if owner is not None:
        print(owner.value)

    deeper = tabs + 1
    for subtree in tree.children:
        printTree(subtree, deeper)

# printTree(tree)

In [5]:
# Display the first tree of `trees`. `trees` is the list left behind by the parsing
# loop above, which re-creates it for every input file, so this shows the first
# line of whichever file was processed last.
printTree(trees[0])

	 ROOT 8
		 S 7
ROOT
			 VP 6
S
				 VB 1
VP
					 send 0
VB
				 NP 2
VP
					 JJ 1
NP
						 sigkill 0
JJ
					 NN 1
NP
						 signal 0
NN
				 PP 5
VP
					 IN 1
PP
						 to 0
IN
					 NP 4
PP
						 NP 2
NP
							 NN 1
NP
								 process 0
NN
							 NN 1
NP
								 id 0
NN
							 NN 1
NP
								 NUMBER 0
NN
						 VP 3
NP
							 VBG 1
VP
								 killing 0
VBG
							 NP 2
VP
								 PRP 1
NP
									 it 0
PRP
							 ADVP 2
VP
								 RB 1
ADVP
									 instantly 0
RB


In [72]:
# Re-convert and display the most recently parsed line. `line_parseTree` is the loop
# variable left over from the parsing cell, so this only works after that cell has run.
# NOTE(review): the recorded output below has no parent lines, so it appears to come
# from a different version of printTree than the one defined above -- re-run to refresh.
printTree(process_tree(line_parseTree))

	 ROOT 12
		 S 11
			 VP 10
				 VB 1
					 list 0
				 NP 2
					 DT 1
						 all 0
					 NNS 1
						 files 0
				 PP 9
					 IN 1
						 in 0
					 NP 8
						 NP 2
							 JJ 1
								 current 0
							 NN 1
								 directory 0
							 NN 1
								 tree 0
						 SBAR 7
							 WHNP 2
								 WDT 1
									 that 0
							 S 6
								 VP 5
									 VBD 1
										 were 0
									 ADVP 2
										 RB 1
											 last 0
									 VP 4
										 VBN 1
											 modified 0
										 PP 3
											 IN 1
												 between 0
											 NP 2
												 NNP 1
													 REGEX 0
												 CC 1
													 and 0
												 NNP 1
													 DATETIME 0
