## Importing dependencies

In [31]:
import json
from order import Order
import os

AttributeError: module '__main__' has no attribute '__file__'

## Loading data
Train, dev and test sets for 6 languages

In [11]:
# Language codes for which the model works.
languages = ['en', 'fr', 'es', 'pt', 'nl', 'it']
train_path, dev_path, test_path = 'data/json/train', 'data/json/dev', 'data/json/test'
trainset, devset, testset = {}, {}, {}


def _load_split(split_dir, language):
    """Parse `<split_dir>/<language>.json` and return the decoded JSON content.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes, and it is decoded explicitly as UTF-8 so that
    non-ASCII text is read the same way regardless of the platform locale.
    """
    with open(os.path.join(split_dir, language + '.json'), encoding='utf-8') as json_file:
        return json.load(json_file)


# Each split is a dict keyed by language code.
for lng in languages:
    trainset[lng] = _load_split(train_path, lng)
    devset[lng] = _load_split(dev_path, lng)
    testset[lng] = _load_split(test_path, lng)

## Format of each instance

Each instance (in each language) consists of three fields: 

**sent_id:** sentence id in the corpus

**text:** gold-standard text

**tree:** tree structure (most relevant field)

In [14]:
# Show the field names of the first English dev-set instance.
lng = 'en'
first_instance = devset[lng][0]
first_instance.keys()

dict_keys(['text', 'tree', 'sent_id'])

### Tree Structure

Nodes

In [28]:
# Tree of the first dev-set instance for the currently selected language.
tree = devset[lng][0]['tree']

# `tree['nodes']` maps each node id to a dict of that node's attributes.
print('===NODES===')
for node_id, attributes in tree['nodes'].items():
    print(node_id, attributes, '\n')

===NODES===
3 {'upos': 'VERB', 'head': '0', 'xpos': 'VBZ', 'id': '3', 'lemma': 'come', 'feats': {'Number': 'Sing', 'Tense': 'Pres', 'Mood': 'Ind', 'VerbForm': 'Fin', 'Person': '3'}, 'deps': 'root'} 

4 {'upos': 'DET', 'head': '5', 'xpos': 'DT', 'id': '4', 'lemma': 'the', 'feats': {'PronType': 'Art', 'Definite': 'Def'}, 'deps': 'det'} 

6 {'upos': 'ADP', 'head': '5', 'xpos': 'IN', 'id': '6', 'lemma': 'from', 'feats': {}, 'deps': 'case'} 

5 {'upos': 'PROPN', 'head': '3', 'xpos': 'NNP', 'id': '5', 'lemma': 'AP', 'feats': {'Number': 'Sing'}, 'deps': 'obl'} 

1 {'upos': 'NOUN', 'head': '3', 'xpos': 'NN', 'id': '1', 'lemma': 'story', 'feats': {'Number': 'Sing'}, 'deps': 'nsubj'} 

2 {'upos': 'DET', 'head': '1', 'xpos': 'DT', 'id': '2', 'lemma': 'this', 'feats': {'Number': 'Sing', 'PronType': 'Dem'}, 'deps': 'det'} 

7 {'upos': 'PUNCT', 'head': '3', 'xpos': ':', 'id': '7', 'lemma': ':', 'feats': {}, 'deps': 'punct'} 



Edges

**deps:** label of the dependency edge

**node:** id of the child node

In [29]:
# `tree['edges']` maps each head node id to the list of its outgoing edges.
print('===EDGES===')
for head, outgoing in tree['edges'].items():
    print(head, outgoing)

===EDGES===
3 [{'deps': 'nsubj', 'node': '1'}, {'deps': 'obl', 'node': '5'}, {'deps': 'punct', 'node': '7'}]
4 []
6 []
5 [{'deps': 'det', 'node': '4'}, {'deps': 'case', 'node': '6'}]
1 [{'deps': 'det', 'node': '2'}]
2 []
7 []


## Ordering

Instantiate the `Order` class, passing the paths of both maximum entropy classifiers (`clf_step1` and `clf_sort_step`).

The `pred_order_id` field is created in each node of the tree, indicating the node's position in the linearized string.

**MAKE SURE TO REMOVE PUNCTUATION NODES**

In [None]:
# Paths to the two English maximum entropy classifiers (first step, sort step).
clf_step1 = 'data/models/en_clf_step1.cPickle'
clf_sort_step = 'data/models/en_clf_step2.cPickle'
order = Order(clf_step1, clf_sort_step)

tree = devset['en'][0]['tree']
# NOTE(review): the accompanying text asks for punctuation nodes to be removed
# before ordering; this cell passes the tree through unchanged -- confirm
# whether that is intended.
ordered_tree = order.process(tree)

# NOTE(review): everything below reads from `tree`, not `ordered_tree`, so it
# presumably relies on `process` adding `pred_order_id` to `tree` in place --
# confirm against the `Order` implementation.
node_table = tree['nodes']

print('==Non-ordered linearized tree')
print([node_table[node_id]['lemma'] for node_id in node_table])

print('==Ordered linearized tree')
nodes = sorted(node_table, key=lambda node_id: node_table[node_id]['pred_order_id'])
print([node_table[node_id]['lemma'] for node_id in nodes])