In [None]:
import string
import numpy as np
import cPickle as pickle

# Import pruned word dictionary

In [None]:
# Load the pruned word dictionary produced by an earlier step.
# The with-block guarantees the file is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('word-equal-context_subsampled.p', 'rb') as f:
    info = pickle.load(f)
dictionary = info["word_pruned_dictionary"]
# Only the dictionary is needed; drop the reference to the rest of the payload
del info

# Read in and Process Dependencies to create Word-Context Pairs

In [None]:
# We will use the conllu parser to create a tree based on dependencies
import conllu
from conllu.parser import parse_tree

In [None]:
def read_conll(file_name):
    """Read a CoNLL file and return one string per sentence.

    Sentences are separated by blank (whitespace-only) lines. Every line of a
    sentence has all characters in ``string.punctuation`` removed, is
    lower-cased, and is concatenated (newlines kept) into a single string.

    file_name: path of the CoNLL file to read.
    Returns: list of strings, one per non-empty sentence. A final sentence
    that is not followed by a blank line is included, and runs of blank lines
    do not produce empty entries.
    """
    # Deleting characters with str.translate has a different signature on
    # Python 2 and Python 3, so pick the variant the interpreter supports.
    try:
        deletion_table = str.maketrans('', '', string.punctuation)

        def strip_punctuation(text):
            return text.translate(deletion_table)
    except AttributeError:  # Python 2: str has no maketrans

        def strip_punctuation(text):
            return text.translate(None, string.punctuation)

    # NOTE(review): string.punctuation contains '_' and ':', so '_' placeholder
    # fields and ':' inside relation labels are stripped as well - confirm
    # this is intended for the input files.
    data = []
    lines = []
    with open(file_name) as f:
        # Iterate lazily instead of loading the whole file with readlines()
        for line in f:
            if line.strip():
                lines.append(strip_punctuation(line).lower())
            elif lines:
                data.append(''.join(lines))
                lines = []
    # Flush the last sentence when the file does not end with a blank line
    if lines:
        data.append(''.join(lines))
    return data

In [None]:
def get_word(tree_node):
    """Return the node's word form, or 'UNK' if it is not in ``dictionary``.

    tree_node: node whose ``data['form']`` holds the word.
    """
    form = tree_node.data['form']
    return form if form in dictionary else 'UNK'

In [None]:
def get_word_with_context(child_node, word = None):
    """Build the context label for a dependency edge.

    child_node: the child node of the edge; its ``data['deprel']`` supplies
        the relation name in both cases.
    word: optional context word. When given (the inverse case, where the
        parent is the context), the label is "<word>/<deprel>/-1". When
        omitted, the child's own word (via get_word) is used and the label is
        "<word>/<deprel>".
    Returns: the label string.
    """
    relation = child_node.data['deprel']
    # Identity check: only a missing argument selects the child's own word
    if word is None:
        return get_word(child_node) + "/" + relation
    return word + "/" + relation + "/-1"

In [None]:
def get_children(tree_node):
    """Return the ``children`` attribute of ``tree_node`` unchanged."""
    child_nodes = tree_node.children
    return child_nodes

In [None]:
def process_tree(tree, parent = None):
    """Recursively walk a dependency tree and record word-context pairs.

    For every node that has a parent, two pairs are recorded through
    add_to_lists_and_dictionary:
      * (parent word, "<child word>/<deprel>")
      * (child word, "<parent word>/<deprel>/-1")  - the inverse relation
    The root (called with no parent) records nothing itself; its children are
    then processed in order.

    tree: the current node.
    parent: the node's parent, or None for the root.
    """
    if parent is not None:
        word = get_word(tree)
        parent_word = get_word(parent)
        word_as_context = get_word_with_context(tree)
        # Reuse parent_word rather than looking the parent up a second time
        parent_word_as_context = get_word_with_context(tree, word = parent_word)

        add_to_lists_and_dictionary(parent_word, word_as_context)
        add_to_lists_and_dictionary(word, parent_word_as_context)

    for child in get_children(tree):
        process_tree(child, tree)

In [None]:
def add_to_lists_and_dictionary(word, context):
    """Record one word-context pair as a pair of integer indices.

    The word's index is looked up in ``dictionary`` and appended to ``words``.
    The context label is assigned the next free index in ``context_dict`` the
    first time it is seen, and its index is appended to ``contexts``. Indices
    are stored instead of strings so they can be one-hot encoded later.

    word: a key of ``dictionary``.
    context: context label string.
    """
    word_index = dictionary[word]
    words.append(word_index)
    # setdefault only inserts when the label is new; len() is the next free index
    context_index = context_dict.setdefault(context, len(context_dict))
    contexts.append(context_index)

In [None]:
# Path of the training file in CoNLL format; one list element per sentence is read from it
training_conll_path = 'data/training/training-data.1m.conll'
data = read_conll(training_conll_path)

In [None]:
# Start from empty accumulators so re-running this cell does not duplicate pairs
words, contexts, context_dict = [], [], {}
for sentence in data:
    # Only the first tree returned by parse_tree is used for each sentence
    tree = parse_tree(sentence)[0]
    process_tree(tree)

In [None]:
# One entry is appended to each list per word-context pair, so both counts should match
print(len(contexts))
print(len(words))

In [None]:
# Number of distinct context labels collected (displayed as the cell's output)
len(context_dict)

# Save to Pickle

In [None]:
# Persist both dictionaries and the index-encoded training pairs.
# 'wb' (not 'w') because pickle writes binary data; the with-block closes and
# flushes the file even if dump raises.
with open('dependency_contexts.p', 'wb') as f:
    pickle.dump({"word_pruned_dictionary" : dictionary,
                 "context_pruned_dictionary": context_dict,
                 "train_data_context":contexts,
                 "train_data_words":words}, f, -1)       # -1 specifies highest binary protocol