In [150]:
import spacy, subprocess, itertools, multiprocessing
from spacy.tokens.token import Token

MAX_PATH_LEN = 6


def stringifyEdge(word, root=True):
    try:
        w = word.root
    except:
        w = word

    if isinstance(word, Token):
        word = word.lemma_.strip().lower()
    else:
        word = ' '.join([wd.string.strip().lower() for wd in word])
    pos, deps = w.pos_, w.dep_
    path = '/'.join([word, pos, deps if deps and root else 'ROOT'])
    return path

def stringifyArg(word, edge):
    try:
        word = word.root
    except:
        pass
    pos, deps = word.pos_, word.dep_
    path = '/'.join([edge, pos, deps if deps else 'ROOT'])
    return path

def filterPaths(function, lowestCommonHead, paths):
    path1 = [lowestCommonHead]
    path1.extend(paths[:-1])
    path2 = paths
    return any(node not in function(path) for path, node in list(zip(path1, path2)))

def notPunct(arr):
    firstWord = arr[0]
    return firstWord.tag_ != 'PUNCT' and len(firstWord.string.strip()) > 1

def notEqual(x, y):
    try:
        return x!=y
    except:
        return False

def checkHead(token, lowestCommonHead):
    return isinstance(token, Token) and lowestCommonHead == token

def getPathFromRoot(phrase):
    paths = []
    head = phrase.head
    while phrase != head:
        phrase = phrase.head
        paths.append(phrase)
        head = phrase.head
    paths = paths[::-1]
    return paths

def breakCompoundWords(elem):
    try:
        root = elem.root
        return root
    except:
        return elem

def findMinLength(x, y):
    if len(x) < len(y):
        return (len(x), x)
    return (len(y), y)

def findLowestCommonHead(pathX, pathY, minLength, minArray):
    lowestCommonHead = None
    if minLength:        
        uncommon = [i for i in range(minLength) if pathX[i] != pathY[i]]
        if uncommon:
            idx = uncommon[0] - 1
        else:
            idx = minLength - 1
        lowestCommonHead = minArray[idx]
    else:
        idx = 0
        if pathX:
            lowestCommonHead = pathX[0]
        elif pathY:
            lowestCommonHead = pathY[0]
        else:
            lowestCommonHead = None
    
    return idx, lowestCommonHead

def getShortestPath(tup):

    xinit, yinit = tup[0], tup[1]

    x, y = breakCompoundWords(xinit), breakCompoundWords(yinit)
    
    pathX, pathY = getPathFromRoot(x), getPathFromRoot(y)
    
    minLength, minArray = findMinLength(pathX, pathY)
    
    idx, lowestCommonHead = findLowestCommonHead(pathX, pathY, minLength, minArray)
    
    try:
        pathX = pathX[idx+1:]
        pathY = pathY[idx+1:]
        checkLeft, checkRight = lambda h: h.lefts, lambda h: h.rights
        if lowestCommonHead and (filterPaths(checkLeft, lowestCommonHead, pathX) or filterPaths(checkRight, lowestCommonHead, pathY)):
            return None
        pathX = pathX[::-1]

        paths = [(None, xinit, pathX, lowestCommonHead, pathY, yinit, None)]
        lefts, rights = list(xinit.lefts), list(yinit.rights)

        if lefts and notPunct(lefts):
            paths.append((lefts[0], xinit, pathX, lowestCommonHead, pathY, yinit, None))

        if rights and notPunct(rights):
            paths.append((None, xinit, pathX, lowestCommonHead, pathY, yinit, rights[0]))
        
        return paths
    except Exception as e:
        print (e)
        return None

def stringifyFilterPath(path):

    lowestCommonHeads = []
    (leftX, x, pathX, lowestCommonHead, pathY, y, rightY) = path

    isXHead, isYHead = checkHead(x, lowestCommonHead), checkHead(y, lowestCommonHead)
    signX = '' if isXHead else '>'
    leftXPath  = []
    if leftX:
        edge_str = stringifyEdge(leftX)
        leftXPath.append(edge_str + "<")

    signY = '' if isYHead else '<'
    rightYPath = []
    if rightY:
        edge_str = stringifyEdge(rightY)
        rightYPath.append(">" + edge_str)

    lowestCommonHeads = [[stringifyEdge(lowestCommonHead, False)] if lowestCommonHead and not (isYHead or isXHead) else []][0]
    
    if MAX_PATH_LEN >= len(pathX + leftXPath + pathY + rightYPath + lowestCommonHeads):
        
        if isinstance(x, Token):
            stringifiedX = x.string.strip().lower()
        else:
            stringifiedX = ' '.join([x_wd.string.strip().lower() for x_wd in x])
        
        if isinstance(y, Token):
            stringifiedY = y.string.strip().lower()
        else:
            stringifiedY = ' '.join([y_wd.string.strip().lower() for y_wd in y])

        stringifiedPathX, stringifiedPathY = [stringifyEdge(word) + ">" for word in pathX], ["<" + stringifyEdge(word) for word in pathY]
        stringifiedArgX, stringifiedArgY = [stringifyArg(x, 'X') + signX], [signY + stringifyArg(y, 'Y')]
        
        stringifiedPath = '_'.join(leftXPath + stringifiedArgX + stringifiedPathX + lowestCommonHeads + stringifiedPathY + stringifiedArgY + rightYPath)

        return (stringifiedX, stringifiedY, stringifiedPath)

    return None

def getDependencyPaths(sentence, nlp, sentenceNounChunks):

    nps = [(n, n.start, n.end) for n in sentenceNounChunks]
    nps.extend([(word, pos, pos) for (pos, word) in enumerate(sentence) if word.tag_[:2] == 'NN' and len(word.string.strip()) > 2])
    ls = list(itertools.product(nps, nps))
    pairedConcepts = [(el[0][0], el[1][0]) for el in itertools.product(nps, nps) if el[1][1] > el[0][2] and notEqual(el[0], el[1])]
    pairedConcepts = list(dict.fromkeys(pairedConcepts))
    
    paths = []
    for pair in pairedConcepts:
        appendingElem = getShortestPath(pair)
        if appendingElem:
            paths.extend([stringifyFilterPath(path) for path in appendingElem])

    return paths

def splitFile (file, n):
    inputfile = open(file, 'r')
    output = None
    suffix = 0
    for (i, line) in enumerate(inputfile):
        if i % n == 0:
            if output:
                output.close()
            output = open(file + "_split_" + str(suffix) + '.txt', 'w+')
            suffix += 1
        output.write(line)
    output.close()
    return suffix

def parseText(idx):
    global file

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
    fileName = file + "_split_" + str(idx) + ".txt"
    op = file + "_parsed_" + str(idx)

    with open(fileName, "r") as inp:
        with open(op, "w+") as out:
            for i,para in enumerate(inp):
                if not para.strip(): continue
                nounChunks = list(nlp(para).nounChunks).copy()
                sentences = nlp(para.strip()).sents
                for sentence in sentences:
                    if "<doc id=" in sentence.text or "</doc>" in sentence.text:
                        continue
                    sentenceNounChunks = [n for n in nounChunks if sentence.start <= n.start < n.end - 1 < sentence.end]
                    dependencies = getDependencyPaths(sentence, nlp, sentenceNounChunks)
                    if dependencies:
                        allpaths = ["\t".join(path) for path in dependencies if path]
                        out.write("\n".join(allpaths))


if __name__ == "__main__":    
    file = "../junk/temp"
    countlines = "wc -l " + file 
    output, _ = subprocess.Popen(countlines.split(), stdout=subprocess.PIPE).communicate()
    n = int(output.decode("utf-8").strip().split(" ")[0]) + 1
    m = int(n/20)
    suffix = splitFile(file, m)
    processes = []
    for i in range(20):
        p = multiprocessing.Process(target=parseText, args=(i,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
    
    mergeParsedOutput = "cat " + file + "_parsed_* > parsed_paths"  
    output, _ = subprocess.Popen(mergeParsedOutput.split(), shell=True).communicate()

hi
done


In [5]:
splitFileName = "file_split_01".split("_")
file = "_".join(splitFileName[:-1]) + "_" + ("0" + splitFileName[-1] if len(splitFileName[-1]) == 1 else  splitFileName[-1])

In [17]:

l2r = btopen(paths_folder + "/" + prefix + '_l2r.db', 'c')

with open(file) as inp:
    for line in inp:
        x, y, path, count = line.strip().split('\t')

        key = str(x) + '_' + str(y)
        current = path + ":" + count

        if key in l2r:
            pastkeys = l2r[key].decode('utf-8')
            current =  pastkeys + current
        
        current = current.encode("utf-8")
        key = key.encode("utf-8")
        
        l2r[key] = current

l2r.sync()



DocoptExit: Usage:
        create_resource_from_corpus_2.py <triplet_file> <resource_prefix>

In [7]:

from collections import defaultdict
from itertools import count

def somefn(pos_index):
    randomstr = "ok"
    return pos_index[randomstr]

pos_index = defaultdict(count(0).__next__)
somefn(pos_index), pos_index



(0,
 defaultdict(<method-wrapper '__next__' of itertools.count object at 0x10db93f48>,
             {'ok': 0}))

In [77]:
corpus_prefix = "../junk/Files/temp_threshold_3_4/temp"

import bsddb3

def vectorize_path(path, lemma_index, pos_index, dep_index, dir_index):
    """
    Return a vector representation of the path
    :param path:
    :param lemma_index:
    :param pos_index:
    :param dep_index:
    :param dir_index:
    :return:
    """
    path_edges = [vectorize_edge(edge, lemma_index, pos_index, dep_index, dir_index) for edge in path.split('_')]
    if None in path_edges:
        return None
    else:
        return tuple(path_edges)


def vectorize_edge(edge, lemma_index, pos_index, dep_index, dir_index):
    """
    Return a vector representation of the edge: concatenate lemma/pos/dep and add direction symbols
    :param edge:
    :param lemma_index:
    :param pos_index:
    :param dep_index:
    :param dir_index:
    :return:
    """
    direction = ' '

    # Get the direction
    if edge.startswith('<') or edge.startswith('>'):
        direction = 's' + edge[0]
        edge = edge[1:]
    elif edge.endswith('<') or edge.endswith('>'):
        direction = 'e' + edge[-1]
        edge = edge[:-1]

    try:
        lemma, pos, dep = edge.split('/')
        print (lemma)
    except:
        return None

    return tuple([lemma_index.get(lemma, 0), pos_index[pos], dep_index[dep], dir_index[direction]])

def get_paths(corpus, x, y):
    """
    Get the paths that connect x and y in the corpus
    :param corpus: the corpus' resource object
    :param x:
    :param y:
    :return:
    """
    x_to_y_paths = corpus.get_relations(x, y)
    y_to_x_paths = corpus.get_relations(y, x)
    x_term = corpus.get_term_by_id(x)
    y_term = corpus.get_term_by_id(y)
#     print ([type(corpus.get_path_by_id(path)) for (path, count) in x_to_y_paths.items()], type(x_term), type(y_term))
    paths = { corpus.get_path_by_id(path).replace("X/", x_term+"/").replace("Y/", y_term+"/") : count for (path, count) in x_to_y_paths.items() }
    paths.update({ corpus.get_path_by_id(path).replace("X/", y_term+"/").replace("Y/", x_term+"/") : count
                   for (path, count) in y_to_x_paths.items() })
    return paths

class KnowledgeResource:
    """
    Holds the resource graph data
    """
    def __init__(self, resource_prefix):
        """
        Init the knowledge resource
        :param resource_prefix - the resource directory and file prefix
        """
        self.term_to_id = bsddb3.btopen(resource_prefix + '_word_to_id.db', 'r')
        self.id_to_term = bsddb3.btopen(resource_prefix + '_id_to_word.db', 'r')
        self.path_to_id = bsddb3.btopen(resource_prefix + '_path_to_id.db', 'r')
        self.id_to_path = bsddb3.btopen(resource_prefix + '_id_to_path.db', 'r')
        self.l2r_edges = bsddb3.btopen(resource_prefix + '_word_occurence_map.db', 'r')

    def get_term_by_id(self, id):
        return self.id_to_term[str(id).encode("utf-8")].decode("utf-8")

    def get_path_by_id(self, id):
        return self.id_to_path[str(id).encode("utf-8")].decode("utf-8")

    def get_id_by_term(self, term):
        return int(self.term_to_id[term.encode("utf-8")]) if self.term_to_id.has_key(term.encode("utf-8")) else -1

    def get_id_by_path(self, path):
        return int(self.path_to_id[path.encode("utf-8")]) if self.path_to_id.has_key(path.encode("utf-8")) else -1

    def get_relations(self, x, y):
        """
        Returns the relations from x to y
        """
        path_dict = {}
        key = str(x) + '_' + str(y)
        path_str = self.l2r_edges[key.encode("utf-8")].decode("utf-8") if self.l2r_edges.has_key(key.encode("utf-8")) else ''

        if len(path_str) > 0:
#             print (path_str)
#             print ([p for p in path_str.split(',')])
            paths = [tuple([int(x) for x in p.split(':')]) for p in path_str.split(',')]
            path_dict = { path : count for (path, count) in paths }

        return path_dict

    
corpus = KnowledgeResource(corpus_prefix)
dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open("../junk/temp_dataset.tsv").read().split("\n")}


In [79]:
pos_index = defaultdict(count(0).__next__)
dep_index = defaultdict(count(0).__next__)
dir_index = defaultdict(count(0).__next__)
lemma_index = defaultdict(count(0).__next__)

keys = [(corpus.get_id_by_term(str(x)), corpus.get_id_by_term(str(y))) for (x, y) in dataset]
paths_x_to_y = [{ vectorize_path(path, lemma_index, pos_index, dep_index, dir_index) : count
                      for path, count in get_paths(corpus, x_id, y_id).items() }
                    for (x_id, y_id) in keys]

anarchism
be
philosophy
reject
anarchism
be
philosophy


In [80]:
paths_x_to_y

[{((0, 0, 0, 0), (0, 1, 1, 1), (0, 0, 2, 2), (0, 1, 3, 3)): 1,
  ((0, 0, 0, 0), (0, 1, 1, 1), (0, 0, 2, 2)): 3},
 {},
 {},
 {},
 {},
 {}]

In [81]:
paths_x_to_y = [ { p : c for p, c in paths_x_to_y[i].items() if p is not None } for i in range(len(keys)) ]
paths_x_to_y

[{((0, 0, 0, 0), (0, 1, 1, 1), (0, 0, 2, 2), (0, 1, 3, 3)): 1,
  ((0, 0, 0, 0), (0, 1, 1, 1), (0, 0, 2, 2)): 3},
 {},
 {},
 {},
 {},
 {}]

In [82]:
NUM_LAYERS = 2
HIDDEN_DIM = 60
LEMMA_DIM = 300
POS_DIM = 4
DEP_DIM = 5
DIR_DIM = 1

from dynet import *
model = Model()
network_input = HIDDEN_DIM

builder = LSTMBuilder(NUM_LAYERS, LEMMA_DIM + POS_DIM + DEP_DIM + DIR_DIM, network_input, model)

In [86]:
model_parameters = {}
model.add_parameters((4, 60))


_dynet.ParameterCollection