In [79]:
from bsddb3 import btopen
import bcolz, pickle
import numpy as np
from itertools import count
from collections import defaultdict

# Common filename prefix of the Berkeley DB files; suffixes such as
# "_word_to_id.db" are appended to it when the databases are opened.
prefix = "../junk/Files/temp_threshold_3_4/temp"
# Tab-separated dataset: the first two fields of each line form the term pair,
# the third field is the label (which must be one of `relations`).
dataset_file = "../junk/temp_dataset.tsv"
# Not referenced in the visible cells -- presumably used by later cells.
output_folder = "../junk/Output/"
# Used both as the bcolz rootdir and as a raw string prefix for the pickled
# vocabulary files (embeddings_folder + 'words.pkl'), i.e. WITHOUT a path
# separator in between.
embeddings_folder = "../junk/Glove.dat"
# Raw GloVe text file, read only when the on-disk cache has to be (re)built.
# NOTE(review): hard-coded absolute path of one user's machine -- the notebook
# is not reproducible elsewhere until this is made configurable.
embeddings_file = "/Users/vivek/SIREN-Research/Archive-LSTM/glove.6B/glove.6B.300d.txt"
# Not referenced in the visible cells -- presumably the learning rate and
# dropout probability of a model defined later (TODO confirm).
lr = 0.001
dropout = 0.3

# Earlier multi-class label set, kept for reference:
# relations = ["hypernym", "hyponym", "synonym", "none"]
# Label strings as they appear in the dataset file; the position in this list
# becomes the integer class id.
relations = ["True", "False"]
NUM_RELATIONS = len(relations)

In [80]:

def id2Entity(db, entity_id):
    """Return the text stored in `db` under the given id.

    The id is stringified and UTF-8 encoded to form the lookup key; the
    stored bytes are decoded back to `str`. A missing key propagates the
    mapping's own lookup error.
    """
    lookup_key = str(entity_id).encode("utf-8")
    stored_bytes = db[lookup_key]
    return stored_bytes.decode("utf-8")

def entity2ID(db, entity):
    """Return the integer id stored in `db` for `entity`, or -1 if absent.

    The entity string is UTF-8 encoded to form the lookup key and the stored
    value is converted with `int`.
    """
    lookup_key = entity.encode("utf-8")
    return int(db[lookup_key]) if lookup_key in db else -1

def extractPaths(db, x, y):
    """Return {path_id: count} stored in `db` for the ordered pair (x, y).

    The lookup key is "<x>_<y>" (UTF-8 encoded) and the stored value is a
    comma-separated list of "path_id:count" items. Any failure -- missing
    key, undecodable or malformed value -- yields an empty dict.
    """
    lookup_key = "_".join((str(x), str(y))).encode("utf-8")
    try:
        serialized = db[lookup_key].decode("utf-8")
        counts = {}
        for item in serialized.split(","):
            fields = item.split(":")
            counts[int(fields[0])] = int(fields[1])
        return counts
    except Exception:
        # Best-effort lookup: any problem is treated as "no paths".
        return {}

def load_embeddings_from_disk():
    """Load the cached embeddings, rebuilding the cache if it cannot be read.

    Returns:
        (embeddings, word2idx) where `embeddings` maps each word to its
        vector and `word2idx` maps each word to its row index.

    The cache consists of the bcolz array at `embeddings_folder` plus two
    pickle files whose names are `embeddings_folder` directly concatenated
    with 'words.pkl' / 'words_index.pkl' (no path separator).
    """
    try:
        vectors = bcolz.open(embeddings_folder)[:]
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point embeddings_folder at caches this notebook wrote itself.
        with open(embeddings_folder + 'words.pkl', 'rb') as words_file:
            words = pickle.load(words_file)
        with open(embeddings_folder + 'words_index.pkl', 'rb') as index_file:
            word2idx = pickle.load(index_file)

        embeddings = {w: vectors[word2idx[w]] for w in words}
    except Exception:
        # Cache missing or unreadable: rebuild it. create_embeddings returns
        # BOTH values; previously the tuple was bound to `embeddings` alone
        # and `word2idx` stayed undefined, so the return below raised
        # UnboundLocalError. `except Exception` (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate.
        embeddings, word2idx = create_embeddings()
    return embeddings, word2idx
        

def create_embeddings():
    """Build the embedding cache from the raw GloVe text file.

    Reads `embeddings_file` (one "<word> <300 floats>" entry per line),
    prepends a random vector for the '_unk_' token at index 0, scales every
    row to unit L2 norm, and persists the result:

    * the vectors as a bcolz array rooted at `embeddings_folder`;
    * the word list and the word -> row-index dict as pickle files named
      `embeddings_folder` + 'words.pkl' / 'words_index.pkl'.

    Returns:
        (embeddings, word2idx): word -> vector dict and word -> index dict.
    """
    words = ['_unk_']
    idx = 1
    word2idx = {"_unk_": 0}
    # Row 0: random (unseeded) vector standing in for unknown words.
    vectors = bcolz.carray(np.random.random(300), rootdir=embeddings_folder, mode='w')
    # Explicit encoding so the read does not depend on the platform default
    # (the GloVe distribution is UTF-8 text).
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        for l in f:
            line = l.split()
            if not line:
                # Tolerate blank lines instead of failing on line[0].
                continue
            word, vector = line[0], line[1:]
            words.append(word)
            # np.float was removed in NumPy 1.24; float64 is what it aliased.
            vectors.append(np.array(vector).astype(np.float64))
            word2idx[word] = idx
            idx += 1
    # The values were appended as one flat sequence; view them as rows of 300.
    vectors = vectors.reshape((-1, 300))
    # Per-row Euclidean norm, then scale each row to unit length.
    row_norm = np.sum(np.abs(vectors)**2, axis=-1)**(1./2)
    vectors /= row_norm[:, np.newaxis]
    vectors = bcolz.carray(vectors, rootdir=embeddings_folder, mode='w')
    vectors.flush()

    # Context managers guarantee the pickles are flushed and closed before
    # anything tries to read them back.
    with open(embeddings_folder + 'words.pkl', 'wb') as words_file:
        pickle.dump(words, words_file)
    with open(embeddings_folder + 'words_index.pkl', 'wb') as index_file:
        pickle.dump(word2idx, index_file)

    embeddings = {w: vectors[word2idx[w]] for w in words}
    return embeddings, word2idx

# Read-only ("r") handles to the Berkeley DB files that share `prefix`.
word2ID_db = btopen(prefix + "_word_to_id.db", "r")
ID2word_db = btopen(prefix + "_id_to_word.db", "r")
path2ID_db = btopen(prefix + "_path_to_id.db", "r")
ID2path_db = btopen(prefix + "_id_to_path.db", "r")
# Keyed by "<x_id>_<y_id>"; values are "path_id:count,..." strings.
relations_db = btopen(prefix + "_word_occurence_map.db", "r")

# embeddings: word -> vector; emb_indexer: word -> row index (0 is '_unk_').
embeddings, emb_indexer = load_embeddings_from_disk()

# dataset: (x, y) -> label string. The file is closed deterministically and
# blank lines (e.g. the one produced by a trailing newline) are skipped
# instead of raising IndexError on the missing third field.
with open(dataset_file) as dataset_fh:
    dataset = {
        tuple(fields[:2]): fields[2]
        for fields in (
            line.split("\t")
            for line in dataset_fh.read().split("\n")
            if line.strip()
        )
    }

# Label string -> integer class id (position in `relations`).
mappingDict = {key: idx for (idx, key) in enumerate(relations)}


In [92]:
# Arrow character found at either end of an edge -> direction name.
arrowHeads = {">": "up", "<": "down"}

def extractDirection(edge):
    """Split an optional arrow marker off an edge string.

    Returns (direction, edge_without_marker). A leading arrow gives
    "start_up"/"start_down", a trailing one "end_up"/"end_down" (a leading
    arrow takes precedence), and no arrow gives a single space with the edge
    unchanged. An empty edge raises IndexError.
    """
    first_char, last_char = edge[0], edge[-1]
    if first_char in arrowHeads:
        return "start_" + arrowHeads[first_char], edge[1:]
    if last_char in arrowHeads:
        return "end_" + arrowHeads[last_char], edge[:-1]
    return ' ', edge
    
def parsePath(path):
    """Convert a path string into a tuple of index 4-tuples.

    `path` is a "_"-separated sequence of edges; each edge is
    "word/pos/dependency" with an optional arrow marker at one end. Every
    edge becomes (emb_idx, pos_idx, dep_idx, dir_idx):

    * emb_idx comes from `emb_indexer` and falls back to 0 for unknown words;
    * pos/dep/dir indices come from the corresponding indexers.

    Returns:
        A tuple with one 4-tuple per edge, or None if any edge is empty or
        does not consist of exactly three "/"-separated fields. Indices
        already assigned for earlier edges of a rejected path are kept.
    """
    parsedPath = []
    for edge in path.split("_"):
        if not edge:
            # An empty edge (empty path or doubled "_") is malformed.
            return None
        direction, edge = extractDirection(edge)
        fields = edge.split("/")
        # str.split never returns an empty list, so the previous truthiness
        # test could not detect malformed edges; check the field count.
        if len(fields) != 3:
            return None
        embedding, pos, dependency = fields
        parsedPath.append((
            emb_indexer.get(embedding, 0),
            pos_indexer[pos],
            dep_indexer[dependency],
            dir_indexer[direction],
        ))
    return tuple(parsedPath)

def extractAllPaths(x,y):
    """Collect the parsed paths (with counts) connecting ids `x` and `y`.

    Looks up the stored paths for both (x, y) and (y, x), substitutes the
    actual words for the "X/" and "Y/" placeholders, and parses each path
    string into index tuples.

    Returns:
        {parsed_path: count}; paths that fail to parse are dropped. Returns
        an empty dict when either id is -1, the value entity2ID uses for
        words that are not in the database.
    """
    # -1 has no entry in the id -> word table; looking it up raised KeyError.
    if x == -1 or y == -1:
        return {}

    paths = list(extractPaths(relations_db,x,y).items()) + list(extractPaths(relations_db,y,x).items())
    x_word, y_word = id2Entity(ID2word_db, x), id2Entity(ID2word_db, y)

    # NOTE(review): when the same path occurs in both directions, or two path
    # strings parse to the same tuple, the later count overwrites the earlier
    # one rather than being added to it -- confirm this is intended.
    pathCountDict = {}
    for path_id, freq in paths:
        path_str = id2Entity(ID2path_db, path_id)
        path_str = path_str.replace("X/", x_word+"/").replace("Y/", y_word+"/")
        pathCountDict[path_str] = freq

    parsedCounts = {parsePath(path_str): freq for path_str, freq in pathCountDict.items()}

    # parsePath signals an unusable path with a falsy result.
    return {parsed: freq for parsed, freq in parsedCounts.items() if parsed}
    
def parseDataset(dataset):
    """Turn an iterable of (x_word, y_word) pairs into model inputs.

    Returns:
        (embed_indices, paths) where `embed_indices[i]` is the pair of
        embedding row indices of the i-th word pair (0 for words missing
        from `emb_indexer`) and `paths[i]` is the {parsed_path: count} dict
        returned by extractAllPaths for that pair.

    Also prints how many pairs have no connecting path.
    """
    # Materialise once: the pairs are traversed several times and may be a
    # one-shot iterable or a dict view.
    pairs = list(dataset)
    keys = [(entity2ID(word2ID_db, x), entity2ID(word2ID_db, y)) for (x, y) in pairs]
    paths = [extractAllPaths(x, y) for (x, y) in keys]
    empty = [pair for pair, path_dict in zip(pairs, paths) if not path_dict]
    print('Pairs without paths:', len(empty), ', all dataset:', len(pairs))
    # Index the WORDS through emb_indexer (word -> row index). The previous
    # code looked the integer database ids up in the word -> vector dict,
    # which never matched and therefore always produced (0, 0).
    embed_indices = [(emb_indexer.get(x, 0), emb_indexer.get(y, 0)) for (x, y) in pairs]
    return embed_indices, paths
    
# Auto-growing vocabularies: looking up an unseen key assigns it the next
# free integer, starting from 0.
pos_indexer = defaultdict(count(0).__next__)
dep_indexer = defaultdict(count(0).__next__)
dir_indexer = defaultdict(count(0).__next__)

# Register "#UNKNOWN#" first so that it takes index 0 in every vocabulary.
unk_pos = pos_indexer["#UNKNOWN#"]
unk_dep = dep_indexer["#UNKNOWN#"]
unk_dir = dir_indexer["#UNKNOWN#"]


# x: (embed_indices, paths) for every pair in the dataset;
# y: integer class id of each pair's label, in the same order.
x = parseDataset(dataset.keys())
y = [mappingDict[label] for label in dataset.values()]


[{((38901, 1, 1, 1), (31, 2, 2, 2), (4045, 1, 3, 3), (5904, 2, 4, 4)): 1, ((38901, 1, 1, 1), (31, 2, 2, 2), (4045, 1, 3, 3)): 3}, {}, {}, {}, {}, {}]
Pairs without paths: 5 , all dataset: 6
