In [1]:
import pickle, pickledb
import numpy as np
from itertools import count
from collections import defaultdict
import tensorflow as tf
import tensorflow_hub as hub

# Input datasets: each line is read later as tab-separated "x<TAB>y<TAB>relation".
train_file = "/data/Vivek/original/HypeNET/dataset/custom_train_0.0_0.05.tsv"
test_file =  "/data/Vivek/original/HypeNET/dataset/custom_test_0.0_0.05.tsv"
instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/test_knocked.tsv'

# Placeholder path used by parse_dataset for pairs that have no usable path.
NULL_PATH = ((0, 0, 0, 0),)
# The position of a label in this list is its target index (see rel_indexer).
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
# Directory prefix of the pickledb files loaded further down.
prefix = "../junk/db_files/"

# Universal Sentence Encoder module fetched through TensorFlow Hub.
USE_link = "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed"
model = hub.load(USE_link)

# `resolved` is consulted by entity_to_id as entity -> (closest entity, score).
# NOTE(review): pickle.load can execute arbitrary code; only load this file if
# it comes from a trusted source.
with open("../junk/resolved_use.pkl", "rb") as f:
    resolved = pickle.load(f)

def extractUSEEmbeddings(words):
    """Run the module-level `model` on `words` and return its output via `.numpy()`.

    Args:
        words: the input passed straight to `model`; callers in this notebook
            pass a list of strings.

    Returns:
        Whatever `.numpy()` yields for the model output.
    """
    return model(words).numpy()

In [2]:
# Maps the arrow character attached to an edge to the suffix of its direction label.
arrow_heads = {">": "up", "<":"down"}


def extract_direction(edge):
    """Strip a leading or trailing arrow marker from an edge string.

    Args:
        edge: one edge of a path, optionally starting or ending with ">" or "<".

    Returns:
        A ``(direction, edge)`` pair. ``direction`` is "start_up"/"start_down"
        for a leading arrow, "end_up"/"end_down" for a trailing arrow, and
        ' ' when the edge carries no arrow. ``edge`` is the input with that
        single arrow character removed. A leading arrow takes precedence over
        a trailing one.
    """
    # startswith/endswith are safe on the empty string, whereas indexing
    # edge[0] / edge[-1] would raise IndexError for an empty edge.
    if edge.startswith((">", "<")):
        return "start_" + arrow_heads[edge[0]], edge[1:]
    if edge.endswith((">", "<")):
        return "end_" + arrow_heads[edge[-1]], edge[:-1]
    return ' ', edge

def parse_path(path):
    """Convert a "*##*"-separated path string into a tuple of index 4-tuples.

    Every edge is passed through extract_direction and then split from the
    right into three "/"-separated fields (embedding key, POS, dependency), so
    any additional "/" stays inside the first field. Each field, plus the
    direction label, is looked up in its module-level indexer; the indexers
    assign a new index to keys they have not seen before.

    Args:
        path: path string whose edges are joined by "*##*".

    Returns:
        A tuple with one ``(emb_idx, pos_idx, dep_idx, dir_idx)`` entry per
        edge, or None when an edge is empty (before or after its arrow marker
        is removed). parse_dataset drops None results.

    Raises:
        ValueError: an edge does not contain three "/"-separated fields; the
            offending edge and the full path are printed before re-raising.
    """
    parsed_path = []
    for edge in path.split("*##*"):
        # The original guard `if edge.split("/")` was always true (str.split
        # never returns an empty list), which left the `return None` branch
        # unreachable. Test for an empty edge explicitly instead.
        if not edge:
            return None
        direction, edge = extract_direction(edge)
        if not edge:
            return None
        try:
            embedding, pos, dependency = edge.rsplit("/", 2)
        except ValueError:
            print (edge, path)
            raise
        parsed_path.append((
            emb_indexer[embedding],
            pos_indexer[pos],
            dep_indexer[dependency],
            dir_indexer[direction],
        ))
    return tuple(parsed_path)

def parse_tuple(tup):
    """Collect the paths recorded between the two terms of `tup`, in both directions.

    Both terms are mapped to ids with entity_to_id, the paths for (x, y) and
    (y, x) are fetched with extract_paths, and each path id is turned into
    text with id_to_path. In that text, "X/" and "Y/" are replaced by the
    actual terms followed by "/"; for the reversed lookup the roles of the two
    placeholders are swapped.

    Args:
        tup: a pair of term strings.

    Returns:
        dict mapping path text to its count. When both directions yield the
        same text, the count from the reversed lookup wins.
    """
    source, target = tup
    x = entity_to_id(word2id_db, source)
    y = entity_to_id(word2id_db, target)

    forward = extract_paths(relations_db, x, y)
    backward = extract_paths(relations_db, y, x)

    merged = {}
    for path_id, freq in forward.items():
        text = id_to_path(id2path_db, path_id)
        merged[text.replace("X/", source + "/").replace("Y/", target + "/")] = freq
    for path_id, freq in backward.items():
        text = id_to_path(id2path_db, path_id)
        merged[text.replace("Y/", source + "/").replace("X/", target + "/")] = freq
    return merged

def parse_dataset(dataset):
    """Turn a {(x, y): relation} mapping into parallel path/count/target lists.

    Args:
        dataset: dict whose keys are term pairs and whose values are relation
            labels present in `rel_indexer`.

    Returns:
        ``(paths, counts, targets)``: for every pair, the list of indexed
        paths, the list of matching counts, and the relation index. A pair
        with no usable path gets the single placeholder ``NULL_PATH`` with
        count 1.
    """
    # Phase 1: textual paths for every pair.
    raw_dicts = [parse_tuple(pair) for pair in dataset.keys()]

    # Phase 2: index every path (this is where the indexers grow).
    indexed_dicts = []
    for raw in raw_dicts:
        indexed = {}
        for path_text, freq in raw.items():
            indexed[parse_path(path_text)] = freq
        indexed_dicts.append(indexed)

    # Phase 3: drop falsy paths and fall back to the placeholder when none remain.
    paths, counts = [], []
    for indexed in indexed_dicts:
        kept = {path: freq for path, freq in indexed.items() if path}
        if not kept:
            kept = {NULL_PATH: 1}
        paths.append(list(kept.keys()))
        counts.append(list(kept.values()))

    targets = [rel_indexer[relation] for relation in dataset.values()]
    return paths, counts, targets

# Entities that entity_to_id could not / could map to an id, in call order.
failed = []
success = []

def id_to_entity(db, entity_id):
    """Return the value stored in `db` under the string form of `entity_id`."""
    return db[str(entity_id)]

def id_to_path(db, entity_id):
    """Fetch a stored path and rewrite its edge separators to "*##*".

    The value stored under ``str(entity_id)`` is cut at every "/"; within each
    piece only the first "_" is replaced by "*##*", and the pieces are joined
    back with "/".
    """
    stored = db[str(entity_id)]
    pieces = []
    for piece in stored.split("/"):
        # Only the first underscore of a piece is rewritten.
        pieces.append(piece.replace("_", "*##*", 1))
    return "/".join(pieces)

def entity_to_id(db, entity):
    """Map an entity string to its integer id, falling back to `resolved`.

    A truthy ``db.get(entity)`` is used directly. Otherwise the entry for
    `entity` in the module-level `resolved` mapping is consulted; it is
    accepted when its first element is truthy and its second element,
    converted to float, is greater than `threshold`, in which case the id of
    that first element is read from `db`.

    Side effects:
        Appends `entity` to the module-level `success` list when an id is
        returned, and to `failed` otherwise.

    Returns:
        The integer id, or -1 when no id could be determined.
    """
    direct_hit = db.get(entity)
    if direct_hit:
        success.append(entity)
        return int(direct_hit)

    candidate = resolved.get(entity, "")
    if not candidate or not candidate[0] or not (float(candidate[1]) > threshold):
        failed.append(entity)
        return -1

    success.append(entity)
    return int(db[candidate[0]])

def extract_paths(db, x, y):
    """Read the path counts stored for the ordered pair (x, y).

    The entry under ``str(x) + '###' + str(y)`` is read as comma-separated
    items of the form "path_id:count".

    Returns:
        dict mapping int path id to int count. Any failure while looking up or
        parsing the entry yields an empty dict.
    """
    lookup_key = '###'.join((str(x), str(y)))
    try:
        parsed = {}
        for item in db[lookup_key].split(","):
            fields = item.split(":")
            parsed[int(fields[0])] = int(fields[1])
        return parsed
    except Exception:
        # Best effort: a missing or malformed entry counts as "no paths".
        return {}

# Open the five pickledb stores under `prefix`, in a fixed order:
# word -> id, id -> word, path -> id, id -> path, and the pair -> paths table.
word2id_db, id2word_db, path2id_db, id2path_db, relations_db = [
    pickledb.load(prefix + db_name, False)
    for db_name in ("w2i.db", "i2w.db", "p2i.db", "i2p.db", "relations.db")
]


In [4]:
# Score that a fallback match in `resolved` must exceed to be accepted by entity_to_id.
threshold = 0.8

# Each indexer hands out the next unused integer the first time a key is looked up.
emb_indexer, pos_indexer, dep_indexer, dir_indexer = [defaultdict(count(0).__next__) for i in range(4)]
# Looking up "<UNK>" first reserves index 0 in every indexer.
unk_emb, unk_pos, unk_dep, unk_dir = emb_indexer["<UNK>"], pos_indexer["<UNK>"], dep_indexer["<UNK>"], dir_indexer["<UNK>"]
rel_indexer = {key: idx for (idx,key) in enumerate(relations)}


def _load_dataset(path):
    """Read a tab-separated file into ``{(x, y): relation}``.

    Blank lines (for example the one produced by a trailing newline) are
    skipped. When a pair occurs more than once, the last relation wins.
    """
    with open(path) as handle:
        rows = [line.split("\t") for line in handle.read().split("\n") if line.strip()]
    return {tuple(row[:2]): row[2] for row in rows}


train_dataset = _load_dataset(train_file)
test_dataset = _load_dataset(test_file)
test_instances = _load_dataset(instances_file)
test_knocked = _load_dataset(knocked_file)

# Order matters: the indexers assign ids on first sight, so the datasets must
# be parsed (and the nodes indexed afterwards) in this fixed sequence.
paths_train, counts_train, targets_train = parse_dataset(train_dataset)
paths_test, counts_test, targets_test  = parse_dataset(test_dataset)
paths_instances, counts_instances, targets_instances  = parse_dataset(test_instances)
paths_knocked, counts_knocked, targets_knocked  = parse_dataset(test_knocked)

nodes_train = [(emb_indexer[tup[0]], emb_indexer[tup[1]]) for tup in train_dataset]
nodes_test = [(emb_indexer[tup[0]], emb_indexer[tup[1]]) for tup in test_dataset]
nodes_instances = [(emb_indexer[tup[0]], emb_indexer[tup[1]]) for tup in test_instances]
nodes_knocked = [(emb_indexer[tup[0]], emb_indexer[tup[1]]) for tup in test_knocked]

print ("Train len: {}, Test len: {}, Instance len: {}, Knocked len: {}".format(len(paths_train), len(paths_test),  len(paths_instances), len(paths_knocked)))

emb_indexer_inv = {idx: key for key, idx in emb_indexer.items()}
# Skip the first key ("<UNK>", index 0); it gets an all-zero row instead.
embeds = extractUSEEmbeddings(list(emb_indexer.keys())[1:])
emb_vals = np.vstack([np.zeros((1, embeds.shape[1])), embeds])


output_file = "../Input/data_use_" + str(threshold) + ".pkl"
# The context manager guarantees the file is flushed and closed even if dump fails.
with open(output_file, "wb+") as f:
    pickle.dump([nodes_train, paths_train, counts_train, targets_train,
                 nodes_test, paths_test, counts_test, targets_test,
                 nodes_instances, paths_instances, counts_instances, targets_instances,
                 nodes_knocked, paths_knocked, counts_knocked, targets_knocked,
                 emb_indexer, emb_indexer_inv, emb_vals,
                 pos_indexer, dep_indexer, dir_indexer], f)



Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538


In [5]:
# NOTE(review): `parsed_train` is not assigned in any cell visible here, so this
# presumably relies on leftover kernel state and would raise NameError after
# Restart & Run All — confirm where it is defined.
parsed_train[0]

[{((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((1, 1, 1, 1), (2, 2, 2, 2), (3, 1, 1, 3)): 1,
  ((3, 3, 3, 1), (4, 2, 2, 2), (1, 1, 4, 3), (2, 2, 5, 4)): 1,
  ((3, 3, 3, 1), (4, 2, 2, 2), (1, 1, 4, 3)): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((0, 0, 0, 0),): 1},
 {((5, 1, 3, 1), (4, 2, 2, 2), (6, 1, 4, 3), (7, 4, 6, 3), (8, 1, 7, 3)): 1,
  ((5, 1, 7, 1), (9, 4, 2, 2), (5, 1, 7, 3), (10, 4, 6, 3), (8, 1, 7, 3)): 1,
  ((5, 3, 3, 1), (4, 2, 2, 2), (8, 1, 4, 3), (11, 4, 6, 4)): 1,
  ((5, 1, 3, 1), (12, 2, 2, 2), (8, 1, 1, 3)): 1,
  ((5, 1, 7, 1), (13, 4, 6, 1), (14, 2, 2, 2), (8, 1, 3, 3)): 2,
  ((15, 5, 8, 5), (5, 1, 3, 1), (16, 2, 2, 2), (17, 4, 6, 3), (8, 1, 7, 3)): 1,
  ((5, 1, 9, 1),
   (18, 2, 10, 1),
   (19, 1, 2, 2),
   (10, 4, 6, 3),
   (8, 3, 7, 3)): 1,
  ((5, 1, 11, 1), (20, 1, 2, 2), (11, 4, 6, 3), (8, 1, 7, 3)): 1,
  ((5, 1, 7, 1), (10, 4, 2, 2), (5, 1, 7, 3), (10, 4, 6, 3