## Knowledge Graphs

Using TensorFlow and the Neural Structured Learning (NSL) framework to automate the construction of knowledge graphs from relational word vectors.

In [None]:
import numpy as np
import tensorflow as tf
import neural_structured_learning as nsl
import collections
from sklearn.utils import shuffle

In [None]:
# Load the embedding arrays and their vocabulary arrays from .npy files.
# Three sets (prefixed ro / vi / mo) are stored side by side in `folder`.
folder = '/Users/vibhav/Desktop/rp-project-data'

ro_embeds, vi_embeds, mo_embeds = (
    np.load(folder + embeds_file)
    for embeds_file in ('/ro_embeds.npy', '/vi_embeds.npy', '/mo_embeds.npy')
)

ro_vocab, vi_vocab, mo_vocab = (
    np.load(folder + vocab_file)
    for vocab_file in ('/ro_vocab.npy', '/vi_vocab.npy', '/mo_vocab.npy')
)




In [None]:
# Shuffle each embedding array together with its vocabulary so that the
# train/test split below is a random sample. Passing both arrays to a single
# shuffle() call applies the same permutation to both, so word i stays
# aligned with vector i.
ro_embeds, ro_vocab = shuffle(ro_embeds, ro_vocab)
vi_embeds, vi_vocab = shuffle(vi_embeds, vi_vocab)
mo_embeds, mo_vocab = shuffle(mo_embeds, mo_vocab)

# Number of words placed in the training / test dictionary of every set.
TRAIN_SIZE = 3500
TEST_SIZE = 1500


def split_word_vectors(vocab, embeds, train_size=TRAIN_SIZE, test_size=TEST_SIZE):
    """Build train and test word -> vector dictionaries from aligned arrays.

    The first `train_size` entries go to the training dictionary and the
    following `test_size` entries go to the test dictionary.

    Args:
        vocab: sequence of words, aligned index-by-index with `embeds`.
        embeds: sequence of vectors, one per word.
        train_size: number of leading entries used for training.
        test_size: number of entries after the training slice used for testing.

    Returns:
        A `(train, test)` tuple of plain dicts mapping word -> vector.

    Raises:
        ValueError: if `vocab` and `embeds` differ in length, or hold fewer
            than `train_size + test_size` entries.
    """
    if len(vocab) != len(embeds):
        raise ValueError(
            f"vocab and embeds must have the same length "
            f"(got {len(vocab)} and {len(embeds)})"
        )
    split_end = train_size + test_size
    if len(vocab) < split_end:
        raise ValueError(
            f"need at least {split_end} entries for the split, got {len(vocab)}"
        )

    train = dict(zip(vocab[:train_size], embeds[:train_size]))
    test = dict(zip(vocab[train_size:split_end], embeds[train_size:split_end]))
    return train, test


# Create the word -> vector dictionaries for each embedding set.
# Each set gets its OWN train/test dictionaries; previously the vi and mo
# entries were written into the ro dictionaries, leaving vi_*/mo_* empty.
ro_vecs_train, ro_vecs_test = split_word_vectors(ro_vocab, ro_embeds)
vi_vecs_train, vi_vecs_test = split_word_vectors(vi_vocab, vi_embeds)
mo_vecs_train, mo_vecs_test = split_word_vectors(mo_vocab, mo_embeds)


In [None]:
# Helpers and configuration for building graphs from embeddings.


def tensor(l):
    """Convert `l` to a float32 TensorFlow tensor."""
    return tf.convert_to_tensor(l, dtype=tf.float32)


# Settings handed to the NSL graph builder further below.
# NOTE(review): lsh_splits=1000 looks unusually large -- confirm the intended
# value against the GraphBuilderConfig documentation.
gbc = nsl.configs.GraphBuilderConfig(
    similarity_threshold=0.87,
    lsh_splits=1000,
    lsh_rounds=50,
)

def write_to_tfrecord(output_path, vecs):
    """Serialize a word -> vector mapping into a TFRecord file.

    Every entry is written as one `tf.train.Example` with two features:
    'id' (the word encoded as cp1252 bytes) and 'embedding' (the vector
    stored as a float list).

    Args:
        output_path: path of the TFRecord file to create.
        vecs: mapping from word (str) to an iterable of floats.
    """
    with tf.io.TFRecordWriter(output_path) as writer:
        for word, vector in vecs.items():
            feature_map = {
                'id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[word.encode(encoding='cp1252')]
                    )
                ),
                'embedding': tf.train.Feature(
                    float_list=tf.train.FloatList(value=vector)
                ),
            }
            record = tf.train.Example(
                features=tf.train.Features(feature=feature_map)
            )
            writer.write(record.SerializeToString())

# Step 1: dump each training dictionary to its own TFRecord file.
for _tfr_name, _train_vecs in (
    ('/ro_t_exs.tfr', ro_vecs_train),
    ('/vi_t_exs.tfr', vi_vecs_train),
    ('/mo_t_exs.tfr', mo_vecs_train),
):
    write_to_tfrecord(folder + _tfr_name, _train_vecs)

# Step 2: build one graph (TSV output) per TFRecord file using the shared
# graph-builder configuration.
for _tfr_name, _graph_name in (
    ('/ro_t_exs.tfr', '/ro_t_graph.tsv'),
    ('/vi_t_exs.tfr', '/vi_t_graph.tsv'),
    ('/mo_t_exs.tfr', '/mo_t_graph.tsv'),
):
    nsl.tools.build_graph_from_config(
        [folder + _tfr_name], folder + _graph_name, gbc
    )
