### Important

In [1]:
# Dataset folder name; used to build the training-file path and every artifact name.
data_name = 'fb237_v4'
# Tag of this model variant; embedded in the names of the saved models and ID pickle.
model_id = 'main_9'

In [2]:
# Define the names used when saving the artifacts: each one is
# "<prefix>_<model_id>_<data_name>".
model_name = '_'.join(('Model', model_id, data_name))
one_hop_model_name = '_'.join(('One_hop_model', model_id, data_name))
ids_name = '_'.join(('IDs', model_id, data_name))

In [3]:
import librosa
import opensmile
import os
import sys
import numpy as np
import random
import pickle

from collections import defaultdict
from copy import deepcopy
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import initializers
from tensorflow.keras.utils import plot_model

In [4]:
class LoadKG:
    """Load a knowledge graph stored as one whitespace-separated triple per line.

    Each line is read as ``source relation target``. The loader fills the
    containers passed in by the caller instead of returning new ones, so the
    same ID vocabularies can be extended by several files.
    """

    def __init__(self):

        self.x = 'Hello'

    @staticmethod
    def _inverse_relation_id(r):
        """Return the id of the inverse of relation id ``r``.

        Relations are registered in pairs (original, inverse) on consecutive
        ids, so an even id maps to ``r + 1`` and an odd id to ``r - 1``. This
        relies on ``relation2id`` holding an even number of entries when
        loading starts (true for an empty dict or one filled by this class).
        """
        return r + 1 if r % 2 == 0 else r - 1

    def load_train_data(self, data_path, one_hop, data, s_t_r, entity2id, id2entity,
                     relation2id, id2relation):
        """Read the triples in ``data_path`` and update all containers in place.

        Args:
            data_path: path of the text file holding the triples.
            one_hop: dict ``entity id -> {relation id -> set of neighbour ids}``;
                receives both the original and the inverse direction.
            data: set receiving the ``(s, r, t)`` id triples (original direction only).
            s_t_r: mapping ``(s, t) -> set of relation ids``; receives both directions.
                A plain dict or a ``defaultdict(set)`` both work.
            entity2id, id2entity: entity vocabulary, extended with unseen entities.
            relation2id, id2relation: relation vocabulary, extended with unseen
                relations and their ``'_inverse_'``-prefixed counterparts.

        Raises:
            ValueError: if a non-blank line has fewer than three fields.
        """
        # A dict is used as an insertion-ordered set: duplicates are dropped but
        # ids are assigned in file order, so the vocabularies are reproducible
        # across runs (iterating a set of strings depends on hash randomisation).
        triples = dict()

        with open(data_path, 'r') as f:

            for line_no, line in enumerate(f, start=1):

                fields = line.split()

                #blank lines carry no triple
                if not fields:
                    continue

                if len(fields) < 3:
                    raise ValueError(
                        'line ' + str(line_no) + ' of ' + str(data_path) +
                        ' does not contain a (source, relation, target) triple')

                #only the first three fields are used
                triples[tuple(fields[:3])] = None

        ####relation dict#################
        next_id = len(relation2id)

        for _, relation, _ in triples:

            if relation not in relation2id:

                #the relation and its inverse take two consecutive ids
                for name in (relation, '_inverse_' + relation):

                    relation2id[name] = next_id
                    id2relation[next_id] = name
                    next_id += 1

        ####entity dict###################
        next_id = len(entity2id)

        for source, _, target in triples:

            for entity in (source, target):

                if entity not in entity2id:

                    entity2id[entity] = next_id
                    id2entity[next_id] = entity
                    next_id += 1

        #create the set of triples using id instead of string
        for source, relation, target in triples:

            s = entity2id[source]
            r = relation2id[relation]
            t = entity2id[target]
            r_inv = self._inverse_relation_id(r)

            data.add((s, r, t))

            #original direction
            s_t_r.setdefault((s, t), set()).add(r)
            one_hop.setdefault(s, dict()).setdefault(r, set()).add(t)

            #inverse direction
            s_t_r.setdefault((t, s), set()).add(r_inv)
            one_hop.setdefault(t, dict()).setdefault(r_inv, set()).add(s)

In [5]:
class ObtainPathsByDynamicProgramming:
    """Enumerate relation paths starting from one entity by depth-first recursion.

    Attributes set by the constructor:
        size_bd: maximum number of paths stored per target entity.
        threshold: maximum number of neighbour expansions counted per path
            length; once exceeded, no further recursion starts at that length.
    """

    def __init__(self, size_bd=50, threshold=500000):

        self.size_bd = size_bd #size bound limit the number of paths to a target entity t

        #number of times paths with specific length been performed for recursion
        self.threshold = threshold

    def obtain_paths(self, mode, s, t_input, lower_bd, upper_bd, one_hop):
        """Find simple paths (no entity visited twice) from ``s`` to other entities.

        One may refer to LeetCode Problem 797 for the recursion scheme:
            https://leetcode.com/problems/all-paths-from-source-to-target/

        Args:
            mode: which end entities are recorded:
                ``'direct_neighbour'`` - only direct neighbours of ``s``;
                ``'target_specified'`` - only ``t_input``;
                ``'any_target'`` - every entity that is a key of ``one_hop``.
            s: id of the start entity; must be a key of ``one_hop``.
            t_input: target entity id, only read in ``'target_specified'`` mode.
            lower_bd: minimum path length (number of relations), an int >= 1.
            upper_bd: maximum path length, an int >= ``lower_bd``.
            one_hop: dict ``entity -> {relation -> set of neighbour entities}``.

        Returns:
            ``(res, count_dict)`` where ``res`` maps each reached target to a set
            of paths (tuples of relation ids, at most ``size_bd`` per target) and
            ``count_dict`` maps a path length to the number of neighbour
            expansions counted at that length. Both are ``defaultdict`` objects.

        Raises:
            TypeError: if the bounds are not ints, are < 1, or are inconsistent.
            ValueError: if ``s`` is not in ``one_hop`` or ``mode`` is unknown.
        """
        #exact type check on purpose: bool is rejected, as it always was
        if type(lower_bd) is not int or lower_bd < 1:

            raise TypeError("!!! invalid lower bound setting, must >= 1 !!!")

        if type(upper_bd) is not int or upper_bd < 1:

            raise TypeError("!!! invalid upper bound setting, must >= 1 !!!")

        if lower_bd > upper_bd:

            raise TypeError("!!! lower bound must not exced upper bound !!!")

        if s not in one_hop:

            raise ValueError('!!! entity not in one_hop. Please work on existing entities')

        #here is the result dict. Its key is each entity t sharing paths from s
        #The value of each t is a set containing the paths from s to t
        #These paths can be either the direct connection r, or a multi-hop path
        res = defaultdict(set)

        #qualified_t contains the entities whose paths are recorded in res
        qualified_t = set()

        if mode == 'direct_neighbour':

            for neighbours in one_hop[s].values():

                qualified_t.update(neighbours)

        elif mode == 'target_specified':

            qualified_t.add(t_input)

        elif mode == 'any_target':

            qualified_t.update(one_hop)

        else:

            raise ValueError('not a valid mode')

        count_dict = defaultdict(int)

        def helper(node, path, on_path_en):
            """Record ``path`` for ``node`` if it qualifies, then extend it.

            ``path`` is the list of relations walked from s to ``node`` and
            ``on_path_en`` the set of entities already on that path.
            """
            depth = len(path)

            #the membership test comes before res[node] so that entities which
            #do not qualify never get an (empty) entry in the result dict
            if (lower_bd <= depth <= upper_bd) and (node in qualified_t) and (
                    len(res[node]) < self.size_bd):

                res[node].add(tuple(path))

            #won't start new recursions if the current path length already reaches
            #the upper limit or the expansion budget of this length is used up
            if depth >= upper_bd or count_dict[depth] > self.threshold:
                return

            #all the directly connected (r,t) pairs: the candidate next steps
            potential_pair = list({(r, t)
                                   for r, neighbours in one_hop[node].items()
                                   for t in neighbours})

            #random order, so that a truncated search is not biased
            random.shuffle(potential_pair)

            for r, t in potential_pair:

                #every candidate counts against the budget, visited or not
                count_dict[depth] += 1

                #if t not on the path, then finally proceed to next recursion
                if (t not in on_path_en) and (count_dict[depth] <= self.threshold):

                    helper(t, path + [r], on_path_en | {t})

        helper(s, [], {s})

        return(res, count_dict)

In [6]:
# Training triples live in ../data/<data_name>/train.txt
train_path = '/'.join(('../data', data_name, 'train.txt'))

In [7]:
#load the classes
Class_1 = LoadKG()
Class_2 = ObtainPathsByDynamicProgramming()

In [8]:
#define the dictionaries and sets for load KG
one_hop = dict() 
data = set()
s_t_r = defaultdict(set)
entity2id = dict()
id2entity = dict()
relation2id = dict()
id2relation = dict()

#fill in the sets and dicts
Class_1.load_train_data(train_path, one_hop, data, s_t_r,
                        entity2id, id2entity, relation2id, id2relation)

#### Build the path-based siamese neural network structure

We use biLSTM to train on the input path embedding sequence to predict the output embedding or the relation.

In [9]:
# Input layers, using one integer per relation id.
# fst_path / scd_path / thd_path are the three relation paths of one sample;
# id_rela (below) carries the id of the relation that is scored against them.
fst_path = keras.Input(shape=(None,), dtype="int32")
scd_path = keras.Input(shape=(None,), dtype="int32")
thd_path = keras.Input(shape=(None,), dtype="int32")

#the relation input layer (for output embedding)
id_rela = keras.Input(shape=(None,), dtype="int32")

# Embed each integer in a 300-dimensional vector as input,
# note that we add another "space holder" embedding (index len(relation2id)),
# which holds the spaces if the initial lengths of the paths are not the same.
# No masking is configured here, so padded steps are processed like real ones.
in_embd_var = layers.Embedding(len(relation2id)+1, 300)

# Obtain the embedding (the same embedding table is shared by the three paths)
fst_p_embd = in_embd_var(fst_path)
scd_p_embd = in_embd_var(scd_path)
thd_p_embd = in_embd_var(thd_path)

# Embed the candidate relation id in a 300-dimensional vector;
# this is a separate table from the path embedding above
rela_embd = layers.Embedding(len(relation2id)+1, 300)(id_rela)

#add 2 layer bi-directional LSTM
# each layer object is created once and applied to all three paths,
# so the three channels share their weights
lstm_layer_1 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))
lstm_layer_2 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))

#first LSTM layer
fst_lstm_mid = lstm_layer_1(fst_p_embd)
scd_lstm_mid = lstm_layer_1(scd_p_embd)
thd_lstm_mid = lstm_layer_1(thd_p_embd)

#second LSTM layer
fst_lstm_out = lstm_layer_2(fst_lstm_mid)
scd_lstm_out = lstm_layer_2(scd_lstm_mid)
thd_lstm_out = lstm_layer_2(thd_lstm_mid)

#reduce max: max-pool over the time axis, one vector per path
fst_reduce_max = tf.reduce_max(fst_lstm_out, axis=1)
scd_reduce_max = tf.reduce_max(scd_lstm_out, axis=1)
thd_reduce_max = tf.reduce_max(thd_lstm_out, axis=1)

#concatenate the output vectors of the three path channels: (Batch, 900)
path_concat = layers.concatenate([fst_reduce_max, scd_reduce_max, thd_reduce_max], axis=-1)

#add dropout on top of the concatenation from all channels
dropout = layers.Dropout(0.25)(path_concat)

#multiply into output embd size by dense layer: (Batch, 300)
path_out_vect = layers.Dense(300, activation='tanh')(dropout)

#remove the time dimension from the output embd since there is only one step
rela_out_embd = tf.reduce_sum(rela_embd, axis=1)

# Normalize the vectors to have unit length
path_out_vect_norm = tf.math.l2_normalize(path_out_vect, axis=-1)
rela_out_embd_norm = tf.math.l2_normalize(rela_out_embd, axis=-1)

# Calculate the dot product; with unit-length inputs this is the cosine similarity
dot_product = layers.Dot(axes=-1)([path_out_vect_norm, rela_out_embd_norm])

#put together the model
model = keras.Model([fst_path, scd_path, thd_path, id_rela], dot_product)

2023-02-22 23:05:55.317059: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
#config the Adam optimizer 
# NOTE(review): `decay` is a legacy optimizer argument; newer Keras releases may
# reject it - verify against the installed TensorFlow version.
opt = keras.optimizers.Adam(learning_rate=0.0005, decay=1e-6)

#compile the model
# NOTE(review): the model output is a dot product of two L2-normalised vectors,
# i.e. a value in [-1, 1], while binary cross-entropy is defined on probabilities
# in [0, 1] - confirm this pairing is intended.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

#### Build the one-hop basic network when path cannot be found

In [50]:
#each input is a vector whose dimension is the number of relations:
#each dim represents the existence (1) or not (0) of an out-going relation from the entity
source_input_ = keras.Input(shape=(len(relation2id),), dtype="float32")
target_input_ = keras.Input(shape=(len(relation2id),), dtype="float32")

#the relation input layer (for output embedding)
id_rela_ = keras.Input(shape=(None,), dtype="int32")

# Embed the candidate relation id in a 100-dimensional vector
rela_embd_ = layers.Embedding(len(relation2id)+1, 100)(id_rela_)

#remove the time dimension from the output embd since there is only one step
rela_out_embd_ = tf.reduce_sum(rela_embd_, axis=1)

#concatenate the source and target inputs: (Batch, 2 * number of relations)
vec_concat_ = layers.concatenate([source_input_, target_input_], axis=-1)

#apply dense layers
#first hidden layer: (Batch, 256)
dense_1_ = layers.Dense(256, activation='relu')(vec_concat_)

#apply dense layers
#second hidden layer: (Batch, 256)
dense_2_ = layers.Dense(256, activation='relu')(dense_1_)
#add dropout on top of the second hidden layer
dropout_ = layers.Dropout(0.25)(dense_2_)

#apply dense layers
#project to the relation embedding size: (Batch, 100)
dense_3_ = layers.Dense(100, activation='tanh')(dropout_)

# Normalize the vectors to have unit length
dense_norm_ = tf.math.l2_normalize(dense_3_, axis=-1)
rela_out_embd_norm_ = tf.math.l2_normalize(rela_out_embd_, axis=-1)

# Calculate the dot product; with unit-length inputs this is the cosine similarity
dot_product_ = layers.Dot(axes=-1)([dense_norm_, rela_out_embd_norm_])

#put together the model
model_2 = keras.Model([source_input_, target_input_, id_rela_], dot_product_)

In [51]:
#config the Adam optimizer 
# NOTE(review): `decay` is a legacy optimizer argument; newer Keras releases may
# reject it - verify against the installed TensorFlow version.
opt_ = keras.optimizers.Adam(learning_rate=0.0005, decay=1e-6)

#compile the model
# NOTE(review): model_2 outputs a dot product of two L2-normalised vectors, i.e. a
# value in [-1, 1], while binary cross-entropy is defined on probabilities in
# [0, 1] - confirm this pairing is intended.
model_2.compile(loss='binary_crossentropy', optimizer=opt_, metrics=['binary_accuracy'])

### Build the big-batch for path-based model
We will build the big-batch for the path-based model training. That is, we will build three lists to store three paths, respectively. 
* At each step, three different paths between two entities s and t are selected. Each path is appended to one of the lists. 
* If this step is for positive samples, the existing relation r will be selected between s and t. If there is more than one relation from s to t, we randomly choose one. Also, the label 1 is appended to the label list.
* If this step is for negative samples, one relation that does not exist between s and t will be selected randomly and appended to the relation list. Also, the label 0 is appended to the label list.
* In practice, the positive step is always followed by a negative step. The same paths in the positive step will be used in the next negative step, while the relation is a negative one chosen in the above way.
* We do this until the length limit is reached.

In [13]:
#function to build the big batche for path-based training
def build_big_batches(holder_len, lower_bd, upper_bd, Class_2, one_hop, s_t_r,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity,
                      max_idle_draws=None):
    """Fill the path-based training lists with positive/negative sample pairs.

    Repeatedly picks a random source entity, asks ``Class_2`` for the paths to
    its direct neighbours and, for every neighbour with at least three paths,
    samples three distinct paths. Each sample is appended twice: once with a
    relation that links the two entities (label 1.) and once with a relation
    that does not (label 0.). Paths are right-padded to ``upper_bd`` with the
    place-holder id ``len(id2relation)``.

    Args:
        holder_len: number of samples wanted in ``y_list``; must be a multiple of 10.
        lower_bd, upper_bd: path length bounds forwarded to ``Class_2.obtain_paths``.
        Class_2: object exposing ``obtain_paths(mode, s, t_input, lower_bd,
            upper_bd, one_hop)`` and returning ``(paths_per_target, counts)``.
        one_hop: dict ``entity -> {relation -> set of neighbours}``.
        s_t_r: mapping ``(source, target) -> set of relation ids``.
        x_p_list: dict with keys ``'1'``, ``'2'``, ``'3'``; each list receives one
            padded path per sample. Extended in place.
        x_r_list: list receiving ``[relation_id]`` per sample. Extended in place.
        y_list: list receiving the labels. Extended in place.
        relation2id, entity2id, id2entity: accepted for signature compatibility; not read.
        id2relation: dict ``relation id -> name``; ids must be ``0..len-1``.
        max_idle_draws: if given, raise after this many consecutive source
            entities produced no sample. ``None`` (default) retries for ever,
            which never terminates on a graph without qualifying pairs.

    Raises:
        ValueError: on an invalid ``holder_len``, on gaps in ``id2relation`` or
            on an empty relation set stored in ``s_t_r``.
        RuntimeError: when ``max_idle_draws`` is exhausted.
    """
    if holder_len % 10 != 0:
        raise ValueError('We would like to take 10X as a big-batch size')

    #the set of all relation IDs
    relation_id_set = set()
    for i in range(len(id2relation)):

        if i not in id2relation:
            raise ValueError('error when generaing id2relation')

        relation_id_set.add(i)

    #also the id of the place-holder used for padding
    num_r = len(id2relation)

    def pad(path):
        #a fresh list per call, so no two samples share a list object
        return list(path) + [num_r] * (upper_bd - len(path))

    #only entities that are keys of one_hop can be used as a source
    existing_ids = list(one_hop)

    #We want to increase the diversity of paths and targets, so a source is
    #abandoned after this many sampling attempts ("sampled", not "appended") ...
    max_per_source = 1000
    #... and a single target after this many attempts
    max_per_target = 50

    #count how many appending has performed
    count = 0
    idle_draws = 0

    while len(y_list) < holder_len:

        source_id = random.choice(existing_ids)

        result, _ = Class_2.obtain_paths('direct_neighbour', source_id,
                                         'not_specified', lower_bd, upper_bd, one_hop)

        len_before = len(y_list)
        tried_source = 0

        for target_id in result:

            if len(y_list) >= holder_len or tried_source > max_per_source:
                break

            known = s_t_r.get((source_id, target_id))

            #we need s and t to be directly connected (relation for the positive
            #sample), not connected by every relation (relation for the negative
            #sample), and at least three paths between them
            if known is None or len(known) >= num_r or len(result[target_id]) < 3:
                continue

            dir_r = list(known)

            if len(dir_r) <= 0:

                raise ValueError('errors when creating s_t_r !!')

            non_dir_r = list(relation_id_set.difference(dir_r))
            temp_path_list = list(result[target_id])
            tried_target = 0

            #stop as soon as the holder is full instead of drawing unused samples
            while (tried_target <= max_per_target and tried_source <= max_per_source
                   and len(y_list) < holder_len):

                path_1, path_2, path_3 = random.sample(temp_path_list, 3)

                if (path_1 != path_2) and (path_2 != path_3) and (path_1 != path_3):

                    #the positive sample is always followed by a negative one
                    #built from the same three paths
                    for relation_id, label in ((random.choice(dir_r), 1.),
                                               (random.choice(non_dir_r), 0.)):

                        x_p_list['1'].append(pad(path_1))
                        x_p_list['2'].append(pad(path_2))
                        x_p_list['3'].append(pad(path_3))
                        x_r_list.append([relation_id])
                        y_list.append(label)

                    count += 2

                    if count % 20000 == 0:
                        print('generating big-batches', count, holder_len)

                tried_target += 1
                tried_source += 1

        if len(y_list) > len_before:
            idle_draws = 0
        else:
            idle_draws += 1
            if max_idle_draws is not None and idle_draws >= max_idle_draws:
                raise RuntimeError(
                    'no sample could be generated from ' + str(idle_draws) +
                    ' consecutive source entities')

### Build the big-batch for the one-hop neighbor network training

* At each step, we will select one triple (s,r,t) from the dataset. Then, the one-hop vector of s and t is generated respectively according to their out-going relations, and then appended to the corresponding source one-hop list and target one-hop list. 
* If this is a positive sample step, the id of relation r is appended to the relation list.
* If this is a negative sample step, the id of a randomly selected relation is chosen and appended.
* Similarly, one negative sample step always follows one positive step. The one-hop vectors from the previous positive step are used again for the negative step.

In [52]:
#function to build the big-batch for one-hope neighbor training
def build_big_batches_one_hop_nb(data, one_hop, s_t_r,
                      x_vec_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity):
    """Fill the one-hop training lists with one positive and one negative sample per triple.

    For every triple ``(s, r, t)`` the multi-hot vectors of the out-going
    relations of ``s`` and of ``t`` are built. They are appended once with the
    true relation ``r`` (label 1.) and once with a randomly chosen relation
    that is not known to hold between ``s`` and ``t`` (label 0.).

    Args:
        data: iterable of ``(source id, relation id, target id)`` triples.
        one_hop: dict ``entity -> {relation -> set of neighbours}``; only its
            relation keys are read.
        s_t_r: mapping ``(source, target) -> set of relation ids``; used to keep
            true relations out of the negative samples.
        x_vec_list: dict with keys ``'s'`` and ``'t'``; extended in place.
        x_r_list: list receiving ``[relation_id]`` per sample; extended in place.
        y_list: list receiving the labels; extended in place.
        relation2id, entity2id, id2entity: accepted for signature compatibility; not read.
        id2relation: dict ``relation id -> name``; ids must be ``0..len-1``.

    Raises:
        ValueError: if the ids in ``id2relation`` are not contiguous from 0.

    Note:
        A triple whose entity pair is linked by every relation has no valid
        negative and is skipped entirely, so the two classes stay balanced.
    """
    #the set of all relation IDs
    relation_id_set = set()
    for i in range(len(id2relation)):

        if i not in id2relation:
            raise ValueError('error when generaing id2relation')

        relation_id_set.add(i)

    num_r = len(id2relation)

    #visit the triples in random order; the stdlib shuffle uses the same random
    #generator as the negative sampling below, so one seed controls both
    triples = list(data)
    random.shuffle(triples)

    for s, r, t in triples:

        #build the one-hop vector for source and target entity
        s_vec = [0.] * num_r
        t_vec = [0.] * num_r

        #the loop variable must not be called r: that would overwrite the
        #relation of the triple, which is the label of the positive sample
        for out_r in one_hop[s]:
            s_vec[out_r] = 1.
        for out_r in one_hop[t]:
            t_vec[out_r] = 1.

        #candidates for the negative sample: every relation that is neither r
        #nor any other relation known to hold from s to t
        known = s_t_r.get((s, t), ())
        negatives = list(relation_id_set.difference(known, {r}))

        if not negatives:
            continue

        #positive sampling step
        x_vec_list['s'].append(s_vec)
        x_vec_list['t'].append(t_vec)
        x_r_list.append([r])
        y_list.append(1.)

        #negative sampling step: same vectors (copied), different relation
        x_vec_list['s'].append(list(s_vec))
        x_vec_list['t'].append(list(t_vec))
        x_r_list.append([random.choice(negatives)])
        y_list.append(0.)

    print('one-hop big-batch built, start training')

### Start Training: load the KG and call classes

Here, we use the validation set to see the training efficiency. That is, we use the validation to check whether the true relation between entities can be predicted by paths.

The trick is: in validation, we have to use the same relation ID and entity ID as in the training. But we don't want to use the links in training anymore. That is, in validation, we want to use (and update if necessary) entity2id, id2entity, relation2id and id2relation. But we want to use new one_hop, data, data_ and s_t_r for validation set. Then, path-finding will also be based on new one_hop.


In [15]:
# Display the file stem used when the path-based model is saved.
model_name

'Model_main_9_fb237_v4'

In [16]:
# Display the file stem used when the one-hop model is saved.
one_hop_model_name

'One_hop_model_main_9_fb237_v4'

In [17]:
# Display the file stem used when the graph and ID dictionaries are pickled.
ids_name

'IDs_main_9_fb237_v4'

In [18]:
#first, we save the relation and ids
Dict = dict()
Dict['one_hop'] = one_hop
Dict['data'] = data
Dict['s_t_r'] = s_t_r
Dict['entity2id'] = entity2id
Dict['id2entity'] = id2entity
Dict['relation2id'] = relation2id
Dict['id2relation'] = id2relation

with open('../weight_bin/' + ids_name + '.pickle', 'wb') as handle:
    pickle.dump(Dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
###train the path-based model
holder_len = 500000
lower_bd = 1
upper_bd = 10
num_epoch = 10
batch_size = 32

#90% to be train, 10% to be validation
train_len = 9*int(holder_len/10)
    
######################################
###pre-define the lists###############

#define the lists
x_p_list, x_r_list, y_list = {'1': [], '2': [], '3': []}, list(), list()

#######################################
###build the big-batches###############      

#fill in the training array list
build_big_batches(holder_len, lower_bd, upper_bd, Class_2, one_hop, s_t_r,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity)

#shuffle the lists before train_test_plit
x_p_list['1'], x_p_list['2'], x_p_list['3'], x_r_list, y_list = shuffle(
               x_p_list['1'], x_p_list['2'], x_p_list['3'], x_r_list, y_list)

#######################################
###do the training#####################

#generate the input arrays
x_train_1 = np.asarray(x_p_list['1'][:train_len], dtype='int')
x_train_2 = np.asarray(x_p_list['2'][:train_len], dtype='int')
x_train_3 = np.asarray(x_p_list['3'][:train_len], dtype='int')
x_train_r = np.asarray(x_r_list[:train_len], dtype='int')
y_train = np.asarray(y_list[:train_len], dtype='int')

#generate the validation arrays
x_valid_1 = np.asarray(x_p_list['1'][train_len:], dtype='int')
x_valid_2 = np.asarray(x_p_list['2'][train_len:], dtype='int')
x_valid_3 = np.asarray(x_p_list['3'][train_len:], dtype='int')
x_valid_r = np.asarray(x_r_list[train_len:], dtype='int')
y_valid = np.asarray(y_list[train_len:], dtype='int')

model.fit([x_train_1, x_train_2, x_train_3, x_train_r], y_train, 
          validation_data=([x_valid_1, x_valid_2, x_valid_3, x_valid_r], y_valid),
          batch_size=batch_size, epochs=num_epoch)

# Save model and weights
add_h5 = model_name + '.h5'
save_dir = os.path.join(os.getcwd(), '../weight_bin')

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, add_h5)
model.save(model_path)
print('Save model')
del(model)

del(x_train_1, x_train_2, x_train_3, x_train_r, y_train)
del(x_valid_1, x_valid_2, x_valid_3, x_valid_r, y_valid)

del(x_p_list, x_r_list, y_list)

generating big-batches 20000 500000
generating big-batches 40000 500000
generating big-batches 60000 500000
generating big-batches 80000 500000
generating big-batches 100000 500000
generating big-batches 120000 500000
generating big-batches 140000 500000
generating big-batches 160000 500000
generating big-batches 180000 500000
generating big-batches 200000 500000
generating big-batches 220000 500000
generating big-batches 240000 500000
generating big-batches 260000 500000
generating big-batches 280000 500000
generating big-batches 300000 500000
generating big-batches 320000 500000
generating big-batches 340000 500000
generating big-batches 360000 500000
generating big-batches 380000 500000
generating big-batches 400000 500000
generating big-batches 420000 500000
generating big-batches 440000 500000
generating big-batches 460000 500000
generating big-batches 480000 500000
generating big-batches 500000 500000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Ep

In [53]:
###train the one-hop neighbor based model
num_epoch = 20
batch_size = 32

#we put model.fit into iteration
for epoch in range(num_epoch):
    
    #define the lists
    x_vec_list, x_r_list, y_list = {'s': [], 't': []}, list(), list()

    #build the big-batch for one-hop neighbor
    build_big_batches_one_hop_nb(data, one_hop, s_t_r,
                          x_vec_list, x_r_list, y_list,
                          relation2id, entity2id, id2relation, id2entity)

    #shuffle the lists before train_test_plit
    x_vec_list['s'], x_vec_list['t'], x_r_list, y_list = shuffle(
                     x_vec_list['s'], x_vec_list['t'], x_r_list, y_list)

    #######################################
    ###do the training#####################

    #90% to be train, 10% to be validation
    train_len = 9*int(len(y_list)/10)

    #generate the input arrays
    x_train_s = np.asarray(x_vec_list['s'][:train_len], dtype='int')
    x_train_t = np.asarray(x_vec_list['t'][:train_len], dtype='int')
    x_train_r = np.asarray(x_r_list[:train_len], dtype='int')
    y_train = np.asarray(y_list[:train_len], dtype='int')

    #generate the validation arrays
    x_valid_s = np.asarray(x_vec_list['s'][train_len:], dtype='int')
    x_valid_t = np.asarray(x_vec_list['t'][train_len:], dtype='int')
    x_valid_r = np.asarray(x_r_list[train_len:], dtype='int')
    y_valid = np.asarray(y_list[train_len:], dtype='int')

    model_2.fit([x_train_s, x_train_t, x_train_r], y_train, 
              validation_data=([x_valid_s, x_valid_t, x_valid_r], y_valid),
              batch_size=batch_size, epochs=epoch+1, initial_epoch=epoch)
    
# Save model and weights
one_hop_add_h5 = one_hop_model_name + '.h5'
one_hop_save_dir = os.path.join(os.getcwd(), '../weight_bin')

if not os.path.isdir(one_hop_save_dir):
    os.makedirs(one_hop_save_dir)
one_hop_model_path = os.path.join(one_hop_save_dir, one_hop_add_h5)
model_2.save(one_hop_model_path)
print('Save model')
del(model_2)

del(x_train_s, x_train_t, x_train_r, y_train)
del(x_valid_s, x_valid_t, x_valid_r, y_valid)

del(x_vec_list, x_r_list, y_list)

one-hop big-batch built, start training
one-hop big-batch built, start training
Epoch 2/2
one-hop big-batch built, start training
Epoch 3/3
one-hop big-batch built, start training
Epoch 4/4
one-hop big-batch built, start training
Epoch 5/5
one-hop big-batch built, start training
Epoch 6/6
one-hop big-batch built, start training
Epoch 7/7
one-hop big-batch built, start training
Epoch 8/8
one-hop big-batch built, start training
Epoch 9/9
one-hop big-batch built, start training
Epoch 10/10
one-hop big-batch built, start training
Epoch 11/11
one-hop big-batch built, start training
Epoch 12/12
one-hop big-batch built, start training
Epoch 13/13
one-hop big-batch built, start training
Epoch 14/14
one-hop big-batch built, start training
Epoch 15/15
one-hop big-batch built, start training
Epoch 16/16
one-hop big-batch built, start training
Epoch 17/17
one-hop big-batch built, start training
Epoch 18/18
one-hop big-batch built, start training
Epoch 19/19
one-hop big-batch built, start training


### Result on the testset for inductive link prediction

We use the testset for inductive link prediction.

In [1]:
# Dataset folder name; part of every artifact name built from it.
data_name = 'fb237_v4'
# Tag of the model variant; part of every artifact name built from it.
model_id = 'main_9'

In [3]:
#define the names under which the models and the ID dictionaries were saved
model_name = 'Model_' + model_id + '_' + data_name
one_hop_model_name = 'One_hop_model_' + model_id + '_' + data_name
ids_name = 'IDs_' + model_id + '_' + data_name

In [4]:
ids_name

'IDs_main_9_fb237_v4'

In [5]:
one_hop_model_name

'One_hop_model_main_9_fb237_v4'

In [6]:
model_name

'Model_main_9_fb237_v4'

In [7]:
import librosa
import opensmile
import os
import sys
import numpy as np
import random
from collections import defaultdict
from copy import deepcopy
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.utils import plot_model

In [8]:
class LoadKG:
    """Read knowledge-graph triple files and index them with integer IDs.

    The class keeps no state that its methods use: every mapping and container is
    passed in by the caller and updated in place, so several files can be loaded
    one after another into the same entity/relation vocabularies.
    """

    def __init__(self):

        #placeholder attribute; no method of this class reads it
        self.x = 'Hello'

    def load_train_data(self, data_path, one_hop, data, s_t_r, entity2id, id2entity,
                     relation2id, id2relation):
        """Load one triple file and merge it into the supplied containers.

        Every non-blank line is split on whitespace and read as
        ``source relation target`` (fields 0, 1 and 2). Blank or whitespace-only
        lines are skipped.

        Args:
            data_path: path of the text file to read.
            one_hop: dict updated in place; ``one_hop[e][r]`` is the set of
                entity ids reached from entity ``e`` through relation id ``r``
                (both the forward and the inverse direction are stored).
            data: set updated in place with the forward ``(s, r, t)`` id triples.
            s_t_r: mapping updated in place; ``s_t_r[(s, t)]`` is the set of
                relation ids linking ``s`` to ``t``. It is indexed without an
                existence check, so it must create missing entries itself
                (e.g. ``defaultdict(set)``).
            entity2id, id2entity: entity name <-> id dicts, extended in place
                with any entity not seen before.
            relation2id, id2relation: relation name <-> id dicts, extended in
                place. Each new relation gets two consecutive ids: the relation
                itself and its ``'_inverse_'``-prefixed counterpart.

        Returns:
            None. All results are written into the arguments.

        Raises:
            IndexError: if a non-blank line has fewer than three fields.
        """

        data_ = set()

        ####load the triples of the file##########
        with open(data_path, 'r') as f:

            for line in f:

                fields = line.split()

                #a blank line would otherwise become an empty tuple and crash
                #below when its relation field is read
                if not fields:

                    continue

                data_.add(tuple(fields))

        ####relation dict#################
        index = len(relation2id)

        for key in data_:

            relation = key[1]

            if relation not in relation2id:

                relation2id[relation] = index
                id2relation[index] = relation

                #the inverse relation always takes the next id
                iv_r = '_inverse_' + relation

                relation2id[iv_r] = index + 1
                id2relation[index + 1] = iv_r

                index += 2

        #get the id of the inverse relation. With ids handed out in pairs as above
        #(and relation2id starting with an even number of entries), an initial
        #relation has an even id and its inverse the following odd id.
        def inverse_r(r):

            return r + 1 if r % 2 == 0 else r - 1

        ####entity dict###################
        index = len(entity2id)

        for key in data_:

            #source first, then target, so ids are assigned in the same order
            for entity in (key[0], key[2]):

                if entity not in entity2id:

                    entity2id[entity] = index
                    id2entity[index] = entity

                    index += 1

        #create the set of triples using id instead of string
        for ele in data_:

            s = entity2id[ele[0]]
            r = relation2id[ele[1]]
            t = entity2id[ele[2]]

            data.add((s, r, t))

            #forward direction: s --r--> t
            s_t_r[(s, t)].add(r)
            one_hop.setdefault(s, dict()).setdefault(r, set()).add(t)

            #inverse direction: t --r_inv--> s
            r_inv = inverse_r(r)

            s_t_r[(t, s)].add(r_inv)
            one_hop.setdefault(t, dict()).setdefault(r_inv, set()).add(s)

In [9]:
class ObtainPathsByDynamicProgramming:
    """Enumerate relation paths that leave a source entity.

    The search is a recursive depth-first walk over ``one_hop`` that never
    revisits an entity already on the current path. Neighbours are visited in
    random order, and two limits keep the search bounded: ``size_bd`` and
    ``threshold`` (see ``__init__``).

    The recursion follows the same idea as LeetCode Problem 797:
        https://leetcode.com/problems/all-paths-from-source-to-target/
    """

    def __init__(self, size_bd=50, threshold=500000):

        #upper limit on the number of paths stored for one target entity t
        self.size_bd = size_bd

        #upper limit on how many neighbour expansions may be attempted
        #from paths of one given length
        self.threshold = threshold

    def obtain_paths(self, mode, s, t_input, lower_bd, upper_bd, one_hop):
        """Collect paths (tuples of relation ids) starting at entity ``s``.

        Args:
            mode: 'direct_neighbour' (only direct neighbours of s are recorded as
                targets), 'target_specified' (only ``t_input`` is recorded) or
                'any_target' (every key of ``one_hop`` may be recorded).
            s: source entity id; must be a key of ``one_hop``.
            t_input: target entity id, only used in 'target_specified' mode.
            lower_bd, upper_bd: inclusive bounds on the recorded path length;
                both must be plain ``int`` >= 1 with lower_bd <= upper_bd.
            one_hop: dict with one_hop[e][r] = set of entities reached from e via r.

        Returns:
            (res, count_dict): ``res`` maps a target entity to the set of paths
            from s to it; ``count_dict`` maps a path length to the number of
            (relation, neighbour) pairs examined from paths of that length.

        Raises:
            TypeError: on invalid bounds.
            ValueError: if ``s`` is not in ``one_hop`` or ``mode`` is unknown.
        """

        if type(lower_bd) is not int or lower_bd < 1:
            raise TypeError("!!! invalid lower bound setting, must >= 1 !!!")

        if type(upper_bd) is not int or upper_bd < 1:
            raise TypeError("!!! invalid upper bound setting, must >= 1 !!!")

        if lower_bd > upper_bd:
            raise TypeError("!!! lower bound must not exced upper bound !!!")

        if s not in one_hop:
            raise ValueError('!!! entity not in one_hop. Please work on existing entities')

        #result: target entity -> set of paths (direct relation or multi-hop) from s
        res = defaultdict(set)

        #qualified_t: the entities that are allowed to appear as keys of res
        if mode == 'direct_neighbour':
            qualified_t = {t for r in one_hop[s] for t in one_hop[s][r]}
        elif mode == 'target_specified':
            qualified_t = {t_input}
        elif mode == 'any_target':
            qualified_t = set(one_hop)
        else:
            raise ValueError('not a valid mode')

        count_dict = defaultdict(int)

        #Recursive step. We stand on `node`, reached from s through the relations
        #in `path`; `visited` holds s, the intermediate entities and `node` itself.
        #Each direct neighbour that is not yet on the path is explored in turn.
        def expand(node, path, visited):

            depth = len(path)

            #record the path when its length is within bounds, the node is a
            #qualified target, and the node has not yet reached size_bd paths
            if (lower_bd <= depth <= upper_bd) and (node in qualified_t) and (
                    len(res[node]) < self.size_bd):
                res[node].add(tuple(path))

            #stop when the path is already as long as allowed, or when the
            #expansion budget for this length is exhausted
            if depth >= upper_bd or count_dict[depth] > self.threshold:
                return

            #every (relation, neighbour) pair directly connected to this node,
            #visited in random order
            candidates = list({(r, t) for r in one_hop[node] for t in one_hop[node][r]})
            random.shuffle(candidates)

            for r, t in candidates:

                count_dict[depth] += 1

                #recurse only onto entities that are not already on the path
                if (t not in visited) and (count_dict[depth] <= self.threshold):
                    expand(t, path + [r], visited | {t})

        expand(s, [], {s})

        return(res, count_dict)

In [10]:
#load the classes
Class_1 = LoadKG()
Class_2 = ObtainPathsByDynamicProgramming()

In [11]:
#load the graph structures and the relation/entity dicts saved under ids_name
#NOTE: pickle.load can execute arbitrary code; only open pickle files you trust
with open('../weight_bin/' + ids_name + '.pickle', 'rb') as handle:
    Dict = pickle.load(handle)

one_hop = Dict['one_hop']
data = Dict['data']
s_t_r = Dict['s_t_r']
entity2id = Dict['entity2id']
id2entity = Dict['id2entity']
relation2id = Dict['relation2id']
id2relation = Dict['id2relation']

#we want to keep the initial entity/relation dicts:
#the loading cells below extend entity2id/id2entity in place
entity2id_ini = deepcopy(entity2id)
id2entity_ini = deepcopy(id2entity)
relation2id_ini = deepcopy(relation2id)
id2relation_ini = deepcopy(id2relation)

#number of relation ids; path_based_relation_scoring reads this global as padding id
num_r = len(id2relation)
num_r

438

In [12]:
ids_name

'IDs_main_9_fb237_v4'

In [13]:
model_name

'Model_main_9_fb237_v4'

In [14]:
#load the model
model = keras.models.load_model('../weight_bin/' + model_name + '.h5')

2023-02-23 21:34:57.455662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
#load the one-hop neighbor model
model_2 = keras.models.load_model('../weight_bin/' + one_hop_model_name + '.h5')

In [16]:
ind_train_path = '../data/' + data_name + '_ind/train.txt'
ind_valid_path = '../data/' + data_name + '_ind/valid.txt'
ind_test_path = '../data/' + data_name + '_ind/test.txt'

In [17]:
#load the training triples of the inductive split (ind_train_path)
one_hop_ind = dict() 
data_ind = set()
s_t_r_ind = defaultdict(set)

#sizes before loading, to detect new relations/entities afterwards
len_0 = len(relation2id)
size_0 = len(entity2id)

#fill in the sets and dicts (entity/relation dicts are extended in place)
Class_1.load_train_data(ind_train_path, 
                        one_hop_ind, data_ind, s_t_r_ind,
                        entity2id, id2entity, relation2id, id2relation)

len_1 = len(relation2id)
size_1 = len(entity2id)

#the relation dict must not have grown while loading this file
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [18]:
print(size_0, size_1, len(data_ind))

4707 7758 11714


In [19]:
#load the test triples of the inductive split (ind_test_path)
one_hop_test = dict() 
data_test = set()
s_t_r_test = defaultdict(set)

#sizes before loading, to detect new relations/entities afterwards
len_0 = len(relation2id)
size_0 = len(entity2id)

#fill in the sets and dicts (entity/relation dicts are extended in place)
Class_1.load_train_data(ind_test_path, 
                        one_hop_test, data_test, s_t_r_test,
                        entity2id, id2entity, relation2id, id2relation)


len_1 = len(relation2id)
size_1 = len(entity2id)

#the relation dict must not have grown while loading this file
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [20]:
print(size_0, size_1, len(data_test))

7758 7758 1424


In [21]:
#load the validation triples of the inductive split (ind_valid_path);
#they are used to filter out existing triples when ranking
one_hop_valid = dict() 
data_valid = set()
s_t_r_valid = defaultdict(set)

#sizes before loading, to detect new relations/entities afterwards
len_0 = len(relation2id)
size_0 = len(entity2id)

#fill in the sets and dicts (entity/relation dicts are extended in place)
Class_1.load_train_data(ind_valid_path, 
                        one_hop_valid, data_valid, s_t_r_valid,
                        entity2id, id2entity, relation2id, id2relation)

len_1 = len(relation2id)
size_1 = len(entity2id)

#the relation dict must not have grown while loading this file
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [22]:
print(size_0, size_1, len(data_valid))

7758 7758 1416


In [23]:
print(len(entity2id), len(entity2id_ini))

7758 4707


In [24]:
valid_path = '../data/' + data_name + '/valid.txt'
test_path = '../data/' + data_name + '/test.txt'

In [25]:
#load the test triples of the original (non-inductive) dataset (test_path);
#they are used to filter out existing triples when ranking
one_hop_train_test = dict() 
data_train_test = set()
s_t_r_train_test = defaultdict(set)

#sizes before loading, to detect new relations/entities afterwards
len_0 = len(relation2id)
size_0 = len(entity2id)

#fill in the sets and dicts (entity/relation dicts are extended in place)
Class_1.load_train_data(test_path, 
                        one_hop_train_test, data_train_test, s_t_r_train_test,
                        entity2id, id2entity, relation2id, id2relation)


len_1 = len(relation2id)
size_1 = len(entity2id)

#the relation dict must not have grown while loading this file
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [26]:
#load the validation triples of the original (non-inductive) dataset (valid_path);
#they are used to filter out existing triples when ranking
one_hop_train_valid = dict() 
data_train_valid = set()
s_t_r_train_valid = defaultdict(set)

#sizes before loading, to detect new relations/entities afterwards
len_0 = len(relation2id)
size_0 = len(entity2id)

#fill in the sets and dicts (entity/relation dicts are extended in place)
Class_1.load_train_data(valid_path, 
                        one_hop_train_valid, data_train_valid, s_t_r_train_valid,
                        entity2id, id2entity, relation2id, id2relation)

len_1 = len(relation2id)
size_1 = len(entity2id)

#the relation dict must not have grown while loading this file
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [27]:
#count the inductive test triples that touch an entity known from training:
#data_test holds id triples, and id2entity_ini is keyed by the entity ids
#that existed before the inductive files were loaded
overlapping = 0

for ele in data_test:
    
    s, r, t = ele[0], ele[1], ele[2]
    
    if s in id2entity_ini or t in id2entity_ini:
        
        overlapping += 1
        
overlapping

0

In [28]:
#same check for the inductive validation triples
overlapping = 0

for ele in data_valid:
    
    s, r, t = ele[0], ele[1], ele[2]
    
    #id2entity_ini is keyed by the entity ids known before the inductive files were loaded
    if s in id2entity_ini or t in id2entity_ini:
        
        overlapping += 1
        
overlapping

0

In [29]:
#same check for the inductive training triples (data_ind): count the triples
#that touch an entity id already present in the initial entity dict
overlapping = 0

for ele in data_ind:
    
    s, r, t = ele[0], ele[1], ele[2]
    
    if s in id2entity_ini or t in id2entity_ini:
        
        overlapping += 1
        
overlapping

0

**Fine tune if necessary**

In [32]:
#function to build one big-batch of (three paths, relation, label) samples
def build_big_batches(holder_len, lower_bd, upper_bd, Class_2, one_hop, s_t_r,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity):
    """Fill the sample lists in place until y_list holds at least holder_len labels.

    Repeatedly picks a random source entity, enumerates paths to its direct
    neighbours with Class_2.obtain_paths, and for sampled triples of paths
    between (source, target) appends one positive sample (a relation that links
    the pair in s_t_r, label 1.) and one negative sample (a relation that does
    not, label 0.).

    Args:
        holder_len: target number of samples; must be a multiple of 10.
        lower_bd, upper_bd: path-length bounds passed on to obtain_paths;
            upper_bd is also the length every path is padded to.
        Class_2: object providing
            obtain_paths(mode, s, t_input, lower_bd, upper_bd, one_hop).
        one_hop: one_hop[e][r] = set of neighbours; its keys are the candidate sources.
        s_t_r: s_t_r[(s, t)] = set of relation ids directly linking s to t.
        x_p_list: dict with keys '1', '2', '3', each a list that receives padded paths.
        x_r_list: list that receives [relation_id] per sample.
        y_list: list that receives the labels.
        id2relation: relation id -> name; its keys must be exactly 0..len-1.
        relation2id, entity2id, id2entity: not used by this function.

    Returns:
        None. Results are appended to x_p_list, x_r_list and y_list.

    Raises:
        ValueError: if holder_len is not a multiple of 10, or id2relation has a gap.

    NOTE: the outer loop only ends once len(y_list) >= holder_len, so it does not
    terminate when no (source, target) pair ever qualifies.
    """
    
    if holder_len % 10 != 0:
        raise ValueError('We would like to take 10X as a big-batch size')
    
    #the set of all relation IDs
    relation_id_set = set()
    for i in range(len(id2relation)):
        
        if i not in id2relation:
            raise ValueError('error when generaing id2relation')
        
        relation_id_set.add(i)
    
    #num_r is one past the largest relation id and serves as the padding id
    num_r = len(id2relation)
    
    #count how many samples have been appended
    count = 0

    #not every entity in entity2id needs to be in one_hop,
    #so the candidate sources are taken from the keys of one_hop
    existing_ids = set()
    
    for s_1 in one_hop:
        existing_ids.add(s_1)
        
    existing_ids = list(existing_ids)
    
    carry_on = True
    
    while carry_on:

        #pick a random source and enumerate the paths to its direct neighbours
        source_id = random.choice(existing_ids)

        #the second return value is not used here
        result, length_dict = Class_2.obtain_paths('direct_neighbour', source_id, 
                                                   'not_specified', lower_bd, upper_bd, one_hop)
        
        #We want to increase the diversity of paths and targets.
        #So we abandon one sub-graph from a source_id, if we sampled more than K1 path triples
        #Note that we mean "sampled", not "appended"! 
        #We do not care whether the triple is actually appended.
        threshold_0 = 1000
        count_0 = 0
        
        for target_id in result:

            if (not carry_on) or (count_0 > threshold_0):
                break
            
            #we want to make sure s, t are indeed directly connected, 
            #otherwise there is no relation for positive sample
            #also, we want to make sure s and t are not connected by all relations, 
            #although this situation is rare. 
            #But in that case, there is no relation for negative samples
            #Also, we want at least three different paths here between s and t
            if ((source_id, target_id) in s_t_r) and (
                len(s_t_r[(source_id, target_id)]) < len(id2relation)) and (
                len(result[target_id]) >= 3):
                
                #relations that do / do not directly link source and target
                dir_r = list(s_t_r[(source_id, target_id)])
                
                non_dir_r = list(relation_id_set.difference(dir_r))
                
                if len(dir_r) <= 0:
                    
                    raise ValueError('errors when creating s_t_r !!')
                    
                temp_path_list = list(result[target_id])
                    
                #furthermore, we will abandon one target_id if we sampled more than K2 times
                threshold_1 = 50
                count_1 = 0
                
                while count_1 <= threshold_1 and count_0 <= threshold_0:
                
                    #draw three paths without replacement
                    temp_pair = random.sample(temp_path_list, 3)
                    
                    path_1, path_2, path_3 = temp_pair[0], temp_pair[1], temp_pair[2]                         

                    #proceed when the entire length is not yet reached
                    #and the three paths are pairwise different
                    '''remember to cancel the comment below when using path_comb'''
                    if (carry_on) and (path_1 != path_2) and (path_2 != path_3) and (
                        path_1 != path_3):

                        #####positive#####################
                        #we randomly choose one direct relation as the target relation
                        relation_id = random.choice(dir_r)

                        #append the paths: note that we add the space holder id (num_r)
                        #at the end of every path shorter than upper_bd
                        x_p_list['1'].append(list(path_1) + [num_r]*abs(len(path_1)-upper_bd))
                        x_p_list['2'].append(list(path_2) + [num_r]*abs(len(path_2)-upper_bd))
                        x_p_list['3'].append(list(path_3) + [num_r]*abs(len(path_3)-upper_bd))

                        #append relation
                        x_r_list.append([relation_id])
                        y_list.append(1.)

                        #####negative#####################
                        #same three paths, but with a relation that does not link s and t
                        relation_id = random.choice(non_dir_r)

                        #append the paths: note that we add the space holder id (num_r)
                        #at the end of every path shorter than upper_bd
                        x_p_list['1'].append(list(path_1) + [num_r]*abs(len(path_1)-upper_bd))
                        x_p_list['2'].append(list(path_2) + [num_r]*abs(len(path_2)-upper_bd))
                        x_p_list['3'].append(list(path_3) + [num_r]*abs(len(path_3)-upper_bd))

                        #append relation
                        x_r_list.append([relation_id])
                        y_list.append(0.)

                        ######add to path combinations#####
                        #disabled bookkeeping of already-used path combinations.
                        #A combination has no order but the dict key does, so both
                        #(path_s, path_l) and (path_l, path_s) would have to be added.
                        '''remember to cancel the comment here when using path_comb'''
                        #path_comb[(len(path_s), len(path_l))].add((path_s, path_l))
                        #path_comb[(len(path_s), len(path_l))].add((path_l, path_s))

                        count += 2

                        if count % 2000 == 0:
                            print('generating big-batches', count, holder_len)

                    #stop everything once enough samples have been collected
                    if len(y_list) >= holder_len:

                        carry_on = False
                        
                    #both counters count sampling attempts, not appended samples
                    count_1 += 1
                    count_0 += 1

In [None]:
fine_tune = 'No'

In [34]:
#define the lists that build_big_batches fills in place
x_p_list, x_r_list, y_list = {'1': [], '2': [], '3': []}, list(), list()

#######################################
###build the big-batches###############      

#sample 20000 path-based examples (path length 1..10) from the inductive training graph
build_big_batches(20000, 1, 10, Class_2, one_hop_ind, s_t_r_ind,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity)

x_test_1 = np.asarray(x_p_list['1'], dtype='int')
x_test_2 = np.asarray(x_p_list['2'], dtype='int')
x_test_3 = np.asarray(x_p_list['3'], dtype='int')
x_test_r = np.asarray(x_r_list, dtype='int')
y_test = np.asarray(y_list, dtype='int')

#either fine-tune the path model on these examples, or only evaluate it on them
if fine_tune == 'Yes':

    model.fit([x_test_1, x_test_2, x_test_3, x_test_r], y_test, batch_size=4, epochs=2)
    
else:
    
    model.evaluate([x_test_1, x_test_2, x_test_3, x_test_r], y_test, batch_size=4)

generating big-batches 2000 20000
generating big-batches 4000 20000
generating big-batches 6000 20000
generating big-batches 8000 20000
generating big-batches 10000 20000
generating big-batches 12000 20000
generating big-batches 14000 20000
generating big-batches 16000 20000
generating big-batches 18000 20000
generating big-batches 20000 20000


In [35]:
#the function to do path-based relation scoring
def path_based_relation_scoring(s, t, lower_bd, upper_bd, one_hop, id2relation, model):
    """Score every relation for the pair (s, t) from the paths that connect them.

    Paths from s to t are collected with two runs of the notebook-level
    ``Class_2.obtain_paths`` (neighbour order is random, so two runs can find
    different paths). If at least three distinct paths exist, 21 rounds are run;
    each round samples three paths, pairs them with every relation id and adds
    the model prediction to that relation's score.

    Args:
        s, t: source and target entity ids; s must be a key of ``one_hop``.
        lower_bd, upper_bd: path-length bounds; paths are padded to ``upper_bd``.
        one_hop: one_hop[e][r] = set of neighbours of e via relation r.
        id2relation: relation id -> name; its keys must be exactly 0..len-1.
        model: object whose ``predict([paths_1, paths_2, paths_3, relations],
            verbose=0)`` returns one score per input row (assumed to be a single
            value per row, as the accumulation below requires).

    Returns:
        defaultdict(float) mapping relation id -> summed score. It is empty when
        fewer than three distinct paths were found.

    Raises:
        ValueError: if id2relation has a gap in its ids (only checked when
            scoring actually takes place).
    """

    num_rel = len(id2relation)

    #padding id for positions past the end of a path. Taken from the id2relation
    #argument instead of the notebook-level num_r, so the function no longer
    #depends on a global that has to be kept in sync.
    pad_id = num_rel

    path_holder = set()

    for iteration in range(2):

        result, length_dict = Class_2.obtain_paths('target_specified',
                                                   s, t, lower_bd, upper_bd, one_hop)
        if t in result:

            path_holder.update(result[t])

    path_holder = list(path_holder)
    random.shuffle(path_holder)

    score_dict = defaultdict(float)

    if len(path_holder) >= 3:

        for i in range(num_rel):

            if i not in id2relation:

                raise ValueError ('error when generating id2relation')

        #one row per candidate relation; identical in every round, so built once
        input_r = np.array([[i] for i in range(num_rel)])

        def padded(path):
            #fill the path up to upper_bd with the padding id
            return list(path) + [pad_id]*abs(len(path)-upper_bd)

        #21 sampling rounds (the original counter ran from 0 to 20 inclusive)
        for count in range(21):

            path_1, path_2, path_3 = random.sample(path_holder, 3)

            #the same three paths are repeated for every candidate relation
            input_1 = np.array([padded(path_1)] * num_rel)
            input_2 = np.array([padded(path_2)] * num_rel)
            input_3 = np.array([padded(path_3)] * num_rel)

            pred = model.predict([input_1, input_2, input_3, input_r], verbose = 0)

            for i in range(pred.shape[0]):

                #.item() extracts the single value explicitly; float() on a
                #1-element array is a deprecated implicit conversion in NumPy
                score_dict[i] += float(np.asarray(pred[i]).item())

    print(len(score_dict), len(path_holder))

    return(score_dict)

In [36]:
#one hop neighbor based relation scoring
def one_hop_nb_relation_scoring(s, t, one_hop, id2relation, model_2):
    """Score every relation for the pair (s, t) from their one-hop relation sets.

    Each entity is described by a 0/1 vector over all relation ids that marks the
    relations leaving that entity in ``one_hop``. The two vectors are paired with
    every candidate relation id and passed to ``model_2`` in one batch.

    Args:
        s, t: source and target entity ids; both must be keys of ``one_hop``.
        one_hop: one_hop[e] is iterated to obtain the relation ids leaving e.
        id2relation: relation id -> name; its keys must be exactly 0..len-1.
        model_2: object whose ``predict([source_vecs, target_vecs, relations],
            verbose=0)`` returns one score per input row (assumed to be a single
            value per row, as the conversion below requires).

    Returns:
        defaultdict(float) mapping relation id -> predicted score.

    Raises:
        ValueError: if id2relation has a gap in its ids.
    """

    num_rel = len(id2relation)

    score_dict = defaultdict(float)

    #build the one-hop vector for source and target entity
    s_vec = [0.]*num_rel
    t_vec = [0.]*num_rel

    for r in one_hop[s]:
        s_vec[r] = 1.
    for r in one_hop[t]:
        t_vec[r] = 1.

    for i in range(num_rel):

        if i not in id2relation:

            raise ValueError ('error when generating id2relation')

    #one input row per candidate relation; the entity vectors are the same in every row
    input_s = np.array([s_vec] * num_rel)
    input_t = np.array([t_vec] * num_rel)
    input_r = np.array([[i] for i in range(num_rel)])

    #run the model
    pred = model_2.predict([input_s, input_t, input_r], verbose = 0)

    for i in range(pred.shape[0]):

        #.item() extracts the single value explicitly; float() on a
        #1-element array is a deprecated implicit conversion in NumPy
        score_dict[i] += float(np.asarray(pred[i]).item())

    return(score_dict)

#### Not fine tuned 

In [37]:
########################################################
#relation-ranking metrics on the inductive test triples:
#filtered Hits@1 / Hits@3 / Hits@10 and MRR

#evaluate on at most 500 randomly chosen test triples
selected = random.sample(list(data_test), min(len(data_test), 500))

random.shuffle(selected)

###running totals#############################
#for every selected triple all relations are ranked for its (s, t) pair;
#the counters below are divided by the number of processed triples when printed
Hits_at_1 = 0
Hits_at_3 = 0
Hits_at_10 = 0
MRR_raw = 0.

for i in range(len(selected)):
    
    s_true, r_true, t_true = selected[i][0], selected[i][1], selected[i][2]
    
    #first run the path-based scoring
    score_dict = path_based_relation_scoring(s_true, t_true, 1, 10, one_hop_ind, id2relation, model)
    
    #run the one-hop neighbour based scoring when not enough paths
    #(the path-based scorer returns an empty dict in that case)
    if len(score_dict) == 0:
        del(score_dict)
        score_dict = one_hop_nb_relation_scoring(s_true, t_true, one_hop_ind, id2relation, model_2)
    
    #[... [score, r], ...]
    temp_list = list()
    
    for r in id2relation:
        
        if r in score_dict:
            
            temp_list.append([score_dict[r], r])
            
        else:
            
            #relations without a score are ranked with score 0
            temp_list.append([0.0, r])
        
    #highest score first
    sorted_list = sorted(temp_list, key = lambda x: x[0], reverse=True)
    
    #p: number of relations ranked above the true one
    #exist_tri: how many of those form a triple that is already known
    p = 0
    exist_tri = 0
    
    while p < len(sorted_list) and sorted_list[p][1] != r_true:
        
        #moreover, we want to remove existing triples (filtered setting):
        #a higher-ranked relation is not counted as an error when (s, r, t)
        #appears in any of the loaded triple sets
        if ((s_true, sorted_list[p][1], t_true) in data_test) or (
            (s_true, sorted_list[p][1], t_true) in data_valid) or (
            (s_true, sorted_list[p][1], t_true) in data_ind) or (
            (s_true, sorted_list[p][1], t_true) in data) or (
            (s_true, sorted_list[p][1], t_true) in data_train_valid) or (
            (s_true, sorted_list[p][1], t_true) in data_train_test):
            
            exist_tri += 1
            
        p += 1
    
    #p - exist_tri is the filtered rank, counted from 0
    if p - exist_tri == 0:
        
        Hits_at_1 += 1
        
    if p - exist_tri < 3:
        
        Hits_at_3 += 1
        
    if p - exist_tri < 10:
        
        Hits_at_10 += 1
        
    #reciprocal of the 1-based filtered rank
    MRR_raw += 1./float(p - exist_tri + 1.) 
        
    #NOTE(review): sorted_list[p] assumes r_true is a key of id2relation,
    #otherwise p == len(sorted_list) here and the index fails - confirm
    print('checkcorrect', r_true, sorted_list[p][1],
          'real score', sorted_list[p][0],
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'cur_rank', p - exist_tri,
          'abs_cur_rank', p,
          'total_num', i, len(selected))

438 100
checkcorrect 40 40 real score 20.994927287101746 Hits@1 1.0 Hits@3 1.0 Hits@10 1.0 MRR 1.0 cur_rank 0 abs_cur_rank 0 total_num 0 500
438 100
checkcorrect 402 402 real score 20.55150145292282 Hits@1 0.5 Hits@3 0.5 Hits@10 1.0 MRR 0.6 cur_rank 4 abs_cur_rank 4 total_num 1 500
438 100
checkcorrect 58 58 real score 20.276334822177887 Hits@1 0.3333333333333333 Hits@3 0.3333333333333333 Hits@10 1.0 MRR 0.48333333333333334 cur_rank 3 abs_cur_rank 3 total_num 2 500
438 100
checkcorrect 76 76 real score 20.705363750457764 Hits@1 0.25 Hits@3 0.5 Hits@10 1.0 MRR 0.4875 cur_rank 1 abs_cur_rank 1 total_num 3 500
438 28
checkcorrect 330 330 real score 17.890938855707645 Hits@1 0.4 Hits@3 0.6 Hits@10 1.0 MRR 0.5900000000000001 cur_rank 0 abs_cur_rank 0 total_num 4 500
438 100
checkcorrect 40 40 real score 20.994918763637543 Hits@1 0.5 Hits@3 0.6666666666666666 Hits@10 1.0 MRR 0.6583333333333333 cur_rank 0 abs_cur_rank 0 total_num 5 500
438 100
checkcorrect 44 44 real score 20.9811732172966 Hi

438 100
checkcorrect 0 0 real score 20.361797034740448 Hits@1 0.4782608695652174 Hits@3 0.7608695652173914 Hits@10 0.8695652173913043 MRR 0.6312406604444799 cur_rank 1 abs_cur_rank 2 total_num 45 500
438 100
checkcorrect 40 40 real score 20.993821680545807 Hits@1 0.48936170212765956 Hits@3 0.7659574468085106 Hits@10 0.8723404255319149 MRR 0.6390866038392782 cur_rank 0 abs_cur_rank 0 total_num 46 500
438 100
checkcorrect 2 2 real score 20.072351336479187 Hits@1 0.4791666666666667 Hits@3 0.7708333333333334 Hits@10 0.875 MRR 0.6361889662592932 cur_rank 1 abs_cur_rank 1 total_num 47 500
438 100
checkcorrect 58 58 real score 13.934642545878887 Hits@1 0.46938775510204084 Hits@3 0.7551020408163265 Hits@10 0.8775510204081632 MRR 0.6283075587846138 cur_rank 3 abs_cur_rank 3 total_num 48 500
438 100
checkcorrect 2 2 real score 20.98995792865753 Hits@1 0.48 Hits@3 0.76 Hits@10 0.88 MRR 0.6357414076089215 cur_rank 0 abs_cur_rank 0 total_num 49 500
438 100
checkcorrect 92 92 real score 16.867967233

438 100
checkcorrect 0 0 real score 20.916797161102295 Hits@1 0.5113636363636364 Hits@3 0.7840909090909091 Hits@10 0.875 MRR 0.6586606565135027 cur_rank 0 abs_cur_rank 0 total_num 87 500
438 99
checkcorrect 164 164 real score -0.21128455083817244 Hits@1 0.5056179775280899 Hits@3 0.7752808988764045 Hits@10 0.8651685393258427 MRR 0.6513876562870997 cur_rank 87 abs_cur_rank 87 total_num 88 500
438 100
checkcorrect 12 12 real score 20.039285957813263 Hits@1 0.5 Hits@3 0.7777777777777778 Hits@10 0.8666666666666667 MRR 0.649705571217243 cur_rank 1 abs_cur_rank 1 total_num 89 500
438 100
checkcorrect 254 254 real score 20.640793681144714 Hits@1 0.5054945054945055 Hits@3 0.7802197802197802 Hits@10 0.8681318681318682 MRR 0.6535549605445261 cur_rank 0 abs_cur_rank 0 total_num 90 500
438 100
checkcorrect 204 204 real score 20.645303905010223 Hits@1 0.5108695652173914 Hits@3 0.782608695652174 Hits@10 0.8695652173913043 MRR 0.6573206674951291 cur_rank 0 abs_cur_rank 0 total_num 91 500
438 30
checkc

438 100
checkcorrect 44 44 real score 20.981164753437042 Hits@1 0.5769230769230769 Hits@3 0.7923076923076923 Hits@10 0.8769230769230769 MRR 0.6961374922477759 cur_rank 0 abs_cur_rank 0 total_num 129 500
438 100
checkcorrect 44 44 real score 20.981173038482666 Hits@1 0.5801526717557252 Hits@3 0.7938931297709924 Hits@10 0.8778625954198473 MRR 0.6984570533756554 cur_rank 0 abs_cur_rank 0 total_num 130 500
438 100
checkcorrect 160 160 real score 18.85846644639969 Hits@1 0.5757575757575758 Hits@3 0.7954545454545454 Hits@10 0.8787878787878788 MRR 0.695690964587456 cur_rank 2 abs_cur_rank 2 total_num 131 500
438 29
checkcorrect 398 398 real score 18.031884014606476 Hits@1 0.5714285714285714 Hits@3 0.7894736842105263 Hits@10 0.8796992481203008 MRR 0.6923399047033398 cur_rank 3 abs_cur_rank 3 total_num 132 500
438 100
checkcorrect 12 12 real score 18.41745615005493 Hits@1 0.5671641791044776 Hits@3 0.7910447761194029 Hits@10 0.8805970149253731 MRR 0.6896607511856532 cur_rank 2 abs_cur_rank 2 tot

438 100
checkcorrect 424 424 real score 20.67566806077957 Hits@1 0.5614035087719298 Hits@3 0.7953216374269005 Hits@10 0.8830409356725146 MRR 0.6889580091838722 cur_rank 1 abs_cur_rank 1 total_num 170 500
438 100
checkcorrect 276 276 real score -0.2693821704015136 Hits@1 0.5581395348837209 Hits@3 0.7906976744186046 Hits@10 0.877906976744186 MRR 0.6849933826974608 cur_rank 141 abs_cur_rank 141 total_num 171 500
438 100
checkcorrect 56 56 real score 20.94491571187973 Hits@1 0.5549132947976878 Hits@3 0.791907514450867 Hits@10 0.8786127167630058 MRR 0.6839240567859148 cur_rank 1 abs_cur_rank 1 total_num 172 500
438 100
checkcorrect 6 6 real score 20.981012403964996 Hits@1 0.5574712643678161 Hits@3 0.7931034482758621 Hits@10 0.8793103448275862 MRR 0.6857405851951911 cur_rank 0 abs_cur_rank 0 total_num 173 500
438 100
checkcorrect 288 288 real score 20.701012790203094 Hits@1 0.5542857142857143 Hits@3 0.7942857142857143 Hits@10 0.88 MRR 0.6846792104226472 cur_rank 1 abs_cur_rank 1 total_num 17

438 100
checkcorrect 48 48 real score 20.98258763551712 Hits@1 0.5613207547169812 Hits@3 0.7971698113207547 Hits@10 0.8820754716981132 MRR 0.6892371139478521 cur_rank 0 abs_cur_rank 0 total_num 211 500
438 100
checkcorrect 0 0 real score 19.70480865240097 Hits@1 0.5586854460093896 Hits@3 0.7981220657276995 Hits@10 0.8826291079812206 MRR 0.6875662041797089 cur_rank 2 abs_cur_rank 3 total_num 212 500
438 100
checkcorrect 32 32 real score 20.82016831636429 Hits@1 0.5607476635514018 Hits@3 0.7990654205607477 Hits@10 0.883177570093458 MRR 0.689026175188215 cur_rank 0 abs_cur_rank 0 total_num 213 500
438 100
checkcorrect 48 48 real score 20.982596695423126 Hits@1 0.5627906976744186 Hits@3 0.8 Hits@10 0.8837209302325582 MRR 0.6904725650710605 cur_rank 0 abs_cur_rank 0 total_num 214 500
438 98
checkcorrect 324 324 real score 20.188436567783356 Hits@1 0.5601851851851852 Hits@3 0.7962962962962963 Hits@10 0.8842592592592593 MRR 0.6882018587512869 cur_rank 4 abs_cur_rank 4 total_num 215 500
438 10

438 100
checkcorrect 212 212 real score -0.23971296567469835 Hits@1 0.541501976284585 Hits@3 0.7865612648221344 Hits@10 0.8814229249011858 MRR 0.672621776096081 cur_rank 131 abs_cur_rank 132 total_num 252 500
438 100
checkcorrect 10 10 real score 20.97802323102951 Hits@1 0.5433070866141733 Hits@3 0.7874015748031497 Hits@10 0.8818897637795275 MRR 0.673910666741372 cur_rank 0 abs_cur_rank 0 total_num 253 500
438 100
checkcorrect 0 0 real score 20.736046731472015 Hits@1 0.5450980392156862 Hits@3 0.788235294117647 Hits@10 0.8823529411764706 MRR 0.6751894484404254 cur_rank 0 abs_cur_rank 0 total_num 254 500
438 30
checkcorrect 208 208 real score 18.831463634967804 Hits@1 0.546875 Hits@3 0.7890625 Hits@10 0.8828125 MRR 0.676458239657455 cur_rank 0 abs_cur_rank 0 total_num 255 500
438 100
checkcorrect 6 6 real score 20.981008052825928 Hits@1 0.5486381322957199 Hits@3 0.7898832684824902 Hits@10 0.8832684824902723 MRR 0.6777171570128735 cur_rank 0 abs_cur_rank 0 total_num 256 500
438 100
checkc

438 100
checkcorrect 114 114 real score 20.584188222885132 Hits@1 0.5476190476190477 Hits@3 0.7891156462585034 Hits@10 0.8877551020408163 MRR 0.6778602233470841 cur_rank 2 abs_cur_rank 2 total_num 293 500
438 100
checkcorrect 44 44 real score 20.981162011623383 Hits@1 0.5491525423728814 Hits@3 0.7898305084745763 Hits@10 0.888135593220339 MRR 0.6789522225899753 cur_rank 0 abs_cur_rank 0 total_num 294 500
438 92
checkcorrect 32 32 real score 20.869562089443207 Hits@1 0.5506756756756757 Hits@3 0.7905405405405406 Hits@10 0.8885135135135135 MRR 0.6800368434596038 cur_rank 0 abs_cur_rank 0 total_num 295 500
438 100
checkcorrect 6 6 real score 20.981005549430847 Hits@1 0.5521885521885522 Hits@3 0.7912457912457912 Hits@10 0.8888888888888888 MRR 0.6811141604849923 cur_rank 0 abs_cur_rank 0 total_num 296 500
438 100
checkcorrect 44 44 real score 20.98124647140503 Hits@1 0.5536912751677853 Hits@3 0.7919463087248322 Hits@10 0.889261744966443 MRR 0.6821842471947742 cur_rank 0 abs_cur_rank 0 total_n

0 2
checkcorrect 138 138 real score -0.03357440605759621 Hits@1 0.5313432835820896 Hits@3 0.7761194029850746 Hits@10 0.8865671641791045 MRR 0.6643648728301277 cur_rank 140 abs_cur_rank 140 total_num 334 500
438 100
checkcorrect 136 136 real score 20.848798036575317 Hits@1 0.5327380952380952 Hits@3 0.7767857142857143 Hits@10 0.8869047619047619 MRR 0.6653637868990857 cur_rank 0 abs_cur_rank 0 total_num 335 500
438 92
checkcorrect 424 424 real score 20.675136506557465 Hits@1 0.5311572700296736 Hits@3 0.7774480712166172 Hits@10 0.887240356083086 MRR 0.6648730931694148 cur_rank 1 abs_cur_rank 1 total_num 336 500
438 100
checkcorrect 30 30 real score 20.48230242729187 Hits@1 0.5295857988165681 Hits@3 0.7781065088757396 Hits@10 0.8875739644970414 MRR 0.6643853029529373 cur_rank 1 abs_cur_rank 1 total_num 337 500
438 100
checkcorrect 32 32 real score 19.25139832496643 Hits@1 0.528023598820059 Hits@3 0.7787610619469026 Hits@10 0.887905604719764 MRR 0.6634087484702835 cur_rank 2 abs_cur_rank 2 t

438 100
checkcorrect 146 146 real score 20.476574301719666 Hits@1 0.5186170212765957 Hits@3 0.773936170212766 Hits@10 0.8909574468085106 MRR 0.6583182591898237 cur_rank 1 abs_cur_rank 1 total_num 375 500
438 100
checkcorrect 348 348 real score -1.00927053228952 Hits@1 0.5172413793103449 Hits@3 0.7718832891246684 Hits@10 0.8885941644562334 MRR 0.6565807823051935 cur_rank 303 abs_cur_rank 303 total_num 376 500
438 100
checkcorrect 164 164 real score 15.967915058135986 Hits@1 0.5185185185185185 Hits@3 0.7724867724867724 Hits@10 0.8888888888888888 MRR 0.6574892987541214 cur_rank 0 abs_cur_rank 0 total_num 377 500
438 100
checkcorrect 288 288 real score -0.6618673035409302 Hits@1 0.5171503957783641 Hits@3 0.7704485488126649 Hits@10 0.8865435356200527 MRR 0.655765871435054 cur_rank 231 abs_cur_rank 231 total_num 378 500
438 100
checkcorrect 0 0 real score 20.89651310443878 Hits@1 0.5184210526315789 Hits@3 0.7710526315789473 Hits@10 0.8868421052631579 MRR 0.6566717507207513 cur_rank 0 abs_cur

438 100
checkcorrect 12 12 real score 19.993974149227142 Hits@1 0.5203836930455635 Hits@3 0.7721822541966427 Hits@10 0.8896882494004796 MRR 0.6579284562300034 cur_rank 0 abs_cur_rank 1 total_num 416 500
438 100
checkcorrect 6 6 real score 20.98101508617401 Hits@1 0.5215311004784688 Hits@3 0.7727272727272727 Hits@10 0.8899521531100478 MRR 0.6587468092055296 cur_rank 0 abs_cur_rank 0 total_num 417 500
438 100
checkcorrect 118 118 real score 20.880899727344513 Hits@1 0.5202863961813843 Hits@3 0.7732696897374701 Hits@10 0.8902147971360382 MRR 0.658367938539168 cur_rank 1 abs_cur_rank 1 total_num 418 500
438 100
checkcorrect 282 282 real score 20.6816583275795 Hits@1 0.5190476190476191 Hits@3 0.7714285714285715 Hits@10 0.8904761904761904 MRR 0.6572765863045509 cur_rank 4 abs_cur_rank 4 total_num 419 500
438 100
checkcorrect 10 10 real score 20.978016078472137 Hits@1 0.5201900237529691 Hits@3 0.7719714964370546 Hits@10 0.8907363420427553 MRR 0.6580906561708109 cur_rank 0 abs_cur_rank 0 total

438 100
checkcorrect 134 134 real score 0.22961010364815593 Hits@1 0.5283842794759825 Hits@3 0.7729257641921398 Hits@10 0.888646288209607 MRR 0.6628587947596747 cur_rank 23 abs_cur_rank 23 total_num 457 500
438 100
checkcorrect 130 130 real score 2.5310797169804573 Hits@1 0.5272331154684096 Hits@3 0.7712418300653595 Hits@10 0.8867102396514162 MRR 0.6615599012344175 cur_rank 14 abs_cur_rank 14 total_num 458 500
0 2
checkcorrect 220 220 real score -0.10227697342634201 Hits@1 0.5260869565217391 Hits@3 0.7695652173913043 Hits@10 0.8847826086956522 MRR 0.6601290718369045 cur_rank 295 abs_cur_rank 295 total_num 459 500
438 100
checkcorrect 168 168 real score 19.80019474029541 Hits@1 0.5249457700650759 Hits@3 0.7678958785249458 Hits@10 0.8850325379609545 MRR 0.6591309610520087 cur_rank 4 abs_cur_rank 4 total_num 460 500
438 100
checkcorrect 34 34 real score 20.992277085781097 Hits@1 0.525974025974026 Hits@3 0.7683982683982684 Hits@10 0.8852813852813853 MRR 0.6598687728246235 cur_rank 0 abs_cu

438 100
checkcorrect 32 32 real score 17.135641276836395 Hits@1 0.5130260521042084 Hits@3 0.7615230460921844 Hits@10 0.8797595190380761 MRR 0.650370690347113 cur_rank 2 abs_cur_rank 2 total_num 498 500
438 100
checkcorrect 92 92 real score 20.985404789447784 Hits@1 0.514 Hits@3 0.762 Hits@10 0.88 MRR 0.6510699489664188 cur_rank 0 abs_cur_rank 0 total_num 499 500


#### Fine-tuning

In [None]:
# Switch read by the following cell: 'Yes' trains the model further on the
# freshly built batches; any other value only evaluates it on them.
fine_tune = "Yes"

In [None]:
# Containers that build_big_batches fills in place:
#   x_p_list -- one list per key '1', '2', '3' (the three path inputs of the model)
#   x_r_list -- relation inputs
#   y_list   -- labels
x_p_list = {'1': [], '2': [], '3': []}
x_r_list = []
y_list = []

#######################################
### build the big-batches #############

# NOTE(review): the leading numeric arguments (20000, 1, 10) are presumably the
# number of samples and the lower/upper path-length bounds -- confirm against
# the definition of build_big_batches.
build_big_batches(
    20000, 1, 10, Class_2, one_hop_ind, s_t_r_ind,
    x_p_list, x_r_list, y_list,
    relation2id, entity2id, id2relation, id2entity,
)

# Convert the collected Python lists to integer arrays for Keras.
x_test_1, x_test_2, x_test_3 = (
    np.asarray(x_p_list[key], dtype='int') for key in ('1', '2', '3')
)
x_test_r = np.asarray(x_r_list, dtype='int')
y_test = np.asarray(y_list, dtype='int')

# Either only score the current model on these batches, or keep training on them.
if fine_tune != 'Yes':

    model.evaluate([x_test_1, x_test_2, x_test_3, x_test_r], y_test, batch_size=4)

else:

    model.fit([x_test_1, x_test_2, x_test_3, x_test_r], y_test, batch_size=4, epochs=2)

In [None]:
########################################################
# Relation-prediction evaluation: filtered Hits@1 / Hits@3 / Hits@10 and MRR.
#
# For every sampled test triple (s, r, t) all relations in id2relation are
# scored for the pair (s, t), sorted by descending score, and the position of
# the true relation r is taken as its rank. Candidates ranked above r that
# form an already-known triple are not counted against it ("filtered" rank).

# Evaluate on a random subset of at most `max_eval_triples` test triples.
max_eval_triples = 500
selected = random.sample(list(data_test), min(len(data_test), max_eval_triples))

random.shuffle(selected)

# Every collection of known triples used for filtering; loop-invariant.
known_triple_sets = (data_test, data_valid, data_ind, data,
                     data_train_valid, data_train_test)

# Running counters; divided by the number of processed triples when printed.
Hits_at_1 = 0
Hits_at_3 = 0
Hits_at_10 = 0
MRR_raw = 0.

for i in range(len(selected)):

    s_true, r_true, t_true = selected[i][0], selected[i][1], selected[i][2]

    # first run the path-based scoring
    score_dict = path_based_relation_scoring(s_true, t_true, 1, 10, one_hop_ind, id2relation, model)

    # fall back to the one-hop neighbour based scoring when no path score was produced
    if len(score_dict) == 0:
        score_dict = one_hop_nb_relation_scoring(s_true, t_true, one_hop_ind, id2relation, model_2)

    # [... [score, r], ...]; relations without a score default to 0.0
    temp_list = [[score_dict.get(r, 0.0), r] for r in id2relation]

    sorted_list = sorted(temp_list, key=lambda x: x[0], reverse=True)

    # p: absolute position of the true relation in the ranking
    # exist_tri: number of candidates ranked above it that are known triples
    p = 0
    exist_tri = 0

    while p < len(sorted_list) and sorted_list[p][1] != r_true:

        candidate = (s_true, sorted_list[p][1], t_true)

        # known triples are valid answers, so they must not push the true relation down
        if any(candidate in triples for triples in known_triple_sets):

            exist_tri += 1

        p += 1

    # The true relation is missing from the ranking when the scan ran off the
    # end (r_true not in id2relation). Count that as a miss instead of indexing
    # past the end of sorted_list.
    found = p < len(sorted_list)
    cur_rank = p - exist_tri

    if found:

        if cur_rank == 0:

            Hits_at_1 += 1

        if cur_rank < 3:

            Hits_at_3 += 1

        if cur_rank < 10:

            Hits_at_10 += 1

        MRR_raw += 1./float(cur_rank + 1.)

        ranked_r, ranked_score = sorted_list[p][1], sorted_list[p][0]

    else:

        ranked_r, ranked_score = None, None

    print('checkcorrect', r_true, ranked_r,
          'real score', ranked_score,
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'cur_rank', cur_rank,
          'abs_cur_rank', p,
          'total_num', i, len(selected))