### Train the inductive link prediction model

In [1]:
# Dataset folder name and model tag; both are combined into the save names and
# (for data_name) into the train/valid/test file paths further down.
data_name = 'WN18RR_v4'
model_id = 'SiaLP_6_new'
# Path-length bounds.
# NOTE(review): presumably passed as lower_bd / upper_bd to the path-based and
# subgraph-based batch builders respectively — confirm against the training cells.
lower_bound = 1
upper_bound_path = 10
upper_bound_subg = 3

In [2]:
# Define the names used when saving the models and the ID dictionaries
model_name = f'Model_{model_id}_{data_name}'
one_hop_model_name = f'One_hop_model_{model_id}_{data_name}'
ids_name = f'IDs_{model_id}_{data_name}'

In [3]:
import librosa
import opensmile
import os
import sys
import numpy as np
import random
import pickle

from collections import defaultdict
from copy import deepcopy
from sklearn.utils import shuffle
from sys import getsizeof

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import initializers
from tensorflow.keras.utils import plot_model

In [4]:
class LoadKG:
    """Load a knowledge-graph triple file into ID-based lookup structures."""

    def __init__(self):

        # Placeholder attribute kept for backward compatibility;
        # no method of this class reads it.
        self.x = 'Hello'

    def load_train_data(self, data_path, one_hop, data, s_t_r, entity2id, id2entity,
                     relation2id, id2relation):
        """Read whitespace-separated triples and fill the given containers in place.

        Every non-blank line of ``data_path`` must hold at least three fields:
        ``source relation target`` (extra fields are ignored). Nothing is
        returned; all results are written into the arguments.

        Args:
            data_path: path of the triple file.
            one_hop: dict ``entity_id -> list of (relation_id, neighbour_id)``.
                Both the forward edge and its inverse edge are recorded.
                Values are lists when the method returns.
            data: set receiving the forward ``(s, r, t)`` ID triples.
            s_t_r: dict ``(s, t) -> set of relation ids`` (forward and inverse).
                A plain dict or a ``defaultdict(set)`` both work.
            entity2id, id2entity: entity name <-> ID dictionaries, extended
                with any new entity.
            relation2id, id2relation: relation name <-> ID dictionaries,
                extended with any new relation and its ``'_inverse_'`` twin.

        Raises:
            ValueError: if a non-blank line has fewer than three fields.

        IDs are assigned in order of first appearance in the file, so the
        mapping is reproducible from run to run (iterating a ``set`` of
        strings is not, because of string-hash randomisation).

        A relation and its inverse get consecutive IDs. ``relation2id`` is
        expected to hold an even number of entries on entry, so that original
        relations are even and inverse relations are odd.
        """

        # Read the triples: drop duplicates but keep first-seen file order
        # (a dict preserves insertion order, a set does not).
        triples = {}

        with open(data_path, 'r') as f:

            for line_no, line in enumerate(f, start=1):

                fields = tuple(line.split())

                if not fields:
                    continue  # blank line, e.g. a trailing newline at end of file

                if len(fields) < 3:
                    raise ValueError('line %d of %s does not hold a (source, relation, target) '
                                     'triple: %r' % (line_no, data_path, line))

                triples[fields] = None

        ####relation dict#################
        index = len(relation2id)

        for triple in triples:

            relation = triple[1]

            if relation not in relation2id:

                relation2id[relation] = index
                id2relation[index] = relation
                index += 1

                # the inverse relation directly follows the original one
                iv_r = '_inverse_' + relation
                relation2id[iv_r] = index
                id2relation[index] = iv_r
                index += 1

        # By the definition above an original relation always has an even ID and
        # its inverse the following odd ID, so the partner is one step away.
        def inverse_r(r):

            return r + 1 if r % 2 == 0 else r - 1

        ####entity dict###################
        index = len(entity2id)

        for triple in triples:

            for entity in (triple[0], triple[2]):

                if entity not in entity2id:

                    entity2id[entity] = index
                    id2entity[index] = entity
                    index += 1

        # A previous call leaves lists in one_hop; turn them back into sets so
        # that the same dictionary can be extended by a further call.
        for entity_id in one_hop:

            one_hop[entity_id] = set(one_hop[entity_id])

        # create the triples using IDs instead of strings
        for triple in triples:

            s = entity2id[triple[0]]
            r = relation2id[triple[1]]
            t = entity2id[triple[2]]
            r_inv = inverse_r(r)

            data.add((s, r, t))

            s_t_r.setdefault((s, t), set()).add(r)
            s_t_r.setdefault((t, s), set()).add(r_inv)

            one_hop.setdefault(s, set()).add((r, t))
            one_hop.setdefault(t, set()).add((r_inv, s))

        # change each set in one_hop to a list (callers index into it)
        for entity_id in one_hop:

            one_hop[entity_id] = list(one_hop[entity_id])

In [5]:
class ObtainPathsByDynamicProgramming:
    """Enumerate relation paths from an entity by a bounded, randomised depth-first search."""

    def __init__(self, amount_bd=50, size_bd=50, threshold=20000):

        # how many (r, t) tuples of one_hop[node] are followed from each visited node
        self.amount_bd = amount_bd

        # upper limit on the number of paths stored for one target entity t
        self.size_bd = size_bd

        # upper limit on the number of expansions attempted per path length
        self.threshold = threshold

    def obtain_paths(self, mode, s, t_input, lower_bd, upper_bd, one_hop):
        """Find relation paths that start at entity ``s``, using recursion.

        One may refer to LeetCode Problem 797 for the underlying idea:
            https://leetcode.com/problems/all-paths-from-source-to-target/

        Args:
            mode: which end entities are recorded:
                ``'direct_neighbour'`` - only the one-hop neighbours of ``s``;
                ``'target_specified'`` - only ``t_input``;
                ``'any_target'`` - every entity that is a key of ``one_hop``.
            s: ID of the start entity; must be a key of ``one_hop``.
            t_input: target entity, only read in ``'target_specified'`` mode.
            lower_bd: minimal path length (number of relations), int >= 1.
            upper_bd: maximal path length, int >= ``lower_bd``.
            one_hop: dict ``entity -> list of (relation, neighbour)`` tuples.

        Returns:
            ``(res, count_dict)``. ``res`` maps each reached target entity to
            the set of paths (tuples of relation IDs) from ``s`` to it.
            ``count_dict`` maps a path length to the number of expansions
            attempted from paths of that length. Both are ``defaultdict``s.

        Raises:
            TypeError: if the bounds are not ints >= 1 or ``lower_bd > upper_bd``
                (the exception type is kept for backward compatibility).
            ValueError: if ``s`` is not in ``one_hop`` or ``mode`` is unknown.
        """

        if not isinstance(lower_bd, int) or lower_bd < 1:

            raise TypeError("!!! invalid lower bound setting, must >= 1 !!!")

        if not isinstance(upper_bd, int) or upper_bd < 1:

            raise TypeError("!!! invalid upper bound setting, must >= 1 !!!")

        if lower_bd > upper_bd:

            raise TypeError("!!! lower bound must not exced upper bound !!!")

        if s not in one_hop:

            raise ValueError('!!! entity not in one_hop. Please work on existing entities')

        # qualified_t holds the entities t whose paths are added to the result
        if mode == 'direct_neighbour':

            # only the direct neighbours of s
            qualified_t = {pair[1] for pair in one_hop[s]}

        elif mode == 'target_specified':

            # only the one specified entity
            qualified_t = {t_input}

        elif mode == 'any_target':

            # any entity of the graph
            qualified_t = set(one_hop)

        else:

            raise ValueError('not a valid mode')

        # Result dict: key is an entity t reached from s, value is the set of
        # paths from s to t (a direct connection r or a multi-hop path).
        res = defaultdict(set)

        # number of expansions attempted, per current path length
        count_dict = defaultdict(int)

        def helper(node, path, on_path_en):
            # On the current node, reached by the path [r1, ..., rk] through the
            # on-path entities {s, e1, ..., ek-1, node}, look at every direct
            # neighbour t' of node. If t' is not an on-path entity, recurse into it.

            depth = len(path)

            # Store the path when its length is within the bounds, the node is a
            # qualified target and the node is not yet full w.r.t. size_bd.
            # The order of the tests matters: res[node] must only be touched for
            # qualified nodes, otherwise empty entries would appear in res.
            if (lower_bd <= depth <= upper_bd) and (node in qualified_t) and (
                    len(res[node]) < self.size_bd):

                res[node].add(tuple(path))

            # do not expand further when the path already has maximal length or the
            # number of expansions performed at this length has reached the limit
            if (depth < upper_bd) and (count_dict[depth] <= self.threshold):

                # visit the neighbours in random order ...
                order = list(range(len(one_hop[node])))
                random.shuffle(order)

                # ... and follow at most amount_bd of them
                for i in order[:self.amount_bd]:

                    r, t = one_hop[node][i][0], one_hop[node][i][1]

                    # counted even if this step is not followed in the end
                    count_dict[depth] += 1

                    # recurse only if t is not on the path (no cycles) and the
                    # computation threshold is not exceeded
                    if (t not in on_path_en) and (count_dict[depth] <= self.threshold):

                        helper(t, path + [r], on_path_en | {t})

        helper(s, [], {s})

        return(res, count_dict)

In [6]:
# Locations of the train / valid / test triple files of the chosen dataset
train_path = f'../data/{data_name}/train.txt'
valid_path = f'../data/{data_name}/valid.txt'
test_path = f'../data/{data_name}/test.txt'

In [7]:
# Instantiate the helpers: Class_1 loads the triple files, Class_2 enumerates paths.
# Class_2 is read as a global by the batch-building functions defined further down.
Class_1 = LoadKG()
Class_2 = ObtainPathsByDynamicProgramming()

In [8]:
# Containers that receive the training graph
one_hop = {}
data = set()
s_t_r = defaultdict(set)

# Name <-> ID dictionaries; the same objects are passed to the valid and test loads
entity2id, id2entity = {}, {}
relation2id, id2relation = {}, {}

# Fill the containers and dictionaries from the training triples
Class_1.load_train_data(
    train_path, one_hop, data, s_t_r,
    entity2id, id2entity, relation2id, id2relation,
)

In [9]:
# Containers that receive the validation graph
one_hop_valid = {}
data_valid = set()
s_t_r_valid = defaultdict(set)

# Fill them from the validation triples, extending the shared ID dictionaries
Class_1.load_train_data(
    valid_path, one_hop_valid, data_valid, s_t_r_valid,
    entity2id, id2entity, relation2id, id2relation,
)

In [10]:
# Containers that receive the test graph
one_hop_test = {}
data_test = set()
s_t_r_test = defaultdict(set)

# Fill them from the test triples, extending the shared ID dictionaries
Class_1.load_train_data(
    test_path, one_hop_test, data_test, s_t_r_test,
    entity2id, id2entity, relation2id, id2relation,
)

#### Build the path-based siamese neural network structure

We use a two-layer biLSTM over the input path embedding sequences to predict the output embedding of the relation.

In [11]:
# Path-based siamese network: three paths between one entity pair are encoded by the
# same (shared) embedding + biLSTM stack and compared with the embedding of a relation.
#
# Input layers: each path is a sequence of integer relation ids of open length (None).
# NOTE(review): the batch builder pads every path to upper_bd steps with the
# space-holder id len(id2relation); the padded steps are not masked here — confirm
# that this is intended.
fst_path = keras.Input(shape=(None,), dtype="int32")
scd_path = keras.Input(shape=(None,), dtype="int32")
thd_path = keras.Input(shape=(None,), dtype="int32")

#the relation input layer (for output embedding)
id_rela = keras.Input(shape=(None,), dtype="int32")

# Embed each integer in a 300-dimensional vector as input,
# note that we add another "space holder" embedding (hence the +1),
# which holds the spaces if the initial lengths of the paths are not the same
in_embd_var = layers.Embedding(len(relation2id)+1, 300)

# Obtain the embeddings; the same embedding layer is applied to all three paths
fst_p_embd = in_embd_var(fst_path)
scd_p_embd = in_embd_var(scd_path)
thd_p_embd = in_embd_var(thd_path)

# Embed the relation id in a 300-dimensional vector as output
# (a separate embedding table, not shared with the path embedding)
rela_embd = layers.Embedding(len(relation2id)+1, 300)(id_rela)

#add 2 layer bi-directional LSTM; both layers are shared by the three paths.
#With Keras' default 'concat' merge mode each step yields 2 x 150 = 300 features.
lstm_layer_1 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))
lstm_layer_2 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))

#first LSTM layer
fst_lstm_mid = lstm_layer_1(fst_p_embd)
scd_lstm_mid = lstm_layer_1(scd_p_embd)
thd_lstm_mid = lstm_layer_1(thd_p_embd)

#second LSTM layer
fst_lstm_out = lstm_layer_2(fst_lstm_mid)
scd_lstm_out = lstm_layer_2(scd_lstm_mid)
thd_lstm_out = lstm_layer_2(thd_lstm_mid)

#reduce max over the step axis (axis 1): one 300-dim vector per path
fst_reduce_max = tf.reduce_max(fst_lstm_out, axis=1)
scd_reduce_max = tf.reduce_max(scd_lstm_out, axis=1)
thd_reduce_max = tf.reduce_max(thd_lstm_out, axis=1)

#concatenate the output vectors of the three siamese channels: (Batch, 900)
path_concat = layers.concatenate([fst_reduce_max, scd_reduce_max, thd_reduce_max], axis=-1)

#add dropout on top of the concatenation from all channels
dropout = layers.Dropout(0.25)(path_concat)

#multiply into output embd size by dense layer: (Batch, 300)
path_out_vect = layers.Dense(300, activation='tanh')(dropout)

#remove the step dimension from the relation embedding by summing over it
#(the batch builder feeds exactly one relation id per sample, so the sum is that embedding)
rela_out_embd = tf.reduce_sum(rela_embd, axis=1)

# Normalize the vectors to have unit length
path_out_vect_norm = tf.math.l2_normalize(path_out_vect, axis=-1)
rela_out_embd_norm = tf.math.l2_normalize(rela_out_embd, axis=-1)

# Calculate the dot product of the two unit vectors, i.e. their cosine similarity
dot_product = layers.Dot(axes=-1)([path_out_vect_norm, rela_out_embd_norm])

#put together the model
model = keras.Model([fst_path, scd_path, thd_path, id_rela], dot_product)

2023-05-15 13:37:33.944270: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
#config the Adam optimizer
# NOTE(review): newer TensorFlow/Keras releases may only accept the `decay` argument
# on the legacy optimizers — verify against the installed version.
opt = keras.optimizers.Adam(learning_rate=0.0005, decay=1e-6)

#compile the model
# NOTE(review): the model output is a dot product of two unit vectors (range [-1, 1]),
# while binary_crossentropy is defined for probabilities in [0, 1] — confirm intended.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

#### Build the subgraph-based siamese neural network

In [13]:
#Subgraph-based siamese network: six paths reaching out of the source entity and six
#paths reaching out of the target entity are encoded by one shared embedding + biLSTM
#stack and compared with the embedding of a relation.
#Each input is a sequence of integer relation ids of open length (None).
source_path_1 = keras.Input(shape=(None,), dtype="int32")
source_path_2 = keras.Input(shape=(None,), dtype="int32")
source_path_3 = keras.Input(shape=(None,), dtype="int32")
source_path_4 = keras.Input(shape=(None,), dtype="int32")
source_path_5 = keras.Input(shape=(None,), dtype="int32")
source_path_6 = keras.Input(shape=(None,), dtype="int32")

target_path_1 = keras.Input(shape=(None,), dtype="int32")
target_path_2 = keras.Input(shape=(None,), dtype="int32")
target_path_3 = keras.Input(shape=(None,), dtype="int32")
target_path_4 = keras.Input(shape=(None,), dtype="int32")
target_path_5 = keras.Input(shape=(None,), dtype="int32")
target_path_6 = keras.Input(shape=(None,), dtype="int32")

#the relation input layer (for output embedding)
id_rela_ = keras.Input(shape=(None,), dtype="int32")

# Embed each integer in a 300-dimensional vector as input,
# note that we add another "space holder" embedding (hence the +1),
# which holds the spaces if the initial lengths of the paths are not the same.
# The source and the target paths share this one embedding layer.
in_embd_var_ = layers.Embedding(len(relation2id)+1, 300)

# Obtain the source embeddings
source_embd_1 = in_embd_var_(source_path_1)
source_embd_2 = in_embd_var_(source_path_2)
source_embd_3 = in_embd_var_(source_path_3)
source_embd_4 = in_embd_var_(source_path_4)
source_embd_5 = in_embd_var_(source_path_5)
source_embd_6 = in_embd_var_(source_path_6)

#Obtain the target embeddings
target_embd_1 = in_embd_var_(target_path_1)
target_embd_2 = in_embd_var_(target_path_2)
target_embd_3 = in_embd_var_(target_path_3)
target_embd_4 = in_embd_var_(target_path_4)
target_embd_5 = in_embd_var_(target_path_5)
target_embd_6 = in_embd_var_(target_path_6)

# Embed the relation id in a 300-dimensional vector as output
# (a separate embedding table, not shared with the path embedding)
rela_embd_ = layers.Embedding(len(relation2id)+1, 300)(id_rela_)

#add 2 layer bi-directional LSTM network, shared by all twelve paths.
#With Keras' default 'concat' merge mode each step yields 2 x 150 = 300 features.
lstm_1 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))
lstm_2 = layers.Bidirectional(layers.LSTM(150, return_sequences=True))

###source lstm implementation########
#first LSTM layer
source_mid_1 = lstm_1(source_embd_1)
source_mid_2 = lstm_1(source_embd_2)
source_mid_3 = lstm_1(source_embd_3)
source_mid_4 = lstm_1(source_embd_4)
source_mid_5 = lstm_1(source_embd_5)
source_mid_6 = lstm_1(source_embd_6)

#second LSTM layer
source_out_1 = lstm_2(source_mid_1)
source_out_2 = lstm_2(source_mid_2)
source_out_3 = lstm_2(source_mid_3)
source_out_4 = lstm_2(source_mid_4)
source_out_5 = lstm_2(source_mid_5)
source_out_6 = lstm_2(source_mid_6)

#reduce max over the step axis (axis 1): one 300-dim vector per path
source_max_1 = tf.reduce_max(source_out_1, axis=1)
source_max_2 = tf.reduce_max(source_out_2, axis=1)
source_max_3 = tf.reduce_max(source_out_3, axis=1)
source_max_4 = tf.reduce_max(source_out_4, axis=1)
source_max_5 = tf.reduce_max(source_out_5, axis=1)
source_max_6 = tf.reduce_max(source_out_6, axis=1)

#concatenate the output vectors of the six source channels: (Batch, 1800)
source_concat = layers.concatenate([source_max_1, 
                                    source_max_2, 
                                    source_max_3,
                                    source_max_4,
                                    source_max_5,
                                    source_max_6], axis=-1)

#add dropout on top of the concatenation from all channels
source_dropout = layers.Dropout(0.25)(source_concat)

###target lstm implementation########
#first LSTM layer
target_mid_1 = lstm_1(target_embd_1)
target_mid_2 = lstm_1(target_embd_2)
target_mid_3 = lstm_1(target_embd_3)
target_mid_4 = lstm_1(target_embd_4)
target_mid_5 = lstm_1(target_embd_5)
target_mid_6 = lstm_1(target_embd_6)

#second LSTM layer
target_out_1 = lstm_2(target_mid_1)
target_out_2 = lstm_2(target_mid_2)
target_out_3 = lstm_2(target_mid_3)
target_out_4 = lstm_2(target_mid_4)
target_out_5 = lstm_2(target_mid_5)
target_out_6 = lstm_2(target_mid_6)

#reduce max over the step axis (axis 1): one 300-dim vector per path
target_max_1 = tf.reduce_max(target_out_1, axis=1)
target_max_2 = tf.reduce_max(target_out_2, axis=1)
target_max_3 = tf.reduce_max(target_out_3, axis=1)
target_max_4 = tf.reduce_max(target_out_4, axis=1)
target_max_5 = tf.reduce_max(target_out_5, axis=1)
target_max_6 = tf.reduce_max(target_out_6, axis=1)

#concatenate the output vectors of the six target channels: (Batch, 1800)
target_concat = layers.concatenate([target_max_1, 
                                    target_max_2, 
                                    target_max_3,
                                    target_max_4,
                                    target_max_5,
                                    target_max_6], axis=-1)

#add dropout on top of the concatenation from all channels
target_dropout = layers.Dropout(0.25)(target_concat)

#further concatenate source and target output embeddings: (Batch, 3600)
final_concat = layers.concatenate([source_dropout, target_dropout], axis=-1)

#multiply into output embd size by dense layer: (Batch, 300)
out_vect = layers.Dense(300, activation='tanh')(final_concat)

#remove the step dimension from the relation embedding by summing over it
# NOTE(review): assumes exactly one relation id per sample, as in the path-based
# batch builder — confirm for the subgraph batches.
rela_out_embd_ = tf.reduce_sum(rela_embd_, axis=1)

# Normalize the vectors to have unit length
out_vect_norm = tf.math.l2_normalize(out_vect, axis=-1)
rela_out_embd_norm_ = tf.math.l2_normalize(rela_out_embd_, axis=-1)

# Calculate the dot product of the two unit vectors, i.e. their cosine similarity
dot_product_ = layers.Dot(axes=-1)([out_vect_norm, rela_out_embd_norm_])

#put together the model
model_2 = keras.Model([source_path_1, source_path_2, source_path_3, source_path_4,
                       source_path_5, source_path_6,
                       target_path_1, target_path_2, target_path_3, target_path_4, 
                       target_path_5, target_path_6,
                       id_rela_], dot_product_)

In [14]:
#config the Adam optimizer (a separate optimizer instance for the subgraph-based model)
# NOTE(review): newer TensorFlow/Keras releases may only accept the `decay` argument
# on the legacy optimizers — verify against the installed version.
opt_ = keras.optimizers.Adam(learning_rate=0.0005, decay=1e-6)

#compile the model
# NOTE(review): the model output is a dot product of two unit vectors (range [-1, 1]),
# while binary_crossentropy is defined for probabilities in [0, 1] — confirm intended.
model_2.compile(loss='binary_crossentropy', optimizer=opt_, metrics=['binary_accuracy'])

### Build the big-batch for path-based model
We will build the big-batch for training the path-based model. That is, we will build three lists to store the three paths, respectively.

In order to reduce computational complexity, we will run the path-finding algorithm for each entity e in the dataset before the training. That is, for each entity e, we will have two dictionaries. Dict 1 stores the paths between e and any other entity in the dataset, while Dict 2 stores the paths between e and its direct neighbors. The two dicts will be used, and remain unchanged, throughout the training.

* At each step, three different paths between two entities s and t are selected. Each path is appended to one of the lists.
* If this step is for a positive sample, an existing relation r between s and t is selected. If there is more than one relation from s to t, we randomly choose one. Also, 1 is appended to the label list.
* If this step is for a negative sample, one relation that does not exist between s and t is selected randomly and appended to the relation list. Also, 0 is appended to the label list.
* In practice, a positive step is always followed by a negative step. The same paths as in the positive step are used in the following negative step, while the relation is a negative one chosen in the above way.
* We do this until the length limit is reached.

**For relation prediction, we only need to train on the (s,r,t) triples. The inverse triples (t,r⁻¹,s) are not necessary and hence not included in training.**

In [15]:
#function to build the big-batches for path-based training
def build_big_batches_path(lower_bd, upper_bd, data, one_hop, s_t_r,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity):
    """Append positive / negative training samples for the path-based model.

    For every entity s in ``one_hop`` the paths to its direct neighbours are
    searched once with the global ``Class_2``. The neighbours are then scanned
    10 times; each time a neighbour t qualifies, three of its paths are drawn
    and two samples are appended: a positive one (an existing forward relation
    between s and t, label 1.) and a negative one (same paths, a forward
    relation that does not hold between s and t, label 0.).

    Args:
        lower_bd, upper_bd: path length bounds handed to the path search;
            every stored path is padded to ``upper_bd`` steps.
        one_hop: dict ``entity -> list of (relation, neighbour)``.
        s_t_r: ``defaultdict(set)`` mapping ``(s, t)`` to the relation ids between them.
        x_p_list: mapping with keys ``'1'``, ``'2'``, ``'3'`` to lists; receives the paths.
        x_r_list: list receiving ``[relation_id]`` per sample.
        y_list: list receiving the labels.
        id2relation: relation id -> name; ids must be exactly ``0 .. len-1``.
        id2entity: entity id -> name, only read for the error report.
        data, relation2id, entity2id: accepted but not read by this function.

    Raises:
        ValueError: if ``id2relation`` is inconsistent, or if no relation is
            recorded in ``s_t_r`` between s and one of its neighbours.
    """

    num_r = len(id2relation)

    #the relation ids are expected to be exactly 0 .. num_r-1
    for rel_id in range(num_r):
        if rel_id not in id2relation:
            raise ValueError('error when generaing id2relation')

    #the initial (non-inverse) relations are the ones with an even id
    ini_r_id_set = set(range(0, num_r, 2))
    num_ini_r = len(ini_r_id_set)

    if num_ini_r != int(num_r/2):
        raise ValueError('error when generating id2relation')

    #not every entity of entity2id needs to be in one_hop,
    #so the search starts from the entities that really are
    existing_ids = list({entity for entity in one_hop})
    random.shuffle(existing_ids)

    def padded(path):
        #fill the path up to upper_bd steps with the space holder id num_r
        return list(path) + [num_r]*abs(len(path)-upper_bd)

    for count, s in enumerate(existing_ids, start=1):

        #run the path finding algorithm from s to its direct neighbours
        result, _ = Class_2.obtain_paths('direct_neighbour', s, 'nb', lower_bd, upper_bd, one_hop)

        for _ in range(10):

            for t in result:

                relations_s_t = s_t_r[(s,t)]

                if len(relations_s_t) == 0:

                    raise ValueError(s,t,id2entity[s], id2entity[t])

                #only the forward (initial, even id) relations matter in relation prediction
                ini_r_list = [rel for rel in relations_s_t if rel % 2 == 0]

                #skip t unless there are at least three paths from s to t,
                #at least one initial relation between them,
                #and at least one initial relation that does NOT hold (needed as negative)
                if len(result[t]) < 3 or len(ini_r_list) == 0 or len(ini_r_list) >= int(num_ini_r):
                    continue

                path_1, path_2, path_3 = random.sample(list(result[t]), 3)

                #####positive#####################
                x_p_list['1'].append(padded(path_1))
                x_p_list['2'].append(padded(path_2))
                x_p_list['3'].append(padded(path_3))

                x_r_list.append([random.choice(ini_r_list)])
                y_list.append(1.)

                #####negative#####################
                #same paths again (as fresh lists), combined with a wrong relation
                x_p_list['1'].append(padded(path_1))
                x_p_list['2'].append(padded(path_2))
                x_p_list['3'].append(padded(path_3))

                neg_r_list = list(ini_r_id_set.difference(set(ini_r_list)))
                x_r_list.append([random.choice(neg_r_list)])
                y_list.append(0.)

        if count % 100 == 0:
            print('generating big-batches for path-based model', count, len(existing_ids))

### Build the big-batch for the subgraph-based network training

Again, to reduce computational complexity, we store the subgraph of each entity e at the beginning.

* At each step, we select one triple (s,r,t) from the dataset. Then, the paths reaching out of s and of t are taken from the stored subgraphs, according to their out-going relations.
* We select six paths for each of the source and the target entity and add them to the corresponding lists.
* If this is a positive sample step, the id of relation r is appended to the relation list.
* If this is a negative sample step, the id of a random relation is appended to the relation list.
* Similarly, one negative sample step always follows one positive step. The paths from the previous positive step are used again for the negative step.

In [16]:
#Again, it is too slow to run the path-finding algorithm again and again on the complete graph.
#Instead, we find the subgraph (a sample of out-going paths) of each entity once;
#in the subgraph-based training the stored subgraphs are then used multiple times.
def store_subgraph_dicts(lower_bd, upper_bd, data, one_hop, s_t_r,
                         relation2id, entity2id, id2relation, id2entity):
    """Collect, once per entity, a random sample of the paths leaving it.

    For every entity s in ``one_hop`` the global ``Class_2`` searches the paths
    from s to any target. The distinct paths are pooled over all targets and
    at most 100 of them are drawn at random.

    Args:
        lower_bd, upper_bd: path length bounds handed to the path search.
        one_hop: dict ``entity -> list of (relation, neighbour)``.
        id2relation: relation id -> name; ids must be exactly ``0 .. len-1``.
        data, s_t_r, relation2id, entity2id, id2entity: accepted but not read
            by this function.

    Returns:
        dict mapping each entity of ``one_hop`` to the list of its selected
        paths (tuples of relation ids).

    Raises:
        ValueError: if a relation id in ``0 .. len(id2relation)-1`` is missing.
    """

    #the relation ids are expected to be exactly 0 .. len(id2relation)-1
    for rel_id in range(len(id2relation)):
        if rel_id not in id2relation:
            raise ValueError('error when generaing id2relation')

    #not every entity of entity2id needs to be in one_hop,
    #so the path finding starts from the entities that really are
    existing_ids = list({entity for entity in one_hop})
    random.shuffle(existing_ids)
    total = len(existing_ids)

    #Dict stores the subgraph for each entity
    Dict_1 = {}

    for count, s in enumerate(existing_ids, start=1):

        result, length_dict = Class_2.obtain_paths('any_target', s, 'any', lower_bd, upper_bd, one_hop)

        #pool the paths to all targets, dropping duplicates
        path_list = list({path for t_ in result for path in result[t_]})

        #the search result is no longer needed
        del result, length_dict

        path_select = random.sample(path_list, min(len(path_list), 100))

        Dict_1[s] = deepcopy(path_select)

        if count % 100 == 0:
            print('generating and storing paths for the path-based model', count, total)

    return Dict_1

In [17]:
#functions to build the big-batch for the subgraph-based (one-hop neighbor) training
def _pad_path(path, upper_bd, pad_id):
    """Return `path` as a new list, filled up at the end with `pad_id`."""
    #paths are expected to be at most upper_bd long, so this pads to upper_bd
    return list(path) + [pad_id]*abs(len(path)-upper_bd)


def _append_subgraph_sample(x_s_list, x_t_list, x_r_list, y_list,
                            s_paths, t_paths, relation, label, upper_bd, pad_id):
    """Append one sample (source paths, target paths, relation, label) to the lists."""
    for slot, (s_path, t_path) in enumerate(zip(s_paths, t_paths), start=1):
        key = str(slot) #the slot lists are keyed by '1', '2', ...
        x_s_list[key].append(_pad_path(s_path, upper_bd, pad_id))
        x_t_list[key].append(_pad_path(t_path, upper_bd, pad_id))

    x_r_list.append([relation])
    y_list.append(label)


def build_big_batches_subgraph(lower_bd, upper_bd, data, one_hop, s_t_r,
                      x_s_list, x_t_list, x_r_list, y_list, Dict,
                      relation2id, entity2id, id2relation, id2entity,
                      num_paths=6, num_rounds=10):
    """Fill the big-batch lists for the subgraph-based model, in place.

    For every triple (s, r, t) in `data` whose two entities both own at least
    `num_paths` stored paths in `Dict`, six samples are appended per round:

        1. positive: random paths of s and t with relation r, label 1
        2. the same paths with a random different initial relation, label 0
        3. positive with freshly drawn paths, label 1
        4. paths of a random qualified entity instead of s, label 0
        5. positive with freshly drawn paths, label 1
        6. paths of a random qualified entity instead of t, label 0

    Parameters:
        upper_bd: every path is padded to this length with the placeholder id
            len(id2relation).
        data: iterable of (s, r, t) id triples; it is copied, not modified.
        x_s_list, x_t_list: dicts with the keys '1' .. str(num_paths), each
            holding a list; the padded source/target paths are appended.
        x_r_list, y_list: lists receiving [relation id] and the label.
        Dict: dict mapping an entity id to its stored paths, as returned by
            store_subgraph_dicts.
        id2relation: dict keyed by relation id; ids must be 0 .. n-1 with n
            even (initial relations have even ids, their inverses odd ids).
        lower_bd, one_hop, s_t_r, relation2id, entity2id, id2entity: not read
            here; kept for a signature parallel to the other builders.
        num_paths: paths drawn per entity and sample (default 6).
        num_rounds: number of passes over the shuffled data (default 10).

    Raises:
        ValueError: if the relation ids are not contiguous or their number is odd.
    """
    num_r = len(id2relation)

    for i in range(num_r):
        if i not in id2relation:
            raise ValueError('error when generaing id2relation')

    #every initial relation needs its inverse, so the number of ids is even
    if num_r % 2 != 0:
        raise ValueError('error when generating id2relation')

    #initial relation id is always an even number
    ini_r_ids = list(range(0, num_r, 2))

    #an entity is qualified if it has at least num_paths out-stretching paths.
    #the list is for random.choice, the set for O(1) membership tests
    qualified = [e for e in Dict if len(Dict[e]) >= num_paths]
    qualified_set = set(qualified)

    #work on a copy so the caller's collection is never reordered
    data = list(data)

    def emit(s_paths, t_paths, relation, label):
        _append_subgraph_sample(x_s_list, x_t_list, x_r_list, y_list,
                                s_paths, t_paths, relation, label, upper_bd, num_r)

    for iteration in range(num_rounds):

        #same RNG as the sampling below, so one random.seed controls everything
        random.shuffle(data)

        for i_0, triple in enumerate(data):

            s, r, t = triple[0], triple[1], triple[2] #obtain entities and relation IDs

            if s in qualified_set and t in qualified_set:

                #obtain the path list for true entities
                path_s, path_t = list(Dict[s]), list(Dict[t])

                #####positive step###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                emit(s_paths, t_paths, r, 1.)

                #####negative step for relation###########
                #same paths as the positive step, but a different initial relation
                r_ran = random.choice([r_id for r_id in ini_r_ids if r_id != r])
                emit(s_paths, t_paths, r_ran, 0.)

                #randomly choose two negative sampled entities
                s_ran = random.choice(qualified)
                t_ran = random.choice(qualified)

                #obtain the path list for random entities
                path_s_ran, path_t_ran = list(Dict[s_ran]), list(Dict[t_ran])

                #####positive step###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                emit(s_paths, t_paths, r, 1.)

                #####negative for source entity###########
                #target paths of the previous positive step are used again
                s_paths = random.sample(path_s_ran, num_paths)
                emit(s_paths, t_paths, r, 0.)

                #####positive step###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                emit(s_paths, t_paths, r, 1.)

                #####negative for target entity###########
                #source paths of the previous positive step are used again
                t_paths = random.sample(path_t_ran, num_paths)
                emit(s_paths, t_paths, r, 0.)

            if i_0 % 2000 == 0:
                print('generating big-batches for subgraph-based model', i_0, len(data), iteration)

### Start Training: load the KG and call classes

Here, we use the validation set to see the training efficiency. That is, we use the validation to check whether the true relation between entities can be predicted by paths.

The trick is: in validation, we have to use the same relation ID and entity ID as in the training. But we don't want to use the links in training anymore. That is, in validation, we want to use (and update if necessary) entity2id, id2entity, relation2id and id2relation. But we want to use new one_hop, data, data_ and s_t_r for validation set. Then, path-finding will also be based on new one_hop.


In [18]:
model_name

'Model_SiaLP_6_new_WN18RR_v4'

In [19]:
one_hop_model_name

'One_hop_model_SiaLP_6_new_WN18RR_v4'

In [20]:
ids_name

'IDs_SiaLP_6_new_WN18RR_v4'

In [21]:
#first, we save the relation and ids
#everything is bundled in one dict, which is pickled under ids_name
Dict = {
    #training data
    'one_hop': one_hop,
    'data': data,
    's_t_r': s_t_r,

    #valid data
    'one_hop_valid': one_hop_valid,
    'data_valid': data_valid,
    's_t_r_valid': s_t_r_valid,

    #test data
    'one_hop_test': one_hop_test,
    'data_test': data_test,
    's_t_r_test': s_t_r_test,

    #shared dictionaries
    'entity2id': entity2id,
    'id2entity': id2entity,
    'relation2id': relation2id,
    'id2relation': id2relation,
}

#this is the first cell writing to ../weight_bin, so the folder may not exist yet;
#without it open() fails with FileNotFoundError
os.makedirs('../weight_bin', exist_ok=True)

with open('../weight_bin/' + ids_name + '.pickle', 'wb') as handle:
    pickle.dump(Dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
###train the path-based model
#path-length bounds handed to the big-batch builder, plus the fit settings
lower_bd, upper_bd = lower_bound, upper_bound_path
num_epoch, batch_size = 10, 32

#training containers: three path slots, the relation ids and the labels
train_p_list = {'1': [], '2': [], '3': []}
train_r_list, train_y_list = list(), list()

#validation containers, same layout
valid_p_list = {'1': [], '2': [], '3': []}
valid_r_list, valid_y_list = list(), list()

#######################################
###build the big-batches###############

#the builder receives the training containers and the training graph
build_big_batches_path(lower_bd, upper_bd, data, one_hop, s_t_r,
                       train_p_list, train_r_list, train_y_list,
                       relation2id, entity2id, id2relation, id2entity)

#same call for the validation containers and the validation graph
build_big_batches_path(lower_bd, upper_bd, data_valid, one_hop_valid, s_t_r_valid,
                       valid_p_list, valid_r_list, valid_y_list,
                       relation2id, entity2id, id2relation, id2entity)

#######################################
###do the training#####################
#the validation set can be too small or too sparse to yield enough samples.
#with fewer than 100 validation samples, the last 20% of the training
#big-batch is held out for validation instead
if len(valid_y_list) < 100:
    split = int(len(train_y_list)*0.8)

    #first 80% of the training big-batch: input arrays
    x_train_1, x_train_2, x_train_3 = (
        np.asarray(train_p_list[slot][:split], dtype='int') for slot in ('1', '2', '3'))
    x_train_r = np.asarray(train_r_list[:split], dtype='int')
    y_train = np.asarray(train_y_list[:split], dtype='int')

    #remaining 20%: validation arrays
    x_valid_1, x_valid_2, x_valid_3 = (
        np.asarray(train_p_list[slot][split:], dtype='int') for slot in ('1', '2', '3'))
    x_valid_r = np.asarray(train_r_list[split:], dtype='int')
    y_valid = np.asarray(train_y_list[split:], dtype='int')

else:
    #enough validation samples: use both big-batches as they are
    x_train_1, x_train_2, x_train_3 = (
        np.asarray(train_p_list[slot], dtype='int') for slot in ('1', '2', '3'))
    x_train_r = np.asarray(train_r_list, dtype='int')
    y_train = np.asarray(train_y_list, dtype='int')

    x_valid_1, x_valid_2, x_valid_3 = (
        np.asarray(valid_p_list[slot], dtype='int') for slot in ('1', '2', '3'))
    x_valid_r = np.asarray(valid_r_list, dtype='int')
    y_valid = np.asarray(valid_y_list, dtype='int')

#fit on the three path inputs plus the relation input
model.fit([x_train_1, x_train_2, x_train_3, x_train_r], y_train,
          validation_data=([x_valid_1, x_valid_2, x_valid_3, x_valid_r], y_valid),
          batch_size=batch_size, epochs=num_epoch)

#store the trained model as <model_name>.h5 in ../weight_bin (created if missing)
add_h5 = model_name + '.h5'
save_dir = os.path.join(os.getcwd(), '../weight_bin')
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, add_h5)
model.save(model_path)
print('Save model')

#drop the in-memory model once it is on disk
del model

generating big-batches for path-based model 100 3861
generating big-batches for path-based model 200 3861
generating big-batches for path-based model 300 3861
generating big-batches for path-based model 400 3861
generating big-batches for path-based model 500 3861
generating big-batches for path-based model 600 3861
generating big-batches for path-based model 700 3861
generating big-batches for path-based model 800 3861
generating big-batches for path-based model 900 3861
generating big-batches for path-based model 1000 3861
generating big-batches for path-based model 1100 3861
generating big-batches for path-based model 1200 3861
generating big-batches for path-based model 1300 3861
generating big-batches for path-based model 1400 3861
generating big-batches for path-based model 1500 3861
generating big-batches for path-based model 1600 3861
generating big-batches for path-based model 1700 3861
generating big-batches for path-based model 1800 3861
generating big-batches for path-based

In [23]:
###train the subgraph-based model
#path-length bounds for the subgraph paths, plus the fit settings
lower_bd, upper_bd = lower_bound, upper_bound_subg
num_epoch, batch_size = 10, 32

#per-entity path dictionaries for the training and the validation graph
Dict_train = store_subgraph_dicts(lower_bd, upper_bd, data, one_hop, s_t_r,
                                  relation2id, entity2id, id2relation, id2entity)

Dict_valid = store_subgraph_dicts(lower_bd, upper_bd, data_valid, one_hop_valid, s_t_r_valid,
                                  relation2id, entity2id, id2relation, id2entity)

#training containers: six source slots, six target slots, relation ids, labels
train_s_list = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
train_t_list = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
train_r_list, train_y_list = list(), list()

#validation containers, same layout
valid_s_list = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
valid_t_list = {'1': [], '2': [], '3': [], '4': [], '5': [], '6': []}
valid_r_list, valid_y_list = list(), list()

#######################################
###build the big-batches###############

#fill in the training containers
build_big_batches_subgraph(lower_bd, upper_bd, data, one_hop, s_t_r,
                           train_s_list, train_t_list, train_r_list, train_y_list, Dict_train,
                           relation2id, entity2id, id2relation, id2entity)

#fill in the validation containers
build_big_batches_subgraph(lower_bd, upper_bd, data_valid, one_hop_valid, s_t_r_valid,
                           valid_s_list, valid_t_list, valid_r_list, valid_y_list, Dict_valid,
                           relation2id, entity2id, id2relation, id2entity)

#######################################
###do the training#####################
#the validation set can be too small or too sparse to yield enough samples.
#with fewer than 100 validation samples, the last 20% of the training
#big-batch is held out for validation instead
if len(valid_y_list) < 100:
    split = int(len(train_y_list)*0.8)

    #first 80% of the training big-batch: input arrays
    (x_train_s_1, x_train_s_2, x_train_s_3,
     x_train_s_4, x_train_s_5, x_train_s_6) = (
        np.asarray(train_s_list[slot][:split], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    (x_train_t_1, x_train_t_2, x_train_t_3,
     x_train_t_4, x_train_t_5, x_train_t_6) = (
        np.asarray(train_t_list[slot][:split], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    x_train_r = np.asarray(train_r_list[:split], dtype='int')
    y_train = np.asarray(train_y_list[:split], dtype='int')

    #remaining 20%: validation arrays
    (x_valid_s_1, x_valid_s_2, x_valid_s_3,
     x_valid_s_4, x_valid_s_5, x_valid_s_6) = (
        np.asarray(train_s_list[slot][split:], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    (x_valid_t_1, x_valid_t_2, x_valid_t_3,
     x_valid_t_4, x_valid_t_5, x_valid_t_6) = (
        np.asarray(train_t_list[slot][split:], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    x_valid_r = np.asarray(train_r_list[split:], dtype='int')
    y_valid = np.asarray(train_y_list[split:], dtype='int')

else:
    #enough validation samples: use both big-batches as they are
    (x_train_s_1, x_train_s_2, x_train_s_3,
     x_train_s_4, x_train_s_5, x_train_s_6) = (
        np.asarray(train_s_list[slot], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    (x_train_t_1, x_train_t_2, x_train_t_3,
     x_train_t_4, x_train_t_5, x_train_t_6) = (
        np.asarray(train_t_list[slot], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    x_train_r = np.asarray(train_r_list, dtype='int')
    y_train = np.asarray(train_y_list, dtype='int')

    (x_valid_s_1, x_valid_s_2, x_valid_s_3,
     x_valid_s_4, x_valid_s_5, x_valid_s_6) = (
        np.asarray(valid_s_list[slot], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    (x_valid_t_1, x_valid_t_2, x_valid_t_3,
     x_valid_t_4, x_valid_t_5, x_valid_t_6) = (
        np.asarray(valid_t_list[slot], dtype='int')
        for slot in ('1', '2', '3', '4', '5', '6'))

    x_valid_r = np.asarray(valid_r_list, dtype='int')
    y_valid = np.asarray(valid_y_list, dtype='int')

#input order: six source slots, six target slots, then the relation
train_inputs = [x_train_s_1, x_train_s_2, x_train_s_3, x_train_s_4, x_train_s_5, x_train_s_6,
                x_train_t_1, x_train_t_2, x_train_t_3, x_train_t_4, x_train_t_5, x_train_t_6,
                x_train_r]
valid_inputs = [x_valid_s_1, x_valid_s_2, x_valid_s_3, x_valid_s_4, x_valid_s_5, x_valid_s_6,
                x_valid_t_1, x_valid_t_2, x_valid_t_3, x_valid_t_4, x_valid_t_5, x_valid_t_6,
                x_valid_r]

#do the training
model_2.fit(train_inputs, y_train,
            validation_data=(valid_inputs, y_valid),
            batch_size=batch_size, epochs=num_epoch)
del train_inputs, valid_inputs

#store the trained model as <one_hop_model_name>.h5 in ../weight_bin (created if missing)
one_hop_add_h5 = one_hop_model_name + '.h5'
one_hop_save_dir = os.path.join(os.getcwd(), '../weight_bin')
os.makedirs(one_hop_save_dir, exist_ok=True)

one_hop_model_path = os.path.join(one_hop_save_dir, one_hop_add_h5)
model_2.save(one_hop_model_path)
print('Save model')

#free the model and the cached path dictionaries
del model_2, Dict_train, Dict_valid

generating and storing paths for the path-based model 100 3861
generating and storing paths for the path-based model 200 3861
generating and storing paths for the path-based model 300 3861
generating and storing paths for the path-based model 400 3861
generating and storing paths for the path-based model 500 3861
generating and storing paths for the path-based model 600 3861
generating and storing paths for the path-based model 700 3861
generating and storing paths for the path-based model 800 3861
generating and storing paths for the path-based model 900 3861
generating and storing paths for the path-based model 1000 3861
generating and storing paths for the path-based model 1100 3861
generating and storing paths for the path-based model 1200 3861
generating and storing paths for the path-based model 1300 3861
generating and storing paths for the path-based model 1400 3861
generating and storing paths for the path-based model 1500 3861
generating and storing paths for the path-based m

### Result on the testset for inductive link prediction

We use the testset for inductive link prediction.

In [1]:
# Dataset identifier; it is part of the saved file names and of the
# '../data/<data_name>_ind/' paths built further below.
data_name = 'fb237_v2'
# Tag of the model variant; also part of the saved file names.
model_id = 'SiaLP_6_new'
# NOTE(review): presumably the same path-length bounds as used at training time
# (minimum length, maximum for the path-based model, maximum for the
# subgraph-based model) - confirm they match the loaded models.
lower_bound = 1
upper_bound_path = 10
upper_bound_subg = 3

In [2]:
#define the file-name stems of the path-based model, the one-hop (subgraph)
#model and the pickled id dictionaries; each combines model_id and data_name
model_name = 'Model_' + model_id + '_' + data_name
one_hop_model_name = 'One_hop_model_' + model_id + '_' + data_name
ids_name = 'IDs_' + model_id + '_' + data_name

In [3]:
ids_name

'IDs_SiaLP_6_new_fb237_v2'

In [4]:
one_hop_model_name

'One_hop_model_SiaLP_6_new_fb237_v2'

In [5]:
model_name

'Model_SiaLP_6_new_fb237_v2'

In [6]:
import librosa
import opensmile
import os
import sys
import numpy as np
import random
import pickle

from collections import defaultdict
from copy import deepcopy
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras import initializers
from tensorflow.keras.utils import plot_model

In [7]:
class LoadKG:
    """Loader that turns a file of (source, relation, target) triples into
    id-based lookup structures."""

    def __init__(self):

        #placeholder attribute; no method of this class reads it
        self.x = 'Hello'

    def load_train_data(self, data_path, one_hop, data, s_t_r, entity2id, id2entity,
                     relation2id, id2relation):
        """Read a triple file and merge its content into the given containers.

        Every non-blank line of the file is split on whitespace; the first
        three fields are read as source, relation and target. All containers
        are updated in place, so the method can be called again with the same
        containers to add the triples of another file.

        Parameters:
            data_path: path of the text file with one triple per line.
            one_hop: dict, entity id -> list of (relation id, neighbour id).
                For every triple the inverse edge is stored at the target.
                Values are lists on return.
            data: set receiving the (s, r, t) id triples of the file
                (without the inverse triples).
            s_t_r: mapping (s, t) -> set of relation ids; it has to create
                missing entries itself, e.g. a defaultdict(set).
            entity2id, id2entity: entity name <-> id; new entities get the
                next free id, len(entity2id).
            relation2id, id2relation: relation name <-> id; a new relation
                gets the next free id and its inverse '_inverse_<name>'
                the id after it.

        Returns:
            None.
        """
        raw_triples = set()

        ####load the train, valid and test set##########
        with open(data_path, 'r') as f:

            for line in f:

                fields = line.split()

                #a blank line (e.g. at the end of the file) carries no triple
                if fields:
                    raw_triples.add(tuple(fields))

        ####relation dict#################
        index = len(relation2id)

        for triple in raw_triples:

            relation = triple[1]

            if relation not in relation2id:

                relation2id[relation] = index
                id2relation[index] = relation

                #the inverse relation directly follows the initial relation
                iv_r = '_inverse_' + relation
                relation2id[iv_r] = index + 1
                id2relation[index + 1] = iv_r

                index += 2

        #get the id of the inverse relation, by above definition, initial relation has
        #always even id, while inverse relation has always odd id.
        def inverse_r(r):
            return r + 1 if r % 2 == 0 else r - 1

        ####entity dict###################
        index = len(entity2id)

        for triple in raw_triples:

            for entity in (triple[0], triple[2]):

                if entity not in entity2id:

                    entity2id[entity] = index
                    id2entity[index] = entity

                    index += 1

        #a previous call leaves lists in one_hop; turn the entry of a touched
        #entity back into a set so that this call can add to it
        def neighbours(e):
            current = one_hop.get(e)
            if not isinstance(current, set):
                current = set() if current is None else set(current)
                one_hop[e] = current
            return current

        #create the set of triples using id instead of string
        for triple in raw_triples:

            s = entity2id[triple[0]]
            r = relation2id[triple[1]]
            t = entity2id[triple[2]]
            r_inv = inverse_r(r)

            data.add((s, r, t))

            #forward edge
            s_t_r[(s, t)].add(r)
            neighbours(s).add((r, t))

            #inverse edge
            s_t_r[(t, s)].add(r_inv)
            neighbours(t).add((r_inv, s))

        #change each set in one_hop to list
        for e in one_hop:

            one_hop[e] = list(one_hop[e])

In [8]:
class ObtainPathsByDynamicProgramming:
    """Randomised, budget-limited search for relation paths starting at an entity."""

    def __init__(self, amount_bd=50, size_bd=50, threshold=20000):

        #at most this many (r, t) tuples of one_hop[node] are followed per visit
        self.amount_bd = amount_bd

        #at most this many paths are stored per target entity
        self.size_bd = size_bd

        #budget of expansion steps per path length
        self.threshold = threshold

    def obtain_paths(self, mode, s, t_input, lower_bd, upper_bd, one_hop):
        """Find paths from entity `s` to other entities, using recursion.

        One may refer to LeetCode Problem 797 for details:
            https://leetcode.com/problems/all-paths-from-source-to-target/

        Parameters:
            mode: 'direct_neighbour' (only direct neighbours of s collect paths),
                'target_specified' (only `t_input` collects paths) or
                'any_target' (every key of one_hop collects paths).
            s: start entity; has to be a key of one_hop.
            t_input: the target entity, read in mode 'target_specified' only.
            lower_bd, upper_bd: smallest and largest path length to store;
                both have to be int and >= 1, with lower_bd <= upper_bd.
            one_hop: dict, entity -> list of (relation, neighbour) tuples.

        Returns:
            (res, count_dict): res maps a target entity to the set of paths
            (tuples of relations) from s to it; count_dict maps a path
            length to the number of expansion steps taken at that length.

        Raises:
            TypeError: for invalid bounds.
            ValueError: if s is not in one_hop or the mode is unknown.
        """
        if type(lower_bd) is not int or lower_bd < 1:
            raise TypeError("!!! invalid lower bound setting, must >= 1 !!!")

        if type(upper_bd) is not int or upper_bd < 1:
            raise TypeError("!!! invalid upper bound setting, must >= 1 !!!")

        if lower_bd > upper_bd:
            raise TypeError("!!! lower bound must not exced upper bound !!!")

        if s not in one_hop:
            raise ValueError('!!! entity not in one_hop. Please work on existing entities')

        #qualified_t holds the entities that are allowed to collect paths
        if mode == 'direct_neighbour':
            qualified_t = {neighbour[1] for neighbour in one_hop[s]}
        elif mode == 'target_specified':
            qualified_t = {t_input}
        elif mode == 'any_target':
            qualified_t = set(one_hop)
        else:
            raise ValueError('not a valid mode')

        #result dict: target entity t -> set of paths from s to t, where a
        #path is either the direct connection (r,) or a multi-hop path
        res = defaultdict(set)

        #number of expansion steps taken so far, per path length
        count_dict = defaultdict(int)

        def helper(node, path, on_path_en):
            #`path` is the relation sequence from s to `node`,
            #`on_path_en` the set of entities visited on the way (s and node included)
            depth = len(path)

            #store the path if its length is within the bounds, the node may
            #collect paths and it has not yet reached its size limit
            if (lower_bd <= depth <= upper_bd) and (node in qualified_t) and (
                    len(res[node]) < self.size_bd):
                res[node].add(tuple(path))

            #stop here if the path cannot grow any more, or if the budget
            #for this path length is already used up
            if depth >= upper_bd or count_dict[depth] > self.threshold:
                return

            #visit the neighbours in random order, at most amount_bd of them
            order = list(range(len(one_hop[node])))
            random.shuffle(order)

            for position in order[:self.amount_bd]:

                r, t = one_hop[node][position][0], one_hop[node][position][1]

                #the step is counted even if it does not lead to a recursion
                count_dict[depth] += 1

                #never walk back to an entity that is already on the path
                if t in on_path_en:
                    continue

                if count_dict[depth] <= self.threshold:
                    helper(t, path + [r], on_path_en | {t})

        helper(s, [], {s})

        return (res, count_dict)

In [9]:
#instantiate the two helper classes: the KG loader and the path finder
Class_1, Class_2 = LoadKG(), ObtainPathsByDynamicProgramming()

In [10]:
#load the pickled ids and relation/entity dicts written for this model/dataset
#NOTE: pickle.load can execute code embedded in the file, so only open ID files you produced yourself
with open(f'../weight_bin/{ids_name}.pickle', 'rb') as handle:
    Dict = pickle.load(handle)

#training split
one_hop, data, s_t_r = Dict['one_hop'], Dict['data'], Dict['s_t_r']

#validation split
one_hop_valid, data_valid, s_t_r_valid = (
    Dict['one_hop_valid'], Dict['data_valid'], Dict['s_t_r_valid'])

#test split
one_hop_test, data_test, s_t_r_test = (
    Dict['one_hop_test'], Dict['data_test'], Dict['s_t_r_test'])

#dictionaries shared by all splits
entity2id, id2entity = Dict['entity2id'], Dict['id2entity']
relation2id, id2relation = Dict['relation2id'], Dict['id2relation']

#snapshot the entity/relation dicts as loaded, before any later cell adds new entities to them
entity2id_ini, id2entity_ini = deepcopy(entity2id), deepcopy(id2entity)
relation2id_ini, id2relation_ini = deepcopy(relation2id), deepcopy(id2relation)

#number of relation ids; also used as the padding id when paths are padded
num_r = len(id2relation)
num_r

400

In [11]:
#display the name of the ID pickle used above (sanity check)
ids_name

'IDs_SiaLP_6_new_fb237_v2'

In [12]:
#display the name of the model file (sanity check)
model_name

'Model_SiaLP_6_new_fb237_v2'

In [13]:
#load the saved Keras model from the weight directory
model = keras.models.load_model(f'../weight_bin/{model_name}.h5')

2023-05-16 13:18:10.627762: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
#load the saved one-hop neighbor model from the weight directory
model_2 = keras.models.load_model(f'../weight_bin/{one_hop_model_name}.h5')

In [15]:
#files of the inductive ("_ind") version of the dataset
ind_train_path = f'../data/{data_name}_ind/train.txt'
ind_valid_path = f'../data/{data_name}_ind/valid.txt'
ind_test_path = f'../data/{data_name}_ind/test.txt'

In [16]:
#load the inductive TRAIN file (ind_train_path); its one-hop dict is what the scoring functions walk later
one_hop_ind, data_ind, s_t_r_ind = dict(), set(), defaultdict(set)

#sizes of the shared dicts before loading, so we can see what this file adds
len_0, size_0 = len(relation2id), len(entity2id)

#fill in the sets and dicts (the shared entity/relation dicts are passed in and may grow)
Class_1.load_train_data(ind_train_path,
                        one_hop_ind, data_ind, s_t_r_ind,
                        entity2id, id2entity, relation2id, id2relation)

len_1, size_1 = len(relation2id), len(entity2id)

#a grown relation dict means the file contains a relation that was not in the loaded dicts
if len_1 != len_0:
    raise ValueError('unseen relation!')

In [17]:
#entity count before and after loading the inductive train file, and the size of data_ind
print(size_0, size_1, len(data_ind))

2608 4268 4145


In [18]:
#load the inductive test file (ind_test_path)
one_hop_ind_test, data_ind_test, s_t_r_ind_test = dict(), set(), defaultdict(set)

#sizes of the shared dicts before loading, so we can see what this file adds
len_0, size_0 = len(relation2id), len(entity2id)

#fill in the sets and dicts (the shared entity/relation dicts are passed in and may grow)
Class_1.load_train_data(ind_test_path,
                        one_hop_ind_test, data_ind_test, s_t_r_ind_test,
                        entity2id, id2entity, relation2id, id2relation)

len_1, size_1 = len(relation2id), len(entity2id)

#a grown relation dict means the file contains a relation that was not in the loaded dicts
if len_1 != len_0:
    raise ValueError('unseen relation!')

In [19]:
#entity count before and after loading the inductive test file, and the size of data_ind_test
print(size_0, size_1, len(data_ind_test))

4268 4268 478


In [20]:
#load the inductive validation file; its triples are used to filter out existing triples when ranking
one_hop_ind_valid, data_ind_valid, s_t_r_ind_valid = dict(), set(), defaultdict(set)

#sizes of the shared dicts before loading, so we can see what this file adds
len_0, size_0 = len(relation2id), len(entity2id)

#fill in the sets and dicts (the shared entity/relation dicts are passed in and may grow)
Class_1.load_train_data(ind_valid_path,
                        one_hop_ind_valid, data_ind_valid, s_t_r_ind_valid,
                        entity2id, id2entity, relation2id, id2relation)

len_1, size_1 = len(relation2id), len(entity2id)

#a grown relation dict means the file contains a relation that was not in the loaded dicts
if len_1 != len_0:
    raise ValueError('unseen relation!')

In [21]:
#entity count before and after loading the inductive validation file, and the size of data_ind_valid
print(size_0, size_1, len(data_ind_valid))

4268 4268 469


In [22]:
#current number of entities vs. the number in the snapshot taken right after loading the pickle
print(len(entity2id), len(entity2id_ini))

4268 2608


In [23]:
#split the entity ids into the initial ones (present in the snapshot) and the new ones
all_ent_set = set(id2entity)
ini_ent_set = {ID for ID in all_ent_set if ID in id2entity_ini}
new_ent_set = all_ent_set - ini_ent_set

print(len(ini_ent_set), len(new_ent_set), len(all_ent_set))

2608 1660 4268


In [24]:
#count the inductive test triples whose head or tail is one of the initial entities
overlapping = sum(1 for ele in data_ind_test
                  if ele[0] in id2entity_ini or ele[2] in id2entity_ini)

overlapping

0

In [25]:
#count the inductive validation triples whose head or tail is one of the initial entities
overlapping = sum(1 for ele in data_ind_valid
                  if ele[0] in id2entity_ini or ele[2] in id2entity_ini)

overlapping

0

In [26]:
#count the inductive train triples whose head or tail is one of the initial entities
overlapping = sum(1 for ele in data_ind
                  if ele[0] in id2entity_ini or ele[2] in id2entity_ini)

overlapping

0

In [27]:
#the function to do path-based relation scoring
def path_based_relation_scoring(s, t, lower_bd, upper_bd, one_hop, id2relation, model):
    """Score every even relation id for the pair (s, t) from sampled s->t paths.

    Paths from ``s`` to ``t`` are pooled over three runs of
    ``Class_2.obtain_paths`` in 'target_specified' mode. When at least three
    distinct paths exist, ten rounds are run: each round draws three paths at
    random, pads them to length ``upper_bd`` with the placeholder id ``num_r``
    and asks ``model`` to score every even relation id. The scores are then
    averaged over the rounds.

    Uses the notebook globals ``Class_2`` (path finder) and ``num_r``
    (padding id). Prints ``len(score_dict), len(path_holder)`` as a progress line.

    Args:
        s, t: source and target entity of the candidate triple.
        lower_bd, upper_bd: minimum / maximum path length handed to the path finder;
            ``upper_bd`` is also the padded length of every path fed to the model.
        one_hop: one-hop neighbour structure handed to the path finder.
        id2relation: relation dictionary; its keys must be 0..len-1.
        model: object whose ``predict([p1, p2, p3, r], verbose=0)`` returns one
            score per input row.

    Returns:
        defaultdict(float) mapping each even relation id to its mean score;
        empty when fewer than three paths were found.

    Raises:
        ValueError: if some id in 0..len(id2relation)-1 is missing from
            ``id2relation`` (only checked when scoring actually happens).
    """
    path_holder = set()

    for iteration in range(3):

        result, _unused_counts = Class_2.obtain_paths('target_specified',
                                                      s, t, lower_bd, upper_bd, one_hop)
        if t in result:

            path_holder.update(result[t])

    path_holder = list(path_holder)
    random.shuffle(path_holder)

    score_dict = defaultdict(float)
    count_dict = defaultdict(int)

    if len(path_holder) >= 3:

        #the relation ids must be dense; this is round-independent so check it once
        for i in range(len(id2relation)):

            if i not in id2relation:

                raise ValueError ('error when generating id2relation')

        #only care about initial relations (the even ids)
        relation_ids = list(range(0, len(id2relation), 2))
        input_r = np.array([[i] for i in relation_ids])

        for _round in range(10):

            sampled = random.sample(path_holder, 3)

            #pad each path once, then repeat the same row for every candidate relation
            padded = [list(path) + [num_r]*abs(len(path)-upper_bd) for path in sampled]
            path_inputs = [np.array([row]*len(relation_ids)) for row in padded]

            pred = model.predict(path_inputs + [input_r], verbose = 0)

            for i in range(pred.shape[0]):
                #need to times 2 to go back to relation id from pred position;
                #.item() avoids float() on a 1-element array, deprecated since NumPy 1.25
                score_dict[2*i] += pred[i].item()
                count_dict[2*i] += 1

    #average the score
    for r in score_dict:
        score_dict[r] = score_dict[r]/float(count_dict[r])

    print(len(score_dict), len(path_holder))

    return(score_dict)

In [28]:
#the function to do path-based triple scoring: input one triple
def path_based_triple_scoring(s, r, t, lower_bd, upper_bd, one_hop, id2relation, model):
    """Score the single triple (s, r, t) from sampled s->t paths.

    Paths from ``s`` to ``t`` are pooled over three runs of
    ``Class_2.obtain_paths`` in 'target_specified' mode. When at least three
    distinct paths exist, ten random triples of paths are drawn, padded to
    length ``upper_bd`` with the placeholder id ``num_r`` and scored together
    with relation ``r`` in one ``model.predict`` call; the mean of the ten
    predictions is returned.

    Uses the notebook globals ``Class_2`` (path finder) and ``num_r`` (padding id).

    Args:
        s, r, t: head entity, relation id and tail entity of the triple.
        lower_bd, upper_bd: minimum / maximum path length handed to the path finder;
            ``upper_bd`` is also the padded length of every path fed to the model.
        one_hop: one-hop neighbour structure handed to the path finder.
        id2relation: not used here; kept so the signature matches the relation-scoring function.
        model: object whose ``predict([p1, p2, p3, r], verbose=0)`` returns one
            score per input row.

    Returns:
        The mean prediction as a float, or 0.0 when fewer than three paths were found.
    """
    path_holder = set()

    for iteration in range(3):

        result, _unused_counts = Class_2.obtain_paths('target_specified',
                                                      s, t, lower_bd, upper_bd, one_hop)
        if t in result:

            path_holder.update(result[t])

    path_holder = list(path_holder)
    random.shuffle(path_holder)

    #not enough evidence to build a single model input
    if len(path_holder) < 3:
        return(0.)

    rounds = 10

    #one list of padded paths per model input slot
    columns = ([], [], [])
    list_r = list()

    for _round in range(rounds):

        sampled = random.sample(path_holder, 3)

        for rows, path in zip(columns, sampled):
            rows.append(list(path) + [num_r]*abs(len(path)-upper_bd))

        list_r.append([r])

    #change to arrays
    inputs = [np.array(rows) for rows in columns] + [np.array(list_r)]

    pred = model.predict(inputs, verbose = 0)

    score = 0.
    for i in range(pred.shape[0]):
        #.item() avoids float() on a 1-element array, deprecated since NumPy 1.25
        score += pred[i].item()

    #average the score
    return(score/float(rounds))

In [29]:
#subgraph based relation scoring
def subgraph_relation_scoring(s, t, lower_bd, upper_bd, one_hop, id2relation, model_2):
    """Score every even relation id for the pair (s, t) from paths leaving s and t.

    Paths starting at ``s`` and paths starting at ``t`` are pooled over three
    runs of ``Class_2.obtain_paths`` in 'any_target' mode. When both pools
    hold at least six distinct paths, ten rounds are run: each round draws six
    paths per side at random, pads them to length ``upper_bd`` with the
    placeholder id ``num_r`` and asks ``model_2`` to score every even relation
    id. The scores are then averaged over the rounds.

    Uses the notebook globals ``Class_2`` (path finder) and ``num_r``
    (padding id). Prints ``len(score_dict), len(path_s), len(path_t)`` as a progress line.

    Args:
        s, t: source and target entity of the candidate triple.
        lower_bd, upper_bd: minimum / maximum path length handed to the path finder;
            ``upper_bd`` is also the padded length of every path fed to the model.
        one_hop: one-hop neighbour structure handed to the path finder.
        id2relation: relation dictionary; its keys must be 0..len-1.
        model_2: object whose ``predict`` takes 13 inputs (six source paths,
            six target paths, relation id) and returns one score per row.

    Returns:
        defaultdict(float) mapping each even relation id to its mean score;
        empty when either side has fewer than six paths.

    Raises:
        ValueError: if some id in 0..len(id2relation)-1 is missing from
            ``id2relation`` (only checked when scoring actually happens).
    """
    path_s, path_t = set(), set() #sets holding all the paths from s or t

    for iteration in range(3):

        #obtain the paths out from s or t by "any target" mode
        result_s, _unused_s = Class_2.obtain_paths('any_target', s, 'any', lower_bd, upper_bd, one_hop)
        result_t, _unused_t = Class_2.obtain_paths('any_target', t, 'any', lower_bd, upper_bd, one_hop)

        #add paths to the source/target path_set
        for e in result_s:
            path_s.update(result_s[e])
        for e in result_t:
            path_t.update(result_t[e])

    #final output: the score dict
    score_dict = defaultdict(float)
    count_dict = defaultdict(int)

    #see if both path_s and path_t have at least six paths
    if len(path_s) >= 6 and len(path_t) >= 6:

        #change to lists (random.sample needs a sequence)
        path_s, path_t = list(path_s), list(path_t)

        #the relation ids must be dense; this is round-independent so check it once
        for i in range(len(id2relation)):

            if i not in id2relation:

                raise ValueError ('error when generating id2relation')

        #only the forward (initial) relations, i.e. the even ids
        relation_ids = list(range(0, len(id2relation), 2))
        input_r = np.array([[i] for i in relation_ids])

        for _round in range(10):

            #randomly obtain six paths per side
            temp_s = random.sample(path_s, 6)
            temp_t = random.sample(path_t, 6)

            #pad each path once (placeholder id at the end of the shorter path),
            #then repeat the same row for every candidate relation
            path_inputs = [
                np.array([list(path) + [num_r]*abs(len(path)-upper_bd)]*len(relation_ids))
                for path in temp_s + temp_t]

            pred = model_2.predict(path_inputs + [input_r], verbose = 0)

            for i in range(pred.shape[0]):
                #need to times 2 to go back to relation id from pred position;
                #.item() avoids float() on a 1-element array, deprecated since NumPy 1.25
                score_dict[2*i] += pred[i].item()
                count_dict[2*i] += 1

    #average the score
    for r in score_dict:
        score_dict[r] = score_dict[r]/float(count_dict[r])

    print(len(score_dict), len(path_s), len(path_t))

    return(score_dict)

In [30]:
#subgraph based triple scoring
def subgraph_triple_scoring(s, r, t, lower_bd, upper_bd, one_hop, id2relation, model_2):
    """Score the single triple (s, r, t) from paths leaving s and paths leaving t.

    Paths starting at ``s`` and paths starting at ``t`` are pooled over three
    runs of ``Class_2.obtain_paths`` in 'any_target' mode. When both pools
    hold at least six distinct paths, ten random draws of six paths per side
    are padded to length ``upper_bd`` with the placeholder id ``num_r`` and
    scored together with relation ``r`` in one ``model_2.predict`` call; the
    mean of the ten predictions is returned.

    Uses the notebook globals ``Class_2`` (path finder) and ``num_r`` (padding id).

    Args:
        s, r, t: head entity, relation id and tail entity of the triple.
        lower_bd, upper_bd: minimum / maximum path length handed to the path finder;
            ``upper_bd`` is also the padded length of every path fed to the model.
        one_hop: one-hop neighbour structure handed to the path finder.
        id2relation: not used here; kept so the signature matches the relation-scoring function.
        model_2: object whose ``predict`` takes 13 inputs (six source paths,
            six target paths, relation id) and returns one score per row.

    Returns:
        The mean prediction as a float, or 0.0 when either side has fewer than six paths.
    """
    path_s, path_t = set(), set() #sets holding all the paths from s or t

    for iteration in range(3):

        #obtain the paths out from s or t by "any target" mode
        result_s, _unused_s = Class_2.obtain_paths('any_target', s, 'any', lower_bd, upper_bd, one_hop)
        result_t, _unused_t = Class_2.obtain_paths('any_target', t, 'any', lower_bd, upper_bd, one_hop)

        #add paths to the source/target path_set
        for e in result_s:
            path_s.update(result_s[e])
        for e in result_t:
            path_t.update(result_t[e])

    #both sides need at least six paths, otherwise the triple gets the neutral score
    if len(path_s) < 6 or len(path_t) < 6:
        return(0.)

    #change to lists (random.sample needs a sequence)
    path_s, path_t = list(path_s), list(path_t)

    rounds = 10

    #one list of padded paths per model input slot: six source slots, then six target slots
    columns = [[] for _slot in range(12)]
    list_r = list()

    for _round in range(rounds):

        #randomly obtain six paths per side
        temp_s = random.sample(path_s, 6)
        temp_t = random.sample(path_t, 6)

        #append the paths: the placeholder id is added at the end of the shorter path
        for rows, path in zip(columns, temp_s + temp_t):
            rows.append(list(path) + [num_r]*abs(len(path)-upper_bd))

        list_r.append([r])

    #change to arrays
    inputs = [np.array(rows) for rows in columns] + [np.array(list_r)]

    pred = model_2.predict(inputs, verbose = 0)

    score = 0.
    for i in range(pred.shape[0]):
        #.item() avoids float() on a 1-element array, deprecated since NumPy 1.25
        score += pred[i].item()

    #average the score
    return(score/float(rounds))

#### Not fine-tuned

In [31]:
########################################################
#obtain the Hits@N for relation prediction##############

#every triple of the inductive test set is evaluated
selected = list(data_ind_test)

#running totals; the true relation is ranked against all other initial relations
Hits_at_1 = 0
Hits_at_3 = 0
Hits_at_10 = 0
MRR_raw = 0.

for i, triple in enumerate(selected):

    s_true, r_true, t_true = triple[0], triple[1], triple[2]

    #run the path-based scoring
    score_dict_path = path_based_relation_scoring(s_true, t_true, lower_bound, upper_bound_path, one_hop_ind, id2relation, model)

    #run the one-hop neighbour based scoring
    score_dict_subg = subgraph_relation_scoring(s_true, t_true, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    #final score dict: the sum of the two partial scores per relation
    score_dict = defaultdict(float)

    for partial_scores in (score_dict_path, score_dict_subg):
        for r, value in partial_scores.items():
            score_dict[r] += value

    #[... [score, r], ...] for the initial (even id) relations only;
    #a relation that received no score counts as 0.0
    temp_list = [[score_dict.get(r, 0.0), r] for r in id2relation if r % 2 == 0]

    sorted_list = sorted(temp_list, key = lambda pair: pair[0], reverse=True)

    #p walks down the ranking until the true relation is found;
    #exist_tri counts the better-ranked relations that form an already known triple
    p = 0
    exist_tri = 0

    while p < len(sorted_list) and sorted_list[p][1] != r_true:

        candidate = (s_true, sorted_list[p][1], t_true)

        #moreover, we want to remove existing triples
        if any(candidate in known for known in (data_test, data_valid, data,
                                                data_ind, data_ind_valid, data_ind_test)):

            exist_tri += 1

        p += 1

    #filtered rank of the true relation (0 means ranked first)
    cur_rank = p - exist_tri

    Hits_at_1 += int(cur_rank == 0)
    Hits_at_3 += int(cur_rank < 3)
    Hits_at_10 += int(cur_rank < 10)

    MRR_raw += 1./float(cur_rank + 1.)

    print('checkcorrect', r_true, sorted_list[p][1],
          'real score', sorted_list[p][0],
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'cur_rank', cur_rank,
          'abs_cur_rank', p,
          'total_num', i, len(selected))

200 148
200 144 121
checkcorrect 222 222 real score 1.9446585416793822 Hits@1 1.0 Hits@3 1.0 Hits@10 1.0 MRR 1.0 cur_rank 0 abs_cur_rank 0 total_num 0 478
0 0
0 3 521
checkcorrect 68 68 real score 0.0 Hits@1 0.5 Hits@3 0.5 Hits@10 0.5 MRR 0.5142857142857142 cur_rank 34 abs_cur_rank 34 total_num 1 478
0 0
200 311 33
checkcorrect 22 22 real score 0.4203387677669525 Hits@1 0.3333333333333333 Hits@3 0.3333333333333333 Hits@10 0.6666666666666666 MRR 0.3984126984126984 cur_rank 5 abs_cur_rank 5 total_num 2 478
200 105
200 334 225
checkcorrect 64 64 real score 1.8876660525798798 Hits@1 0.5 Hits@3 0.5 Hits@10 0.75 MRR 0.5488095238095239 cur_rank 0 abs_cur_rank 0 total_num 3 478
200 137
200 64 170
checkcorrect 300 300 real score 1.782987189292908 Hits@1 0.6 Hits@3 0.6 Hits@10 0.8 MRR 0.6390476190476191 cur_rank 0 abs_cur_rank 0 total_num 4 478
200 39
200 122 70
checkcorrect 100 100 real score 1.6852366983890534 Hits@1 0.6666666666666666 Hits@3 0.6666666666666666 Hits@10 0.8333333333333334 MRR 0

200 103
200 57 26
checkcorrect 164 164 real score 1.9601969301700592 Hits@1 0.7209302325581395 Hits@3 0.8604651162790697 Hits@10 0.9069767441860465 MRR 0.7915196522731385 cur_rank 0 abs_cur_rank 0 total_num 42 478
200 128
200 206 148
checkcorrect 64 64 real score 1.869527292251587 Hits@1 0.7272727272727273 Hits@3 0.8636363636363636 Hits@10 0.9090909090909091 MRR 0.7962578419942035 cur_rank 0 abs_cur_rank 0 total_num 43 478
200 150
200 140 116
checkcorrect 190 190 real score 1.820012629032135 Hits@1 0.7333333333333333 Hits@3 0.8666666666666667 Hits@10 0.9111111111111111 MRR 0.8007854455054434 cur_rank 0 abs_cur_rank 0 total_num 44 478
200 124
200 46 167
checkcorrect 42 42 real score 1.8813245475292204 Hits@1 0.7391304347826086 Hits@3 0.8695652173913043 Hits@10 0.9130434782608695 MRR 0.8051161966901077 cur_rank 0 abs_cur_rank 0 total_num 45 478
200 150
200 96 174
checkcorrect 190 190 real score 1.9212297737598418 Hits@1 0.7446808510638298 Hits@3 0.8723404255319149 Hits@10 0.9148936170212

200 18
200 664 19
checkcorrect 192 192 real score 1.331368061900139 Hits@1 0.7951807228915663 Hits@3 0.891566265060241 Hits@10 0.9397590361445783 MRR 0.8493860684082162 cur_rank 3 abs_cur_rank 3 total_num 82 478
200 142
200 25 322
checkcorrect 100 100 real score 1.9008499443531037 Hits@1 0.7976190476190477 Hits@3 0.8928571428571429 Hits@10 0.9404761904761905 MRR 0.8511790914033565 cur_rank 0 abs_cur_rank 0 total_num 83 478
200 150
200 352 741
checkcorrect 124 124 real score 1.9533068180084228 Hits@1 0.788235294117647 Hits@3 0.8941176470588236 Hits@10 0.9411764705882353 MRR 0.8470475726809641 cur_rank 1 abs_cur_rank 1 total_num 84 478
0 2
200 10 207
checkcorrect 146 146 real score 0.814994078874588 Hits@1 0.7906976744186046 Hits@3 0.8953488372093024 Hits@10 0.9418604651162791 MRR 0.848826089277697 cur_rank 0 abs_cur_rank 0 total_num 85 478
0 1
200 156 54
checkcorrect 192 192 real score 0.7768915951251983 Hits@1 0.7931034482758621 Hits@3 0.896551724137931 Hits@10 0.9425287356321839 MRR 0

200 94
200 321 198
checkcorrect 42 42 real score 1.8755837380886078 Hits@1 0.8114754098360656 Hits@3 0.9098360655737705 Hits@10 0.9590163934426229 MRR 0.8666002799514132 cur_rank 0 abs_cur_rank 0 total_num 121 478
200 66
200 132 138
checkcorrect 2 2 real score 1.9263771712779998 Hits@1 0.8130081300813008 Hits@3 0.9105691056910569 Hits@10 0.959349593495935 MRR 0.8676848305209139 cur_rank 0 abs_cur_rank 1 total_num 122 478
200 150
200 686 864
checkcorrect 124 124 real score 1.9717475414276122 Hits@1 0.8145161290322581 Hits@3 0.9112903225806451 Hits@10 0.9596774193548387 MRR 0.8687518883392936 cur_rank 0 abs_cur_rank 1 total_num 123 478
200 150
200 129 607
checkcorrect 128 128 real score 1.9897211253643037 Hits@1 0.816 Hits@3 0.912 Hits@10 0.96 MRR 0.8698018732325793 cur_rank 0 abs_cur_rank 0 total_num 124 478
200 20
200 51 66
checkcorrect 66 66 real score 1.940504390001297 Hits@1 0.8174603174603174 Hits@3 0.9126984126984127 Hits@10 0.9603174603174603 MRR 0.8708351916989874 cur_rank 0 abs

200 148
200 293 726
checkcorrect 46 46 real score 1.9694089233875274 Hits@1 0.8148148148148148 Hits@3 0.9135802469135802 Hits@10 0.9629629629629629 MRR 0.8692613098351117 cur_rank 0 abs_cur_rank 0 total_num 161 478
0 1
200 96 25
checkcorrect 66 66 real score 0.6770562410354615 Hits@1 0.8098159509202454 Hits@3 0.9079754601226994 Hits@10 0.9631901840490797 MRR 0.865462160695019 cur_rank 3 abs_cur_rank 3 total_num 162 478
200 146
200 825 186
checkcorrect 48 48 real score 1.9352295398712158 Hits@1 0.8109756097560976 Hits@3 0.9085365853658537 Hits@10 0.9634146341463414 MRR 0.8662825133737079 cur_rank 0 abs_cur_rank 0 total_num 163 478
200 147
200 104 211
checkcorrect 36 36 real score 1.9365537881851198 Hits@1 0.8121212121212121 Hits@3 0.9090909090909091 Hits@10 0.9636363636363636 MRR 0.8670929223835642 cur_rank 0 abs_cur_rank 0 total_num 164 478
200 150
200 846 859
checkcorrect 82 82 real score 1.9574581801891326 Hits@1 0.8132530120481928 Hits@3 0.9096385542168675 Hits@10 0.963855421686747 

200 7
200 54 22
checkcorrect 64 64 real score 1.569327250123024 Hits@1 0.8009950248756219 Hits@3 0.9054726368159204 Hits@10 0.9552238805970149 MRR 0.8603489268455091 cur_rank 0 abs_cur_rank 0 total_num 200 478
0 2
200 15 27
checkcorrect 70 70 real score 0.8975332796573638 Hits@1 0.801980198019802 Hits@3 0.905940594059406 Hits@10 0.9554455445544554 MRR 0.8610402687918185 cur_rank 0 abs_cur_rank 1 total_num 201 478
200 150
200 633 657
checkcorrect 82 82 real score 1.9819040834903716 Hits@1 0.8029556650246306 Hits@3 0.9064039408866995 Hits@10 0.9556650246305419 MRR 0.8617247994874253 cur_rank 0 abs_cur_rank 0 total_num 202 478
200 150
200 294 393
checkcorrect 64 64 real score 1.8909637987613679 Hits@1 0.803921568627451 Hits@3 0.9068627450980392 Hits@10 0.9558823529411765 MRR 0.862402619097781 cur_rank 0 abs_cur_rank 0 total_num 203 478
200 34
200 26 16
checkcorrect 70 70 real score 1.6377105832099916 Hits@1 0.8048780487804879 Hits@3 0.9073170731707317 Hits@10 0.9560975609756097 MRR 0.8630

200 53
200 164 703
checkcorrect 170 170 real score 1.6493695676326752 Hits@1 0.8208333333333333 Hits@3 0.9083333333333333 Hits@10 0.9541666666666667 MRR 0.871307947311335 cur_rank 0 abs_cur_rank 2 total_num 239 478
200 150
200 144 294
checkcorrect 78 78 real score 1.9615601181983946 Hits@1 0.8215767634854771 Hits@3 0.9087136929460581 Hits@10 0.9543568464730291 MRR 0.871841939231205 cur_rank 0 abs_cur_rank 0 total_num 240 478
200 150
200 272 403
checkcorrect 46 46 real score 1.9739081799983977 Hits@1 0.8223140495867769 Hits@3 0.9090909090909091 Hits@10 0.9545454545454546 MRR 0.8723715179947124 cur_rank 0 abs_cur_rank 0 total_num 241 478
200 71
200 59 260
checkcorrect 42 42 real score 1.7682445704936982 Hits@1 0.823045267489712 Hits@3 0.9094650205761317 Hits@10 0.9547325102880658 MRR 0.8728967380852691 cur_rank 0 abs_cur_rank 1 total_num 242 478
200 48
200 79 25
checkcorrect 196 196 real score 1.9366883873939513 Hits@1 0.8237704918032787 Hits@3 0.9098360655737705 Hits@10 0.95491803278688

200 114
200 173 184
checkcorrect 42 42 real score 1.5741823256015777 Hits@1 0.8207885304659498 Hits@3 0.921146953405018 Hits@10 0.9605734767025089 MRR 0.8743628698496551 cur_rank 0 abs_cur_rank 1 total_num 278 478
200 150
200 611 582
checkcorrect 48 48 real score 1.9636188209056855 Hits@1 0.8214285714285714 Hits@3 0.9214285714285714 Hits@10 0.9607142857142857 MRR 0.8748115738859064 cur_rank 0 abs_cur_rank 0 total_num 279 478
200 150
200 494 576
checkcorrect 42 42 real score 1.8146029472351075 Hits@1 0.8220640569395018 Hits@3 0.9217081850533808 Hits@10 0.9608540925266904 MRR 0.8752570842991237 cur_rank 0 abs_cur_rank 1 total_num 280 478
200 125
200 70 610
checkcorrect 12 12 real score 1.7273997008800506 Hits@1 0.8226950354609929 Hits@3 0.9219858156028369 Hits@10 0.9609929078014184 MRR 0.8756994350640205 cur_rank 0 abs_cur_rank 0 total_num 281 478
200 150
200 637 374
checkcorrect 2 2 real score 1.8127513885498048 Hits@1 0.8197879858657244 Hits@3 0.9222614840989399 Hits@10 0.9611307420494

200 70
200 17 322
checkcorrect 100 100 real score 1.8761775135993957 Hits@1 0.8144654088050315 Hits@3 0.9182389937106918 Hits@10 0.9559748427672956 MRR 0.8704724898441649 cur_rank 0 abs_cur_rank 0 total_num 317 478
200 66
200 836 69
checkcorrect 248 248 real score 1.7748019576072693 Hits@1 0.8150470219435737 Hits@3 0.9184952978056427 Hits@10 0.9561128526645768 MRR 0.8708785321957506 cur_rank 0 abs_cur_rank 0 total_num 318 478
200 97
200 235 287
checkcorrect 12 12 real score 1.809088009595871 Hits@1 0.815625 Hits@3 0.91875 Hits@10 0.95625 MRR 0.8712820367826388 cur_rank 0 abs_cur_rank 0 total_num 319 478
200 92
200 142 100
checkcorrect 156 156 real score 1.8132616221904754 Hits@1 0.8161993769470405 Hits@3 0.9190031152647975 Hits@10 0.956386292834891 MRR 0.8716830273222568 cur_rank 0 abs_cur_rank 0 total_num 320 478
200 150
200 561 269
checkcorrect 92 92 real score 1.9729817926883697 Hits@1 0.8167701863354038 Hits@3 0.9192546583850931 Hits@10 0.9565217391304348 MRR 0.8720815272374051 cur

200 32
200 119 82
checkcorrect 12 12 real score 1.8764046549797058 Hits@1 0.8179271708683473 Hits@3 0.9243697478991597 Hits@10 0.957983193277311 MRR 0.873436622652663 cur_rank 0 abs_cur_rank 0 total_num 356 478
200 150
200 246 306
checkcorrect 100 100 real score 1.852527630329132 Hits@1 0.8184357541899442 Hits@3 0.9245810055865922 Hits@10 0.9581005586592178 MRR 0.8737901516396668 cur_rank 0 abs_cur_rank 0 total_num 357 478
200 143
200 98 150
checkcorrect 194 194 real score 1.848041445016861 Hits@1 0.8189415041782729 Hits@3 0.924791086350975 Hits@10 0.958217270194986 MRR 0.8741417111058515 cur_rank 0 abs_cur_rank 0 total_num 358 478
200 150
200 853 254
checkcorrect 82 82 real score 1.9827720820903778 Hits@1 0.8194444444444444 Hits@3 0.925 Hits@10 0.9583333333333334 MRR 0.8744913174638909 cur_rank 0 abs_cur_rank 0 total_num 359 478
200 66
200 131 139
checkcorrect 66 66 real score 1.9480896830558776 Hits@1 0.8199445983379502 Hits@3 0.925207756232687 Hits@10 0.9584487534626038 MRR 0.874838

200 14
200 145 89
checkcorrect 2 2 real score 1.9206718444824218 Hits@1 0.8156565656565656 Hits@3 0.9217171717171717 Hits@10 0.9545454545454546 MRR 0.8714935355201221 cur_rank 0 abs_cur_rank 1 total_num 395 478
200 58
200 14 99
checkcorrect 244 244 real score 1.6732451796531678 Hits@1 0.8161209068010076 Hits@3 0.9219143576826196 Hits@10 0.9546599496221663 MRR 0.8718172293853107 cur_rank 0 abs_cur_rank 0 total_num 396 478
200 97
200 64 50
checkcorrect 12 12 real score 1.742203164100647 Hits@1 0.8165829145728644 Hits@3 0.9221105527638191 Hits@10 0.9547738693467337 MRR 0.8721392966481617 cur_rank 0 abs_cur_rank 0 total_num 397 478
200 145
200 88 81
checkcorrect 78 78 real score 1.9690884947776794 Hits@1 0.8170426065162907 Hits@3 0.9223057644110275 Hits@10 0.9548872180451128 MRR 0.8724597495387678 cur_rank 0 abs_cur_rank 0 total_num 398 478
200 9
200 45 48
checkcorrect 134 134 real score 1.6845453560352326 Hits@1 0.815 Hits@3 0.9225 Hits@10 0.955 MRR 0.8711119334982541 cur_rank 2 abs_cur_r

200 142
200 665 139
checkcorrect 12 12 real score 1.5969957798719405 Hits@1 0.8091954022988506 Hits@3 0.9218390804597701 Hits@10 0.9563218390804598 MRR 0.8681533252327006 cur_rank 0 abs_cur_rank 0 total_num 434 478
200 150
200 112 89
checkcorrect 50 50 real score 1.8161348164081574 Hits@1 0.8096330275229358 Hits@3 0.9220183486238532 Hits@10 0.9564220183486238 MRR 0.8684557258629008 cur_rank 0 abs_cur_rank 0 total_num 435 478
200 62
200 246 40
checkcorrect 168 168 real score 1.419528493285179 Hits@1 0.8100686498855835 Hits@3 0.9221967963386728 Hits@10 0.9565217391304348 MRR 0.8687567425085234 cur_rank 0 abs_cur_rank 0 total_num 436 478
200 150
200 511 377
checkcorrect 12 12 real score 1.9085771441459656 Hits@1 0.8105022831050228 Hits@3 0.9223744292237442 Hits@10 0.95662100456621 MRR 0.869056384648915 cur_rank 0 abs_cur_rank 0 total_num 437 478
200 14
200 922 58
checkcorrect 30 30 real score 1.6911819577217102 Hits@1 0.8109339407744874 Hits@3 0.9225512528473804 Hits@10 0.9567198177676538

200 4
200 147 8
checkcorrect 148 148 real score 1.821870172023773 Hits@1 0.8059071729957806 Hits@3 0.919831223628692 Hits@10 0.9556962025316456 MRR 0.8658617286516069 cur_rank 0 abs_cur_rank 0 total_num 473 478
200 63
200 409 36
checkcorrect 20 20 real score 1.8872158586978913 Hits@1 0.8063157894736842 Hits@3 0.92 Hits@10 0.9557894736842105 MRR 0.8661441250123404 cur_rank 0 abs_cur_rank 0 total_num 474 478
200 150
200 606 845
checkcorrect 12 12 real score 1.8857073843479157 Hits@1 0.8067226890756303 Hits@3 0.9201680672268907 Hits@10 0.9558823529411765 MRR 0.8664253348337431 cur_rank 0 abs_cur_rank 0 total_num 475 478
200 19
200 75 83
checkcorrect 22 22 real score 1.8042038023471831 Hits@1 0.8050314465408805 Hits@3 0.9203354297693921 Hits@10 0.9559748427672956 MRR 0.8656571475489763 cur_rank 1 abs_cur_rank 1 total_num 476 478
200 150
200 609 339
checkcorrect 48 48 real score 1.8893370509147644 Hits@1 0.805439330543933 Hits@3 0.9205020920502092 Hits@10 0.9560669456066946 MRR 0.8659381995

In [32]:
###########################################
##obtain the AUC-PR for the test triples###
#
#For every positive triple of the inductive test set one negative triple is
#built by corrupting its head or its tail entity. Every triple is scored with
#the mean of the path-based and the subgraph-based score, and the area under
#the precision-recall curve is printed (a running value every 20 triples and
#the final value at the end).
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve
#NOTE: `plot_precision_recall_curve` is not imported any more: this cell never
#used it and it was removed from scikit-learn in release 1.2, so importing it
#makes the whole cell fail on current versions.
from sklearn.metrics import auc
import matplotlib.pyplot as plt

#entities that may be used as replacement; built once instead of once per draw
_neg_candidates = list(new_ent_set)

#a corrupted triple is rejected when it occurs in any of these collections
_known_triple_sets = (data_test, data_valid, data,
                      data_ind, data_ind_valid, data_ind_test)


def _corrupt_triple(s_pos, r_pos, t_pos):
    """Return a negative triple obtained by replacing the head or the tail.

    With probability 0.5 the head is replaced, otherwise the tail. Replacement
    entities are drawn from `_neg_candidates` until the corrupted triple is
    found in none of `_known_triple_sets`.
    """
    replace_head = random.uniform(0, 1) < 0.5

    while True:
        entity = random.choice(_neg_candidates)

        if replace_head:
            candidate = (entity, r_pos, t_pos)
        else:
            candidate = (s_pos, r_pos, entity)

        #filter out the existing triples
        if not any(candidate in triples for triples in _known_triple_sets):
            return candidate


def _combined_score(s, r, t):
    """Return the mean of the path-based and the subgraph-based triple score."""
    path_score = path_based_triple_scoring(s, r, t, lower_bound, upper_bound_path, one_hop_ind, id2relation, model)

    subg_score = subgraph_triple_scoring(s, r, t, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    return (path_score + subg_score)/float(2)


#we select all the triples in the inductive test set
pos_triples = list(data_ind_test)

#we build the negative samples by randomly replacing the head or tail entity in the triple.
neg_triples = [_corrupt_triple(triple[0], triple[1], triple[2]) for triple in pos_triples]

if len(pos_triples) != len(neg_triples):
    raise ValueError('error when generating negative triples')

#combine all triples
all_triples = pos_triples + neg_triples

#obtain the label array: 1 for positive triples, 0 for negative triples
arr1 = np.ones((len(pos_triples),))
arr2 = np.zeros((len(neg_triples),))
y_test = np.concatenate((arr1, arr2))

#shuffle positive and negative triples (optional)
all_triples, y_test = shuffle(all_triples, y_test)

#obtain the score array
y_score = np.zeros((len(y_test),))

#implement the scoring
for i, triple in enumerate(all_triples):

    s, r, t = triple[0], triple[1], triple[2]

    y_score[i] = _combined_score(s, r, t)

    #running estimate over the first i triples
    if i % 20 == 0 and i > 0:
        print('evaluating scores', i, len(all_triples))

        # Data to plot precision - recall curve
        precision, recall, thresholds = precision_recall_curve(y_test[:i], y_score[:i])
        # Use AUC function to calculate the area under the curve of precision recall curve
        auc_precision_recall = auc(recall, precision)
        print('AUC-PR is:', auc_precision_recall)


# Data to plot precision - recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# Use AUC function to calculate the area under the curve of precision recall curve
auc_precision_recall = auc(recall, precision)
print('AUC-PR is:', auc_precision_recall)

evaluating scores 20 956
AUC-PR is: 0.8766954416770595
evaluating scores 40 956
AUC-PR is: 0.9093881985703731
evaluating scores 60 956
AUC-PR is: 0.8970946922498327
evaluating scores 80 956
AUC-PR is: 0.8863860299550168
evaluating scores 100 956
AUC-PR is: 0.8971731916799096
evaluating scores 120 956
AUC-PR is: 0.8877589980228919
evaluating scores 140 956
AUC-PR is: 0.8889968988553405
evaluating scores 160 956
AUC-PR is: 0.8982832411291368
evaluating scores 180 956
AUC-PR is: 0.9001539698590233
evaluating scores 200 956
AUC-PR is: 0.900289804321931
evaluating scores 220 956
AUC-PR is: 0.9037671526647635
evaluating scores 240 956
AUC-PR is: 0.9014719291552402
evaluating scores 260 956
AUC-PR is: 0.9049807707380069
evaluating scores 280 956
AUC-PR is: 0.9062918512538483
evaluating scores 300 956
AUC-PR is: 0.909685805793341
evaluating scores 320 956
AUC-PR is: 0.9152205040563294
evaluating scores 340 956
AUC-PR is: 0.9165924690295068
evaluating scores 360 956
AUC-PR is: 0.921091033554405

In [33]:
##########################################################
##obtain the AUC-PR for the test triples, using sklearn###
#
#For every positive triple of the inductive test set one negative triple is
#built by corrupting its head or its tail entity. Every triple is scored with
#the mean of the path-based and the subgraph-based score; ROC-AUC and average
#precision are printed (running values every 20 triples, final values at the
#end).
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve
#NOTE: `plot_precision_recall_curve` is not imported any more: this cell never
#used it and it was removed from scikit-learn in release 1.2, so importing it
#makes the whole cell fail on current versions.
from sklearn.metrics import auc
import matplotlib.pyplot as plt

#entities that may be used as replacement; built once instead of once per draw
_neg_candidates = list(new_ent_set)

#a corrupted triple is rejected when it occurs in any of these collections
_known_triple_sets = (data_test, data_valid, data,
                      data_ind, data_ind_valid, data_ind_test)


def _corrupt_triple(s_pos, r_pos, t_pos):
    """Return a negative triple obtained by replacing the head or the tail.

    With probability 0.5 the head is replaced, otherwise the tail. Replacement
    entities are drawn from `_neg_candidates` until the corrupted triple is
    found in none of `_known_triple_sets`.
    """
    replace_head = random.uniform(0, 1) < 0.5

    while True:
        entity = random.choice(_neg_candidates)

        if replace_head:
            candidate = (entity, r_pos, t_pos)
        else:
            candidate = (s_pos, r_pos, entity)

        #filter out the existing triples
        if not any(candidate in triples for triples in _known_triple_sets):
            return candidate


def _combined_score(s, r, t):
    """Return the mean of the path-based and the subgraph-based triple score."""
    path_score = path_based_triple_scoring(s, r, t, lower_bound, upper_bound_path, one_hop_ind, id2relation, model)

    subg_score = subgraph_triple_scoring(s, r, t, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    return (path_score + subg_score)/float(2)


#we select all the triples in the inductive test set
pos_triples = list(data_ind_test)

#we build the negative samples by randomly replacing the head or tail entity in the triple.
neg_triples = [_corrupt_triple(triple[0], triple[1], triple[2]) for triple in pos_triples]

if len(pos_triples) != len(neg_triples):
    raise ValueError('error when generating negative triples')

#combine all triples
all_triples = pos_triples + neg_triples

#obtain the label array: 1 for positive triples, 0 for negative triples
arr1 = np.ones((len(pos_triples),))
arr2 = np.zeros((len(neg_triples),))
y_test = np.concatenate((arr1, arr2))

#shuffle positive and negative triples (optional)
all_triples, y_test = shuffle(all_triples, y_test)

#obtain the score array
y_score = np.zeros((len(y_test),))

#implement the scoring
#NOTE: the result name `auc` below rebinds the `auc` function imported from
#sklearn.metrics to a float; the name is kept so the values left behind by
#this cell stay the same.
for i, triple in enumerate(all_triples):

    s, r, t = triple[0], triple[1], triple[2]

    y_score[i] = _combined_score(s, r, t)

    #running estimate over the first i triples
    if i % 20 == 0 and i > 0:
        print('evaluating scores', i, len(all_triples))
        auc = metrics.roc_auc_score(y_test[:i], y_score[:i])
        auc_pr = metrics.average_precision_score(y_test[:i], y_score[:i])
        print('auc, auc-pr', auc, auc_pr)

print('evaluating scores', i, len(all_triples))
auc = metrics.roc_auc_score(y_test, y_score)
auc_pr = metrics.average_precision_score(y_test, y_score)
print('(final) auc, auc-pr', auc, auc_pr)

evaluating scores 20 956
auc, auc-pr 0.9494949494949495 0.9282627865961202
evaluating scores 40 956
auc, auc-pr 0.9525 0.9533532814346182
evaluating scores 60 956
auc, auc-pr 0.9321468298109009 0.9428056635257618
evaluating scores 80 956
auc, auc-pr 0.9324577861163227 0.9417730240468616
evaluating scores 100 956
auc, auc-pr 0.9424 0.9497106694290974
evaluating scores 120 956
auc, auc-pr 0.9393770856507231 0.9413000713705634
evaluating scores 140 956
auc, auc-pr 0.9418248622167789 0.950822201542001
evaluating scores 160 956
auc, auc-pr 0.9462525426380848 0.9556558803587307
evaluating scores 180 956
auc, auc-pr 0.9448697370045686 0.9507498996560426
evaluating scores 200 956
auc, auc-pr 0.933483709273183 0.9341209756267034
evaluating scores 220 956
auc, auc-pr 0.9404102224795303 0.9420597487211005
evaluating scores 240 956
auc, auc-pr 0.9362782608695652 0.9390333214877702
evaluating scores 260 956
auc, auc-pr 0.939435202178675 0.9434625214878005
evaluating scores 280 956
auc, auc-pr 0.943

In [34]:
######################################################
#obtain the Hits@N for entity prediction##############
#
#Every positive triple of the inductive test set is ranked against 50 negative
#triples that are built by replacing its head or its tail entity. Hits@1,
#Hits@3, Hits@10 and MRR are accumulated over the positive triples and the
#running values are printed after each one.

#entities that may be used as replacement; built once instead of once per draw
_neg_candidates = list(new_ent_set)

#a corrupted triple is rejected when it occurs in any of these collections
_known_triple_sets = (data_test, data_valid, data,
                      data_ind, data_ind_valid, data_ind_test)

#number of negative triples every positive triple is ranked against
num_negatives = 50


def _corrupt_triple(s_pos, r_pos, t_pos):
    """Return a negative triple obtained by replacing the head or the tail.

    With probability 0.5 the head is replaced, otherwise the tail. Replacement
    entities are drawn from `_neg_candidates` until the corrupted triple is
    found in none of `_known_triple_sets`.
    """
    replace_head = random.uniform(0, 1) < 0.5

    while True:
        entity = random.choice(_neg_candidates)

        if replace_head:
            candidate = (entity, r_pos, t_pos)
        else:
            candidate = (s_pos, r_pos, entity)

        #filter out the existing triples
        if not any(candidate in triples for triples in _known_triple_sets):
            return candidate


def _combined_score(s, r, t):
    """Return the mean of the path-based and the subgraph-based triple score."""
    path_score = path_based_triple_scoring(s, r, t, lower_bound, upper_bound_path, one_hop_ind, id2relation, model)

    subg_score = subgraph_triple_scoring(s, r, t, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    return (path_score + subg_score)/float(2)


#we select all the triples in the inductive test set
selected = list(data_ind_test)

#running counters of the ranking metrics
Hits_at_1 = 0
Hits_at_3 = 0
Hits_at_10 = 0
MRR_raw = 0.

for i, pos_triple in enumerate(selected):

    s_pos, r_pos, t_pos = pos_triple[0], pos_triple[1], pos_triple[2]

    true_triple = (s_pos, r_pos, t_pos)

    #score the true triple
    triple_list = [[true_triple, _combined_score(s_pos, r_pos, t_pos)]]

    #generate and score the random negative samples; each negative is scored
    #right after it is drawn, as before
    for sub_i in range(num_negatives):

        neg_triple = _corrupt_triple(s_pos, r_pos, t_pos)

        triple_list.append([neg_triple,
                            _combined_score(neg_triple[0], neg_triple[1], neg_triple[2])])

    #random shuffle, so that triples with equal scores end up in random order
    #after the (stable) sort
    random.shuffle(triple_list)

    #sort by score, best first
    sorted_list = sorted(triple_list, key = lambda x: x[-1], reverse=True)

    #p is the 0-based rank of the true triple
    p = 0

    while p < len(sorted_list) and sorted_list[p][0] != true_triple:

        p += 1

    if p == 0:

        Hits_at_1 += 1

    if p < 3:

        Hits_at_3 += 1

    if p < 10:

        Hits_at_10 += 1

    MRR_raw += 1./float(p + 1.)

    print('checkcorrect', (s_pos, r_pos, t_pos), sorted_list[p][0],
          'real score', sorted_list[p][-1],
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'rank', p,
          'total_num', i, len(selected))

checkcorrect (3130, 222, 3303) (3130, 222, 3303) real score 0.9677767187356949 Hits@1 1.0 Hits@3 1.0 Hits@10 1.0 MRR 1.0 rank 0 total_num 0 478
checkcorrect (3074, 68, 2956) (3074, 68, 2956) real score 0.0 Hits@1 0.5 Hits@3 0.5 Hits@10 0.5 MRR 0.5121951219512195 rank 40 total_num 1 478
checkcorrect (2856, 22, 3242) (2856, 22, 3242) real score 0.2143394649028778 Hits@1 0.3333333333333333 Hits@3 0.3333333333333333 Hits@10 0.3333333333333333 MRR 0.367104440275172 rank 12 total_num 2 478
checkcorrect (3056, 64, 3177) (3056, 64, 3177) real score 0.9504194200038909 Hits@1 0.25 Hits@3 0.5 Hits@10 0.5 MRR 0.3586616635397123 rank 2 total_num 3 478
checkcorrect (3641, 300, 2632) (3641, 300, 2632) real score 0.8722596347332001 Hits@1 0.4 Hits@3 0.6 Hits@10 0.6 MRR 0.4869293308317698 rank 0 total_num 4 478
checkcorrect (3000, 100, 3920) (3000, 100, 3920) real score 0.8299575954675674 Hits@1 0.3333333333333333 Hits@3 0.5 Hits@10 0.6666666666666666 MRR 0.4474411090264749 rank 3 total_num 5 478
check

checkcorrect (2744, 190, 3041) (2744, 190, 3041) real score 0.901010912656784 Hits@1 0.3333333333333333 Hits@3 0.5777777777777777 Hits@10 0.8444444444444444 MRR 0.5052123835428132 rank 0 total_num 44 478
checkcorrect (3859, 42, 2846) (3859, 42, 2846) real score 0.9430209010839463 Hits@1 0.32608695652173914 Hits@3 0.5869565217391305 Hits@10 0.8478260869565217 MRR 0.5050990708570999 rank 1 total_num 45 478
checkcorrect (2789, 190, 2609) (2789, 190, 2609) real score 0.960594755411148 Hits@1 0.3404255319148936 Hits@3 0.5957446808510638 Hits@10 0.851063829787234 MRR 0.5156288778601403 rank 0 total_num 46 478
checkcorrect (2698, 44, 2736) (2698, 44, 2736) real score 0.9151221334934234 Hits@1 0.3333333333333333 Hits@3 0.5833333333333334 Hits@10 0.8541666666666666 MRR 0.5083588317936096 rank 5 total_num 47 478
checkcorrect (3821, 12, 2677) (3821, 12, 2677) real score 0.9744822174310683 Hits@1 0.3469387755102041 Hits@3 0.5918367346938775 Hits@10 0.8571428571428571 MRR 0.5183923250223115 rank 0 

checkcorrect (3360, 192, 3359) (3360, 192, 3359) real score 0.38024601638317107 Hits@1 0.367816091954023 Hits@3 0.5862068965517241 Hits@10 0.8620689655172413 MRR 0.5297261710532699 rank 16 total_num 86 478
checkcorrect (3493, 216, 3622) (3493, 216, 3622) real score 0.9410114407539367 Hits@1 0.375 Hits@3 0.5909090909090909 Hits@10 0.8636363636363636 MRR 0.5350701918367554 rank 0 total_num 87 478
checkcorrect (3397, 300, 2632) (3397, 300, 2632) real score 0.8802166670560837 Hits@1 0.3707865168539326 Hits@3 0.5955056179775281 Hits@10 0.8651685393258427 MRR 0.534676144737466 rank 1 total_num 88 478
checkcorrect (2747, 48, 3288) (2747, 48, 3288) real score 0.9476373136043549 Hits@1 0.36666666666666664 Hits@3 0.5888888888888889 Hits@10 0.8666666666666667 MRR 0.5315130764626054 rank 3 total_num 89 478
checkcorrect (3284, 20, 3029) (3284, 20, 3029) real score 0.9579764306545258 Hits@1 0.37362637362637363 Hits@3 0.5934065934065934 Hits@10 0.8681318681318682 MRR 0.5366612844135658 rank 0 total_n

checkcorrect (2839, 12, 2838) (2839, 12, 2838) real score 0.9099085107445717 Hits@1 0.40625 Hits@3 0.609375 Hits@10 0.890625 MRR 0.5614991497449122 rank 0 total_num 127 478
checkcorrect (2840, 8, 2949) (2840, 8, 2949) real score 0.9783595919609069 Hits@1 0.40310077519379844 Hits@3 0.6124031007751938 Hits@10 0.8914728682170543 MRR 0.5610224121499904 rank 1 total_num 128 478
checkcorrect (2808, 100, 2626) (2808, 100, 2626) real score 0.9545291662216187 Hits@1 0.4 Hits@3 0.6153846153846154 Hits@10 0.8923076923076924 MRR 0.5605530089796059 rank 1 total_num 129 478
checkcorrect (2771, 164, 3080) (2771, 164, 3080) real score 0.9771295547485351 Hits@1 0.40458015267175573 Hits@3 0.6183206106870229 Hits@10 0.8931297709923665 MRR 0.5639075661629677 rank 0 total_num 130 478
checkcorrect (3071, 12, 2991) (3071, 12, 2991) real score 0.9710013836622238 Hits@1 0.4090909090909091 Hits@3 0.6212121212121212 Hits@10 0.8939393939393939 MRR 0.5672112967223392 rank 0 total_num 131 478
checkcorrect (4114, 84

checkcorrect (3249, 2, 3088) (3249, 2, 3088) real score 0.9528884530067443 Hits@1 0.378698224852071 Hits@3 0.6153846153846154 Hits@10 0.8994082840236687 MRR 0.5475430339066882 rank 1 total_num 168 478
checkcorrect (3730, 312, 2760) (3730, 312, 2760) real score 0.8430583566427231 Hits@1 0.3764705882352941 Hits@3 0.6176470588235294 Hits@10 0.9 MRR 0.5472633690013547 rank 1 total_num 169 478
checkcorrect (2663, 42, 3283) (2663, 42, 3283) real score 0.7281914830207825 Hits@1 0.3742690058479532 Hits@3 0.6140350877192983 Hits@10 0.8947368421052632 MRR 0.5445503278570973 rank 11 total_num 170 478
checkcorrect (2732, 12, 2747) (2732, 12, 2747) real score 0.9368451088666916 Hits@1 0.37790697674418605 Hits@3 0.6162790697674418 Hits@10 0.8953488372093024 MRR 0.5471982910672305 rank 0 total_num 171 478
checkcorrect (3696, 22, 3766) (3696, 22, 3766) real score 0.28414194732904435 Hits@1 0.37572254335260113 Hits@3 0.6127167630057804 Hits@10 0.8901734104046243 MRR 0.5444481770808799 rank 13 total_num

checkcorrect (2844, 6, 3062) (2844, 6, 3062) real score 0.9753952085971832 Hits@1 0.37142857142857144 Hits@3 0.6333333333333333 Hits@10 0.8952380952380953 MRR 0.5452811587745657 rank 1 total_num 209 478
checkcorrect (3994, 50, 3206) (3994, 50, 3206) real score 0.9626874655485154 Hits@1 0.3744075829383886 Hits@3 0.6350710900473934 Hits@10 0.8957345971563981 MRR 0.5474362243727905 rank 0 total_num 210 478
checkcorrect (2664, 106, 2663) (2664, 106, 2663) real score 0.848853000998497 Hits@1 0.37735849056603776 Hits@3 0.6367924528301887 Hits@10 0.8962264150943396 MRR 0.5495709591634849 rank 0 total_num 211 478
checkcorrect (4260, 172, 2644) (4260, 172, 2644) real score 0.7916168585419654 Hits@1 0.3755868544600939 Hits@3 0.6338028169014085 Hits@10 0.8967136150234741 MRR 0.5481645227354873 rank 3 total_num 212 478
checkcorrect (3622, 76, 2673) (3622, 76, 2673) real score 0.9845646232366563 Hits@1 0.37850467289719625 Hits@3 0.6355140186915887 Hits@10 0.897196261682243 MRR 0.5502759034703683 ra

checkcorrect (3941, 8, 3136) (3941, 8, 3136) real score 0.9555497318506241 Hits@1 0.3904382470119522 Hits@3 0.649402390438247 Hits@10 0.896414342629482 MRR 0.5604487018820359 rank 1 total_num 250 478
checkcorrect (3423, 12, 2730) (3423, 12, 2730) real score 0.8437927395105362 Hits@1 0.3888888888888889 Hits@3 0.6468253968253969 Hits@10 0.8968253968253969 MRR 0.5590183498904405 rank 4 total_num 251 478
checkcorrect (3603, 64, 2894) (3603, 64, 2894) real score 0.6929042130708695 Hits@1 0.38735177865612647 Hits@3 0.6442687747035574 Hits@10 0.8972332015810277 MRR 0.5572040481122174 rank 9 total_num 252 478
checkcorrect (4039, 108, 3095) (4039, 108, 3095) real score 0.717153100669384 Hits@1 0.3858267716535433 Hits@3 0.6417322834645669 Hits@10 0.8937007874015748 MRR 0.5553682411940948 rank 10 total_num 253 478
checkcorrect (2807, 2, 3653) (2807, 2, 3653) real score 0.8639773398637771 Hits@1 0.3843137254901961 Hits@3 0.6431372549019608 Hits@10 0.8941176470588236 MRR 0.555151110836471 rank 1 to

checkcorrect (3546, 384, 3290) (3546, 384, 3290) real score 0.2884449135512113 Hits@1 0.386986301369863 Hits@3 0.6472602739726028 Hits@10 0.9041095890410958 MRR 0.5584041419649871 rank 0 total_num 291 478
checkcorrect (3791, 70, 3349) (3791, 70, 3349) real score 0.941614705324173 Hits@1 0.3856655290102389 Hits@3 0.6484641638225256 Hits@10 0.9044368600682594 MRR 0.5576359822085651 rank 2 total_num 292 478
checkcorrect (3160, 20, 4088) (3160, 20, 4088) real score 0.8955231308937073 Hits@1 0.3843537414965986 Hits@3 0.6462585034013606 Hits@10 0.9047619047619048 MRR 0.5564195332894883 rank 4 total_num 293 478
checkcorrect (2789, 190, 3123) (2789, 190, 3123) real score 0.9531085401773453 Hits@1 0.3864406779661017 Hits@3 0.6474576271186441 Hits@10 0.9050847457627119 MRR 0.557923195888507 rank 0 total_num 294 478
checkcorrect (3258, 6, 2961) (3258, 6, 2961) real score 0.0 Hits@1 0.38513513513513514 Hits@3 0.6452702702702703 Hits@10 0.9054054054054054 MRR 0.5566013832897846 rank 5 total_num 295

checkcorrect (3108, 2, 3228) (3108, 2, 3228) real score 0.9592428654432297 Hits@1 0.3783783783783784 Hits@3 0.6606606606606606 Hits@10 0.9069069069069069 MRR 0.5568174360229156 rank 0 total_num 332 478
checkcorrect (3089, 2, 3452) (3089, 2, 3452) real score 0.9584137827157975 Hits@1 0.38023952095808383 Hits@3 0.6616766467065869 Hits@10 0.907185628742515 MRR 0.5581443299270387 rank 0 total_num 333 478
checkcorrect (3165, 22, 2703) (3165, 22, 2703) real score 0.8753755837678909 Hits@1 0.37910447761194027 Hits@3 0.6597014925373135 Hits@10 0.9074626865671642 MRR 0.5568513617780027 rank 7 total_num 334 478
checkcorrect (3492, 206, 3262) (3492, 206, 3262) real score 0.36809881031513214 Hits@1 0.37797619047619047 Hits@3 0.6607142857142857 Hits@10 0.9077380952380952 MRR 0.5566821612965205 rank 1 total_num 335 478
checkcorrect (3292, 66, 2755) (3292, 66, 2755) real score 0.9730338364839554 Hits@1 0.3798219584569733 Hits@3 0.6617210682492581 Hits@10 0.9080118694362018 MRR 0.5579976444974211 rank

checkcorrect (3966, 392, 2615) (3966, 392, 2615) real score 0.3398019939661026 Hits@1 0.39572192513368987 Hits@3 0.6737967914438503 Hits@10 0.9144385026737968 MRR 0.5703720078257051 rank 7 total_num 373 478
checkcorrect (2895, 22, 2703) (2895, 22, 2703) real score 0.9246615678071977 Hits@1 0.39466666666666667 Hits@3 0.6746666666666666 Hits@10 0.9146666666666666 MRR 0.5701843491381698 rank 1 total_num 374 478
checkcorrect (2646, 82, 2852) (2646, 82, 2852) real score 0.9907291024923325 Hits@1 0.3962765957446808 Hits@3 0.675531914893617 Hits@10 0.9148936170212766 MRR 0.5713274758691854 rank 0 total_num 375 478
checkcorrect (2942, 82, 2776) (2942, 82, 2776) real score 0.9903643637895584 Hits@1 0.3978779840848806 Hits@3 0.6763925729442971 Hits@10 0.9151193633952255 MRR 0.5724645382674104 rank 0 total_num 376 478
checkcorrect (2765, 42, 2764) (2765, 42, 2764) real score 0.9043088883161545 Hits@1 0.3968253968253968 Hits@3 0.6746031746031746 Hits@10 0.9153439153439153 MRR 0.5716114574783431 ra

checkcorrect (3350, 248, 3384) (3350, 248, 3384) real score 0.9167520999908447 Hits@1 0.3927710843373494 Hits@3 0.6843373493975904 Hits@10 0.9204819277108434 MRR 0.5719001816174895 rank 0 total_num 414 478
checkcorrect (4203, 100, 2850) (4203, 100, 2850) real score 0.9082189857959748 Hits@1 0.3918269230769231 Hits@3 0.6826923076923077 Hits@10 0.9206730769230769 MRR 0.5711263831039859 rank 3 total_num 415 478
checkcorrect (2638, 0, 3369) (2638, 0, 3369) real score 0.7349819734692573 Hits@1 0.3932853717026379 Hits@3 0.6834532374100719 Hits@10 0.920863309352518 MRR 0.5721548570054152 rank 0 total_num 416 478
checkcorrect (3930, 142, 2618) (3930, 142, 2618) real score 0.8645038396120072 Hits@1 0.39473684210526316 Hits@3 0.6842105263157895 Hits@10 0.9210526315789473 MRR 0.5731784099790864 rank 0 total_num 417 478
checkcorrect (3059, 12, 3434) (3059, 12, 3434) real score 0.9500266075134278 Hits@1 0.39618138424821003 Hits@3 0.684964200477327 Hits@10 0.9212410501193318 MRR 0.5741970772583727 r

checkcorrect (4078, 190, 3650) (4078, 190, 3650) real score 0.46453678905963897 Hits@1 0.40131578947368424 Hits@3 0.6842105263157895 Hits@10 0.918859649122807 MRR 0.576123004960887 rank 6 total_num 455 478
checkcorrect (3984, 322, 2635) (3984, 322, 2635) real score 0.3729188978672028 Hits@1 0.40043763676148797 Hits@3 0.6827133479212254 Hits@10 0.9190371991247265 MRR 0.575135864906268 rank 7 total_num 456 478
checkcorrect (2752, 66, 2751) (2752, 66, 2751) real score 0.9553016602993012 Hits@1 0.4017467248908297 Hits@3 0.6834061135371179 Hits@10 0.9192139737991266 MRR 0.5760635158562543 rank 0 total_num 457 478
checkcorrect (3088, 66, 3685) (3088, 66, 3685) real score 0.967854455113411 Hits@1 0.40305010893246185 Hits@3 0.6840958605664488 Hits@10 0.9193899782135077 MRR 0.5769871247541709 rank 0 total_num 458 478
checkcorrect (2624, 194, 3222) (2624, 194, 3222) real score 0.8773757666349411 Hits@1 0.4043478260869565 Hits@3 0.6847826086956522 Hits@10 0.9195652173913044 MRR 0.5779067179612271

#### Fine-tuned

In [34]:
#function to build the big batch for path-based training
def build_big_batches_path(lower_bd, upper_bd, data, one_hop, s_t_r,
                      x_p_list, x_r_list, y_list,
                      relation2id, entity2id, id2relation, id2entity):
    """Append path-based training samples to the given lists, in place.

    For every entity s in one_hop, `Class_2.obtain_paths` is called in
    'direct_neighbour' mode. For each returned target t with at least three
    paths, at least one even-id relation in s_t_r[(s, t)] and at least one
    even-id relation that is NOT in s_t_r[(s, t)], ten rounds of sampling are
    done. Each round draws three paths and appends one positive sample (label
    1., relation drawn from the existing ones) and one negative sample (label
    0., same paths, relation drawn from the missing ones).

    Parameters:
        lower_bd, upper_bd: forwarded to `Class_2.obtain_paths`; upper_bd is
            also the length used when padding paths.
        one_hop: iterated for the start entities and forwarded to
            `Class_2.obtain_paths`.
        s_t_r: indexed with (s, t); each value is iterated for relation ids.
        x_p_list: mapping with keys '1', '2', '3'; padded paths are appended.
        x_r_list: list; one-element lists [relation_id] are appended.
        y_list: list; the labels 1. / 0. are appended.
        id2relation: must contain every key 0 .. len(id2relation)-1.
        id2entity: only used to build the error raised for an (s, t) pair
            without any relation.
        data, relation2id, entity2id: not used by this function.

    Returns:
        None. The results are the elements appended to x_p_list, x_r_list and
        y_list.

    Raises:
        ValueError: if id2relation has a gap in its ids, if the number of even
            ids is not int(len(id2relation)/2), or if s_t_r[(s, t)] is empty
            for a target t returned by the path search. The last check is done
            for all targets of s before any sample for s is appended.
    """
    #the set of all initial relations
    ini_r_id_set = set()

    for i in range(len(id2relation)):

        if i not in id2relation:
            raise ValueError('error when generaing id2relation')

        if i % 2 == 0: #initial relation id is always an even number
            ini_r_id_set.add(i)

    num_r = len(id2relation)
    num_ini_r = len(ini_r_id_set)

    if num_ini_r != int(num_r/2):
        raise ValueError('error when generating id2relation')

    def pad_path(path):
        #add the space holder id (num_r) at the end of the shorter path
        return list(path) + [num_r]*abs(len(path)-upper_bd)

    #in case not all entities in entity2id are in one_hop,
    #so we need to find out who are indeed in
    existing_ids = set()

    for s_1 in one_hop:
        existing_ids.add(s_1)

    existing_ids = list(existing_ids)
    random.shuffle(existing_ids)

    count = 0
    for s in existing_ids:

        #implement the path finding algorithm to find paths between s and t
        result, length_dict = Class_2.obtain_paths('direct_neighbour', s, 'nb', lower_bd, upper_bd, one_hop)

        #Everything that depends only on (s, t) is computed once here instead
        #of once per sampling round.
        targets = list()

        for t in result:

            relations = s_t_r[(s,t)]

            if len(relations) == 0:

                raise ValueError(s,t,id2entity[s], id2entity[t])

            #we are only interested in forward link in relation prediction:
            #initial relation id is always an even number
            ini_r_list = [r for r in relations if r % 2 == 0]

            #if there exist at least three paths between s and t,
            #and initial connection between s and t exists,
            #and not every r in the relation dictionary exists between s and t (although this is rare)
            #we then proceed
            if len(result[t]) >= 3 and len(ini_r_list) > 0 and len(ini_r_list) < int(num_ini_r):

                #the initial relations that do not hold between s and t
                neg_r_list = list(ini_r_id_set.difference(set(ini_r_list)))

                #the list form of all the paths from s to t
                targets.append((list(result[t]), ini_r_list, neg_r_list))

        for iteration in range(10):

            for temp_path_list, ini_r_list, neg_r_list in targets:

                path_1, path_2, path_3 = random.sample(temp_path_list, 3)

                #####positive#####################
                x_p_list['1'].append(pad_path(path_1))
                x_p_list['2'].append(pad_path(path_2))
                x_p_list['3'].append(pad_path(path_3))

                #append relation
                x_r_list.append([random.choice(ini_r_list)])
                y_list.append(1.)

                #####negative#####################
                #same paths (fresh list objects), but a relation that does
                #not hold between s and t
                x_p_list['1'].append(pad_path(path_1))
                x_p_list['2'].append(pad_path(path_2))
                x_p_list['3'].append(pad_path(path_3))

                #append relation
                x_r_list.append([random.choice(neg_r_list)])
                y_list.append(0.)

        count += 1
        if count % 100 == 0:
            print('generating big-batches for path-based model', count, len(existing_ids))

In [35]:
#Running the path-finding algorithm again and again is too slow.
#Instead, the paths starting at each entity are searched once and stored,
#so that the subgraph-based training can reuse them many times.
def store_subgraph_dicts(lower_bd, upper_bd, data, one_hop, s_t_r,
                         relation2id, entity2id, id2relation, id2entity):
    """Return a dict that maps every entity in one_hop to a sample of its paths.

    For each entity s, `Class_2.obtain_paths` is called in 'any_target' mode,
    the paths of all returned targets are merged into one set, and at most 100
    of them are drawn at random. A deep copy of that sample is stored under s.

    Parameters:
        lower_bd, upper_bd: forwarded to `Class_2.obtain_paths`.
        one_hop: iterated for the start entities and forwarded to
            `Class_2.obtain_paths`.
        id2relation: must contain every key 0 .. len(id2relation)-1.
        data, s_t_r, relation2id, entity2id, id2entity: not used here.

    Raises:
        ValueError: if id2relation has a gap in its ids.
    """
    #the relation ids must be 0 .. len(id2relation)-1 without a gap
    if any(r_id not in id2relation for r_id in range(len(id2relation))):
        raise ValueError('error when generaing id2relation')

    #not all entities in entity2id need to be in one_hop,
    #so the start entities are taken from one_hop itself
    start_entities = set()
    for entity in one_hop:
        start_entities.add(entity)

    #the ids to start path finding, in random order
    start_entities = list(start_entities)
    random.shuffle(start_entities)

    #stores the sampled paths of each entity
    paths_per_entity = dict()

    for done, s in enumerate(start_entities, start=1):

        result, length_dict = Class_2.obtain_paths('any_target', s, 'any', lower_bd, upper_bd, one_hop)

        #merge the paths of all targets, dropping duplicates
        unique_paths = set()
        for t_ in result:
            for path in result[t_]:
                unique_paths.add(path)

        #the search result is not needed any more
        del result, length_dict

        candidates = list(unique_paths)
        sample_size = min(len(candidates), 100)

        paths_per_entity[s] = deepcopy(random.sample(candidates, sample_size))

        if done % 100 == 0:
            print('generating and storing paths for the path-based model', done, len(start_entities))

    return paths_per_entity

In [36]:
#functions to build the big-batch for the subgraph-based (one-hop neighbourhood) training
def _pad_path(path, pad_id, length):
    """Return ``path`` as a list with ``pad_id`` appended up to ``length`` entries."""
    #abs() is kept from the original code: for a path that is not longer than
    #`length` it equals length - len(path)
    return list(path) + [pad_id] * abs(len(path) - length)


def _append_subgraph_sample(x_s_list, x_t_list, x_r_list, y_list,
                            s_paths, t_paths, relation, label, pad_id, upper_bd):
    """Append one sample (padded source paths, padded target paths, relation, label)."""
    #the i-th path goes into the input list stored under the key str(i), i = 1, 2, ...
    for slot, (s_path, t_path) in enumerate(zip(s_paths, t_paths), start=1):
        key = str(slot)
        x_s_list[key].append(_pad_path(s_path, pad_id, upper_bd))
        x_t_list[key].append(_pad_path(t_path, pad_id, upper_bd))

    x_r_list.append([relation])
    y_list.append(label)


def build_big_batches_subgraph(lower_bd, upper_bd, data, one_hop, s_t_r,
                      x_s_list, x_t_list, x_r_list, y_list, Dict,
                      relation2id, entity2id, id2relation, id2entity):
    """Fill the training lists of the subgraph-based model in place.

    The triples of ``data`` are visited in 10 shuffled passes. Every triple
    (s, r, t) whose two entities both have at least 6 stored paths in ``Dict``
    contributes three positive/negative pairs:
      1. true paths with the true relation / same paths with a random other
         initial relation,
      2. true paths / paths of a random source entity,
      3. true paths / paths of a random target entity.
    The positive paths are re-sampled for every pair.

    Args:
        upper_bd: every path is padded with the id ``len(id2relation)`` to
            this length.
        data: iterable of triples, indexed as (s, r, t).
        x_s_list, x_t_list: dicts with the keys '1'..'6', each holding a list;
            one padded path is appended per key and sample.
        x_r_list, y_list: lists receiving ``[relation]`` and the label
            (1. or 0.) of every sample.
        Dict: maps an entity to its stored paths.
        id2relation: must contain every key in ``range(len(id2relation))``;
            even ids are treated as the initial (non-inverse) relations.
        lower_bd, one_hop, s_t_r, relation2id, entity2id, id2entity: not used
            by this function; kept so existing callers do not have to change.

    Returns:
        None; the four output containers are mutated.

    Raises:
        ValueError: if the relation ids are not contiguous from 0, or if the
            number of even ids differs from ``int(len(id2relation) / 2)``.
    """
    #number of paths drawn per entity and sample
    num_paths = 6

    #the set of all initial relations
    ini_r_id_set = set()

    for i in range(len(id2relation)):

        if i not in id2relation:
            raise ValueError('error when generaing id2relation')

        if i % 2 == 0: #initial relation id is always an even number
            ini_r_id_set.add(i)

    #also used as the space-holder id when padding paths
    num_r = len(id2relation)

    if len(ini_r_id_set) != int(num_r/2):
        raise ValueError('error when generating id2relation')

    #an entity is qualified if it has at least `num_paths` stored paths
    qualified_set = set()
    for e in Dict:
        if len(Dict[e]) >= num_paths:
            qualified_set.add(e)

    #the list feeds random.choice; the set gives O(1) membership tests per triple
    qualified = list(qualified_set)

    def add_sample(s_paths, t_paths, relation, label):
        _append_subgraph_sample(x_s_list, x_t_list, x_r_list, y_list,
                                s_paths, t_paths, relation, label, num_r, upper_bd)

    data = list(data)

    for iteration in range(10):

        data = shuffle(data)

        for i_0, triple in enumerate(data):

            s, r, t = triple[0], triple[1], triple[2] #obtain entities and relation IDs

            if s in qualified_set and t in qualified_set:

                #the stored paths of the true entities
                path_s, path_t = list(Dict[s]), list(Dict[t])

                #####pair 1: true triple vs. wrong relation###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                add_sample(s_paths, t_paths, r, 1.)

                #same paths, but a random different initial relation
                #NOTE(review): random.choice raises IndexError when r is the only initial relation
                neg_r_list = list(ini_r_id_set.difference({r}))
                r_ran = random.choice(neg_r_list)
                add_sample(s_paths, t_paths, r_ran, 0.)

                #randomly choose the two entities used for the negative samples
                #NOTE(review): they are drawn from all qualified entities, so they may
                #coincide with s / t -- confirm this is acceptable
                s_ran = random.choice(qualified)
                t_ran = random.choice(qualified)
                path_s_ran, path_t_ran = list(Dict[s_ran]), list(Dict[t_ran])

                #####pair 2: true triple vs. wrong source entity###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                add_sample(s_paths, t_paths, r, 1.)

                s_paths_ran = random.sample(path_s_ran, num_paths)
                add_sample(s_paths_ran, t_paths, r, 0.)

                #####pair 3: true triple vs. wrong target entity###########
                s_paths = random.sample(path_s, num_paths)
                t_paths = random.sample(path_t, num_paths)
                add_sample(s_paths, t_paths, r, 1.)

                t_paths_ran = random.sample(path_t_ran, num_paths)
                add_sample(s_paths, t_paths_ran, r, 0.)

            if i_0 % 2000 == 0:
                print('generating big-batches for subgraph-based model', i_0, len(data), iteration)

In [37]:
###fine tune the path-based model
#path search bounds and mini-batch size for this fine-tuning run
lower_bd, upper_bd = lower_bound, upper_bound_path
batch_size = 32

#containers for the training samples: three path inputs, the relation input and the labels
train_p_list = {key: [] for key in ('1', '2', '3')}
train_r_list = []
train_y_list = []

#######################################
###build the big-batches###############

#the containers are filled in place; the return value of the call is not used
build_big_batches_path(lower_bd, upper_bd, data_ind, one_hop_ind, s_t_r_ind,
                       train_p_list, train_r_list, train_y_list,
                       relation2id, entity2id, id2relation, id2entity)

#######################################
###do the training#####################

#turn the sample lists into integer input arrays
x_train_1, x_train_2, x_train_3 = (
    np.asarray(train_p_list[key], dtype='int') for key in ('1', '2', '3'))
x_train_r = np.asarray(train_r_list, dtype='int')
y_train = np.asarray(train_y_list, dtype='int')

model.fit([x_train_1, x_train_2, x_train_3, x_train_r],
          y_train,
          batch_size=batch_size,
          epochs=5)

generating big-batches for path-based model 100 225
generating big-batches for path-based model 200 225
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9acdf6db20>

In [39]:
###fine tune the subgraph model
#path search bounds and mini-batch size for this fine-tuning run
lower_bd, upper_bd = lower_bound, upper_bound_subg
batch_size = 32

#find and store the paths of every entity once, so they can be reused below
Dict_train_ind = store_subgraph_dicts(lower_bd, upper_bd, data_ind, one_hop_ind, s_t_r_ind,
                                      relation2id, entity2id, id2relation, id2entity)

#containers for the training samples: six path inputs per side, the relation input and the labels
train_s_list = {key: [] for key in '123456'}
train_t_list = {key: [] for key in '123456'}
train_r_list = []
train_y_list = []

#######################################
###build the big-batches###############

#the containers are filled in place; the return value of the call is not used
build_big_batches_subgraph(lower_bd, upper_bd, data_ind, one_hop_ind, s_t_r_ind,
                           train_s_list, train_t_list, train_r_list, train_y_list, Dict_train_ind,
                           relation2id, entity2id, id2relation, id2entity)

#######################################
###do the training#####################

#turn the sample lists into integer input arrays
(x_train_s_1, x_train_s_2, x_train_s_3,
 x_train_s_4, x_train_s_5, x_train_s_6) = (
    np.asarray(train_s_list[key], dtype='int') for key in '123456')

(x_train_t_1, x_train_t_2, x_train_t_3,
 x_train_t_4, x_train_t_5, x_train_t_6) = (
    np.asarray(train_t_list[key], dtype='int') for key in '123456')

x_train_r = np.asarray(train_r_list, dtype='int')
y_train = np.asarray(train_y_list, dtype='int')

#input order: the six source-side arrays, the six target-side arrays, then the relation
model_2.fit([x_train_s_1, x_train_s_2, x_train_s_3, x_train_s_4, x_train_s_5, x_train_s_6,
             x_train_t_1, x_train_t_2, x_train_t_3, x_train_t_4, x_train_t_5, x_train_t_6,
             x_train_r],
            y_train,
            batch_size=batch_size,
            epochs=5)

generating and storing paths for the path-based model 100 225
generating and storing paths for the path-based model 200 225
generating big-batches for subgraph-based model 0 833 0
generating big-batches for subgraph-based model 0 833 1
generating big-batches for subgraph-based model 0 833 2
generating big-batches for subgraph-based model 0 833 3
generating big-batches for subgraph-based model 0 833 4
generating big-batches for subgraph-based model 0 833 5
generating big-batches for subgraph-based model 0 833 6
generating big-batches for subgraph-based model 0 833 7
generating big-batches for subgraph-based model 0 833 8
generating big-batches for subgraph-based model 0 833 9
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9a299a7550>

In [None]:
########################################################
#obtain the Hits@N for relation prediction##############

#every triple of the inductive test set is evaluated
selected = list(data_ind_test)

#running totals over the triples evaluated so far
Hits_at_1, Hits_at_3, Hits_at_10 = 0, 0, 0
MRR_raw = 0.

for i, triple in enumerate(selected):

    s_true, r_true, t_true = triple[0], triple[1], triple[2]

    #relation scores from the path-based model
    score_dict_path = path_based_relation_scoring(s_true, t_true, lower_bound, upper_bound_path, one_hop_ind, id2relation, model)

    #relation scores from the subgraph-based model
    score_dict_subg = subgraph_relation_scoring(s_true, t_true, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    #final score of a relation: sum of the scores given by the two models
    score_dict = defaultdict(float)
    for partial_scores in (score_dict_path, score_dict_subg):
        for r in partial_scores:
            score_dict[r] += partial_scores[r]

    #[... [score, r], ...] for the initial (even id) relations only;
    #a relation that received no score counts as 0.0
    temp_list = [[score_dict.get(r, 0.0), r] for r in id2relation if r % 2 == 0]

    #best score first
    sorted_list = sorted(temp_list, key=lambda pair: pair[0], reverse=True)

    #p: number of candidates ranked ahead of the true relation
    #exist_tri: how many of them form a triple that exists in one of the data sets
    p = 0
    exist_tri = 0

    for _, r_cand in sorted_list:

        if r_cand == r_true:
            break

        candidate = (s_true, r_cand, t_true)

        if any(candidate in triples for triples in (data_test, data_valid, data,
                                                    data_ind, data_ind_valid, data_ind_test)):
            exist_tri += 1

        p += 1

    #filtered rank (0 = best): existing triples ranked ahead are not counted
    cur_rank = p - exist_tri

    Hits_at_1 += int(cur_rank == 0)
    Hits_at_3 += int(cur_rank < 3)
    Hits_at_10 += int(cur_rank < 10)

    MRR_raw += 1. / (cur_rank + 1.)

    print('checkcorrect', r_true, sorted_list[p][1],
          'real score', sorted_list[p][0],
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'cur_rank', cur_rank,
          'abs_cur_rank', p,
          'total_num', i, len(selected))

In [None]:
###########################################
##obtain the AUC-PR for the test triples###
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc
try:
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    #the helper was removed in scikit-learn 1.2; it is not used in this cell,
    #so the evaluation can run without it
    plot_precision_recall_curve = None
import matplotlib.pyplot as plt

#all the triples in the inductive test set are the positive samples
pos_triples = list(data_ind_test)

#a corrupted triple is rejected if it appears in any of these data sets
known_triple_sets = (data_test, data_valid, data,
                     data_ind, data_ind_valid, data_ind_test)

#candidate entities for the corruption; built once instead of on every draw
candidate_entities = list(new_ent_set)

#one negative sample per positive triple, obtained by randomly replacing
#either the head or the tail entity
neg_triples = list()

for pos_triple in pos_triples:

    s_pos, r_pos, t_pos = pos_triple[0], pos_triple[1], pos_triple[2]

    #decide to replace the head or tail entity
    replace_head = random.uniform(0, 1) < 0.5

    #redraw until the corrupted triple is not an existing triple
    while True:

        e_neg = random.choice(candidate_entities)

        if replace_head:
            neg_triple = (e_neg, r_pos, t_pos)
        else:
            neg_triple = (s_pos, r_pos, e_neg)

        if not any(neg_triple in triples for triples in known_triple_sets):
            break

    neg_triples.append(neg_triple)

if len(pos_triples) != len(neg_triples):
    raise ValueError('error when generating negative triples')

#combine all triples
all_triples = pos_triples + neg_triples

#label array: 1 for the positive triples, 0 for the negative ones
y_test = np.concatenate((np.ones((len(pos_triples),)),
                         np.zeros((len(neg_triples),))))

#shuffle positive and negative triples (optional)
all_triples, y_test = shuffle(all_triples, y_test)

#score array, filled in the loop below
y_score = np.zeros((len(y_test),))

#implement the scoring
#NOTE: only the subgraph-based score is used; averaging it with
#path_based_triple_scoring was disabled in the original cell
for i, triple in enumerate(all_triples):

    s, r, t = triple[0], triple[1], triple[2]

    subg_score = subgraph_triple_scoring(s, r, t, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    y_score[i] = subg_score

    if i % 20 == 0 and i > 0:
        print('evaluating scores', i, len(all_triples))

        #intermediate AUC-PR over the first i samples
        precision, recall, thresholds = precision_recall_curve(y_test[:i], y_score[:i])
        auc_precision_recall = auc(recall, precision)
        print('AUC-PR is:', auc_precision_recall)


#final AUC-PR: area under the precision-recall curve over all samples
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
auc_precision_recall = auc(recall, precision)
print('AUC-PR is:', auc_precision_recall)

In [None]:
######################################################
#obtain the Hits@N for entity prediction##############

#every triple of the inductive test set is evaluated
selected = list(data_ind_test)

#a corrupted triple is rejected if it appears in any of these data sets
known_triple_sets = (data_test, data_valid, data,
                     data_ind, data_ind_valid, data_ind_test)

#candidate entities for the corruption; built once instead of on every draw
candidate_entities = list(new_ent_set)

#the true triple is ranked against negative samples obtained by randomly
#replacing its head or tail entity
Hits_at_1 = 0
Hits_at_3 = 0
Hits_at_10 = 0
MRR_raw = 0.

for i, true_triple in enumerate(selected):

    #[... [triple, score], ...]
    triple_list = list()

    #score the true triple
    #NOTE: only the subgraph-based score is used; averaging it with
    #path_based_triple_scoring was disabled in the original cell
    s_pos, r_pos, t_pos = true_triple[0], true_triple[1], true_triple[2]

    subg_score = subgraph_triple_scoring(s_pos, r_pos, t_pos, lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

    triple_list.append([(s_pos, r_pos, t_pos), subg_score])

    #generate and score the 50 random negative samples
    for sub_i in range(50):

        #decide to replace the head or tail entity
        replace_head = random.uniform(0, 1) < 0.5

        #redraw until the corrupted triple is not an existing triple
        while True:

            e_neg = random.choice(candidate_entities)

            if replace_head:
                neg_triple = (e_neg, r_pos, t_pos)
            else:
                neg_triple = (s_pos, r_pos, e_neg)

            if not any(neg_triple in triples for triples in known_triple_sets):
                break

        subg_score = subgraph_triple_scoring(neg_triple[0], neg_triple[1], neg_triple[2], lower_bound, upper_bound_subg, one_hop_ind, id2relation, model_2)

        triple_list.append([neg_triple, subg_score])

    #random shuffle first: the sort is stable, so triples with equal scores
    #end up in random order
    random.shuffle(triple_list)

    #best score first
    sorted_list = sorted(triple_list, key = lambda x: x[-1], reverse=True)

    #p: number of triples ranked ahead of the true triple (0 = best)
    p = 0

    while p < len(sorted_list) and sorted_list[p][0] != (s_pos, r_pos, t_pos):

        p += 1

    if p == 0:

        Hits_at_1 += 1

    if p < 3:

        Hits_at_3 += 1

    if p < 10:

        Hits_at_10 += 1

    MRR_raw += 1./float(p + 1.)

    print('checkcorrect', (s_pos, r_pos, t_pos), sorted_list[p][0],
          'real score', sorted_list[p][-1],
          'Hits@1', Hits_at_1/(i+1),
          'Hits@3', Hits_at_3/(i+1),
          'Hits@10', Hits_at_10/(i+1),
          'MRR', MRR_raw/(i+1),
          'rank', p,
          'total_num', i, len(selected))