In [1]:
import os
import sys
import numpy as np
import random
from collections import defaultdict
from copy import deepcopy

In [2]:
class LoadKG:
    """Load a knowledge graph from a text file of triples into id-based containers."""

    def __init__(self):

        #placeholder attribute kept for backward compatibility; no method of this class reads it
        self.x = 'Hello'

    def load_train_data(self, data_path, one_hop, data, s_t_r, entity2id, id2entity,
                     relation2id, id2relation):
        """Read whitespace-separated triples and fill the given containers in place.

        Each non-blank line of the file is split on whitespace; field 0 is used as the
        source entity, field 1 as the relation and field 2 as the target entity.
        Blank lines are skipped. A line with fewer than three fields raises IndexError.

        Args:
            data_path: path of the triple file.
            one_hop: dict filled as one_hop[s][r] = {t, ...}; the inverse edge
                one_hop[t][inverse(r)] = {s, ...} is added for every triple.
            data: set that receives the id triples (s, r, t) of the original direction.
            s_t_r: mapping (s, t) -> set of relation ids between s and t, including
                the inverse relation under the key (t, s).
            entity2id, id2entity: entity name <-> id dictionaries, extended in place.
            relation2id, id2relation: relation name <-> id dictionaries, extended in
                place. Every new relation gets an id immediately followed by the id
                of its '_inverse_' twin. The inverse lookup relies on original
                relations having even ids, so relation2id should hold an even number
                of entries on entry.

        Returns:
            The set of raw triples (tuples of strings) read from the file.

        Ids are assigned while walking the triples in sorted order, so the mapping
        is the same on every run regardless of string hash randomisation.
        """
        data_ = set()

        ####load the triples, skipping blank lines##########
        with open(data_path, 'r') as f:

            for line in f:

                fields = line.split()

                #a blank line would otherwise become an empty tuple and crash below
                if not fields:
                    continue

                data_.add(tuple(fields))

        #a fixed iteration order makes the id assignment reproducible
        ordered_triples = sorted(data_)

        ####relation dict#################
        index = len(relation2id)

        for key in ordered_triples:

            relation = key[1]

            if relation not in relation2id:

                relation2id[relation] = index
                id2relation[index] = relation
                index += 1

                #the inverse relation takes the next id
                iv_r = '_inverse_' + relation
                relation2id[iv_r] = index
                id2relation[index] = iv_r
                index += 1

        #get the id of the inverse relation: by the definition above an initial relation
        #has an even id and its inverse has the following odd id
        def inverse_r(r):

            if r % 2 == 0: #initial relation
                return r + 1

            return r - 1 #inverse relation

        ####entity dict###################
        index = len(entity2id)

        for key in ordered_triples:

            for entity in (key[0], key[2]):

                if entity not in entity2id:

                    entity2id[entity] = index
                    id2entity[index] = entity
                    index += 1

        #create the set of triples using id instead of string
        for ele in ordered_triples:

            s = entity2id[ele[0]]
            r = relation2id[ele[1]]
            t = entity2id[ele[2]]
            r_inv = inverse_r(r)

            data.add((s, r, t))

            #forward edge
            s_t_r.setdefault((s, t), set()).add(r)
            one_hop.setdefault(s, dict()).setdefault(r, set()).add(t)

            #inverse edge
            s_t_r.setdefault((t, s), set()).add(r_inv)
            one_hop.setdefault(t, dict()).setdefault(r_inv, set()).add(s)

        return data_

In [3]:
class ObtainPathsByDynamicProgramming:
    """Randomised depth-first search for relation paths that start at one entity.

    size_bd caps the number of paths recorded for each path length;
    threshold caps the number of recursive expansions started from paths of a
    given length, so the search also stops for entities whose neighbourhood
    never reaches an accepted target.
    """

    def __init__(self, size_bd=50, threshold=5000):

        self.size_bd = size_bd

        self.threshold = threshold

    def obtain_paths(self, mode, s, t_input, lower_bd, upper_bd, one_hop):
        """Collect relation paths of length lower_bd..upper_bd that start at entity s.

        Given an entity s, this finds:
          1. the accepted target entities t (chosen by mode)
          2. most of the simple paths from s to each t within the length bounds

        One may refer to LeetCode Problem 797 for details:
            https://leetcode.com/problems/all-paths-from-source-to-target/

        Args:
            mode: 'direct_neighbour' accepts only direct neighbours of s as targets,
                'target_specified' accepts only t_input, 'any_target' accepts every
                entity that is a key of one_hop.
            s: id of the source entity; must be a key of one_hop.
            t_input: target entity, only read in 'target_specified' mode.
            lower_bd, upper_bd: inclusive path length bounds, plain ints >= 1.
            one_hop: adjacency dict, one_hop[entity][relation] = set of entities.

        Returns:
            (res, length_dict): res maps each reached target to the set of paths
            (tuples of relation ids) found from s to it; length_dict maps a path
            length to the number of paths recorded with that length.

        Raises:
            TypeError: if a bound is not an int >= 1 or lower_bd > upper_bd.
            ValueError: if s is not in one_hop or mode is unknown.

        The search draws relations and neighbours with the random module, so the
        result depends on the state of the global random generator.
        """
        if type(lower_bd) is not int or lower_bd < 1:

            raise TypeError("!!! invalid lower bound setting, must >= 1 !!!")

        if type(upper_bd) is not int or upper_bd < 1:

            raise TypeError("!!! invalid upper bound setting, must >= 1 !!!")

        if lower_bd > upper_bd:

            raise TypeError("!!! lower bound must not exced upper bound !!!")

        if s not in one_hop:

            raise ValueError('!!! entity not in one_hop. Please work on active entities for validation')

        #here is the result dict. Its key is each accepted target entity t.
        #The value of each t is a set containing the paths from s to t.
        #These paths can be either the direct connection r, or a multi-hop path
        res = defaultdict(set)

        #direct_nb contains every entity that is accepted as the end point of a path
        direct_nb = set()

        if mode == 'direct_neighbour':

            for r in one_hop[s]:

                direct_nb.update(one_hop[s][r])

        elif mode == 'target_specified':

            direct_nb.add(t_input)

        elif mode == 'any_target':

            direct_nb.update(one_hop)

        else:

            raise ValueError('not a valid mode')

        #length_dict: path length -> number of paths recorded with that length
        #count_dict: path length -> number of recursive expansions started from that length
        length_dict = defaultdict(int)
        count_dict = defaultdict(int)

        def helper(node, path, on_path_en):
            """Record the current path if it qualifies, then expand it recursively.

            node is the last entity of the path, path the list of relation ids from s
            to node, and on_path_en the set of entities already on the path. A
            neighbour is only visited when it is not in on_path_en, so paths are simple.
            """
            depth = len(path)

            #when the current path is within lower_bd and upper_bd, ends in an accepted
            #target and the quota for this length is not used up, we add the path
            if (lower_bd <= depth <= upper_bd) and (node in direct_nb) and (
                length_dict[depth] < self.size_bd):

                #The same relation path may exist between s and several targets.
                #It is counted once per (target, path) combination, so one path type
                #can be counted several times in length_dict.
                if tuple(path) not in res[node]:

                    res[node].add(tuple(path))

                    length_dict[depth] += 1

            #For some rare entities so many paths are evaluated without ever reaching
            #an accepted target that the size bound cannot stop the recursion.
            #count_dict therefore counts the expansions per length and the recursion
            #is stopped for that length (and hence for longer ones) at the threshold.
            if (depth < upper_bd) and (length_dict[depth + 1] < self.size_bd) and (
                count_dict[depth] <= self.threshold):

                #relations are drawn at random WITH replacement, one draw per
                #available relation, so the visiting order is not fixed; a relation
                #may be drawn several times and another one not at all
                relations = list(one_hop[node])

                for _ in range(len(relations)):

                    if count_dict[depth] > self.threshold:
                        break

                    r = random.choice(relations)

                    #build the neighbour list once per drawn relation
                    #instead of once per draw
                    targets = list(one_hop[node][r])

                    for _ in range(len(targets)):

                        if count_dict[depth] > self.threshold:
                            break

                        t = random.choice(targets)

                        if t not in on_path_en:

                            count_dict[depth] += 1

                            helper(t, path + [r], on_path_en | {t})

        helper(s, [], {s})

        return (res, length_dict)

In [4]:
#instantiate the two helpers: Class_1 loads the graph, Class_2 searches paths (default bounds)
Class_1, Class_2 = LoadKG(), ObtainPathsByDynamicProgramming()

In [5]:
#empty containers that load_train_data fills in place
one_hop = {}
data = set()
s_t_r = defaultdict(set)
entity2id, id2entity = {}, {}
relation2id, id2relation = {}, {}

#read the training triples; data_ini keeps the raw string triples returned by the loader
data_ini = Class_1.load_train_data(
    '../data/fb237_v4/train.txt', one_hop, data, s_t_r,
    entity2id, id2entity, relation2id, id2relation)

In [6]:
len(relation2id)

438

In [7]:
#all relation ids (the keys of id2relation) collected into a set
selectable = set(id2relation)
selectable

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [8]:
#check the correctness of loading training data:
#every id triple must translate back to a raw string triple returned by the loader
for ele in data:
    s, r, t = ele
    source, target = id2entity[s], id2entity[t]
    if r % 2 == 0:
        #even id: relation in its original direction
        relation = id2relation[r]
        expected = (source, relation, target)
    else:
        #odd id: inverse relation, so look up the original one and swap the ends
        r_ini = r - 1
        relation = id2relation[r_ini]
        expected = (target, relation, source)
    if expected not in data_ini:
        print('error')

In [9]:
#check the correctness of loading validation data
#NOTE(review): data_valid and data_ini_valid are not assigned anywhere above in this notebook,
#so on a fresh kernel this cell stops with the NameError shown below. The validation split
#has to be loaded into these two names before this check can run.
for ele in data_valid:
    s,r,t = ele[0], ele[1], ele[2]
    source = id2entity[s]
    #looked up for the even (original direction) case; the else branch replaces it
    relation = id2relation[r]
    target = id2entity[t]
    if r % 2 == 0:
        if (source, relation, target) not in data_ini_valid:
            print('error')
    else:
        #odd id: inverse relation, so check the original relation with swapped ends
        r_ini = r - 1
        relation = id2relation[r_ini]
        if (target, relation, source) not in data_ini_valid:
            print('error')

NameError: name 'data_valid' is not defined

In [9]:
#check the correctness of training one_hop:
#every stored edge (forward or inverse) must correspond to a raw training triple
for s, relation_map in one_hop.items():
    for r, neighbours in relation_map.items():
        for t in neighbours:
            source, target = id2entity[s], id2entity[t]
            if r % 2 == 0:
                #even id: edge stored in the original direction
                relation = id2relation[r]
                expected = (source, relation, target)
            else:
                #odd id: inverse edge, compare against the original relation with swapped ends
                r_ini = r - 1
                relation = id2relation[r_ini]
                expected = (target, relation, source)
            if expected not in data_ini:
                print('error')

In [33]:
#check the correctness of validation one_hop
#NOTE(review): one_hop_valid and data_ini_valid are not assigned anywhere above in this notebook,
#so on a fresh kernel this cell stops with the NameError shown below.
for s in one_hop_valid:
    for r in one_hop_valid[s]:
        for t in one_hop_valid[s][r]:
            source = id2entity[s]
            target = id2entity[t]
            if r % 2 == 0:
                #even id: edge stored in the original direction
                relation = id2relation[r]
                if (source, relation, target) not in data_ini_valid:
                    print('error')
            else:
                #odd id: inverse edge, compare against the original relation with swapped ends
                r_ini = r - 1
                relation = id2relation[r_ini]
                if (target, relation, source) not in data_ini_valid:
                    print('error')

NameError: name 'one_hop_valid' is not defined

In [10]:
###let's do some checking to see the correctness of training path finding####
#search paths of length 1..10 that start at entity id 10; in 'any_target' mode every entity
#that is a key of one_hop is accepted as an end point and the t_input argument is not read
result, length_dict = Class_2.obtain_paths('any_target', 10, 'not_specified', 1, 10, one_hop)

In [11]:
result

defaultdict(set,
            {392: {(92,)},
             393: {(92, 93),
              (92, 93, 6),
              (92, 93, 39),
              (92, 93, 64),
              (92, 93, 65)},
             2092: {(39,),
              (65,),
              (92, 93, 38),
              (92, 93, 39),
              (92, 93, 64),
              (92, 93, 65)},
             173: {(92, 93, 39, 306), (92, 324), (306,)},
             502: {(92, 93, 39, 306, 131), (92, 93, 39, 306, 131, 86, 87)},
             3347: {(92, 93, 39, 306, 131, 330), (92, 93, 39, 306, 307)},
             1780: {(92, 93, 39, 306, 131, 330, 110)},
             218: {(92, 93, 39, 306, 131, 330, 110, 1)},
             2571: {(92, 93, 39, 306, 131, 330, 110, 1, 50)},
             1358: {(92, 93, 39, 306, 131, 330, 110, 1, 50, 16)},
             93: {(92, 93, 39, 306, 131, 330, 110, 1, 50, 56)},
             2029: {(92, 93, 39, 306, 131, 330, 110, 1, 50, 3),
              (92, 93, 39, 306, 131, 330, 110, 1, 50, 135)},
             1037

In [12]:
length_dict

defaultdict(int,
            {1: 11,
             2: 200,
             3: 200,
             4: 200,
             5: 200,
             6: 200,
             7: 200,
             8: 200,
             9: 200,
             10: 200})

In [13]:
s_t_r[(10, 1420)]

set()

In [45]:
def check_len_3(s, a, b, c, t_final, one_hop, Set):
    """Add 'true' to Set if the relation path a -> b -> c leads from s to t_final.

    one_hop is the adjacency dict one_hop[entity][relation] = set of entities.
    Nothing is returned; Set is changed in place and stays untouched when no such
    path exists.
    """
    first_hop = one_hop[s]
    if a not in first_hop:
        return
    for mid_1 in first_hop[a]:
        second_hop = one_hop[mid_1]
        if b not in second_hop:
            continue
        for mid_2 in second_hop[b]:
            third_hop = one_hop[mid_2]
            if c in third_hop and t_final in third_hop[c]:
                Set.add('true')
                        
#probe: is there a path with relations 1 -> 2 -> 0 from entity 10 to entity 1411?
#Set stays empty when no such path exists
Set = set()
check_len_3(10, 1, 2, 0, 1411, one_hop, Set)
Set

set()

In [46]:
def check_len_2(s, a, b, t_final, one_hop, Set):
    """Add 'true' to Set if the relation path a -> b leads from s to t_final.

    one_hop is the adjacency dict one_hop[entity][relation] = set of entities.
    Nothing is returned; Set is changed in place and stays untouched when no such
    path exists.
    """
    first_hop = one_hop[s]
    if a not in first_hop:
        return
    for mid in first_hop[a]:
        second_hop = one_hop[mid]
        if b in second_hop and t_final in second_hop[b]:
            Set.add('true')
                
#probe: is there a path with relations 1 -> 0 from entity 10 to entity 3237?
#Set stays empty when no such path exists
Set = set()
check_len_2(10, 1, 0, 3237, one_hop, Set)
Set

set()

### Build the deep neural network structure

We use biLSTM to train on the input path embedding sequence to predict the output embedding or the relation.

In [118]:
# NOTE(review): this cell needs `keras`, `layers` and `tf` in scope. The import cell at the top
# of the notebook does not provide them, which is what the NameError below reports.
# Presumably they come from TensorFlow (`import tensorflow as tf`, `from tensorflow import keras`,
# `from tensorflow.keras import layers`) - confirm and add them to the import cell.

# Input layers, using an integer to represent each relation type.
# inputs_path is the relation path, inputs_out_re is the candidate output relation.
inputs_path = keras.Input(shape=(None,), dtype="int32")
inputs_out_re = keras.Input(shape=(None,), dtype="int32")

# Embed each relation id of the path in a 300-dimensional vector
in_embd = layers.Embedding(len(relation2id), 300)(inputs_path)

# Embed the candidate output relation in a 300-dimensional vector (separate embedding table)
ot_embd = layers.Embedding(len(relation2id), 300)(inputs_out_re)

# two stacked bi-directional LSTM layers; 150 units per direction give 300 features per step
mid_lstm = layers.Bidirectional(layers.LSTM(150, return_sequences=True))(in_embd)
out_lstm = layers.Bidirectional(layers.LSTM(150, return_sequences=True))(mid_lstm)

# sum over the time axis to get one embedding per path
sum_lstm = tf.reduce_sum(out_lstm, axis=1)

# remove the time dimension from the output embedding since there is only one step
sum_ot_embd = tf.reduce_sum(ot_embd, axis=1)

# concatenate the path embedding and the output relation embedding
concat = layers.concatenate([sum_lstm, sum_ot_embd], axis=1)

# dense block
dense_1 = layers.Dense(32, activation='relu')(concat)
batch_norm = layers.BatchNormalization()(dense_1)
dropout = layers.Dropout(0.5)(batch_norm)

# final layer: probabilities over the two classes (0 = negative sample, 1 = true relation)
final_out = layers.Dense(2, activation='softmax')(dropout)

# put together the model
model = keras.Model([inputs_path, inputs_out_re], final_out)

# compile the model
# The final layer already applies softmax, so the loss must treat its input as probabilities
# (from_logits=False). With from_logits=True the softmax would effectively be applied twice.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="Adam",
    metrics=["accuracy"],
)

NameError: name 'keras' is not defined

### Start the training
We build each big-batch containing the paths of the same length. Then, we iteratively train on different big-batches with length 2, 3, ... length_bound, which is one epoch. The length of each big-batch is N. And of course, another N size big-batch will hold the corresponding output relations.

To be specific: 
* We will first build all the big-batches before fitting the NN model. 
* That is, we will perform the ObtainPathsByDynamicProgramming class function for some randomly chosen source entities. Their corresponding target entities and paths are filled in each big-batch according to the length. In the meanwhile, the corresponding big-batch holding output relation (true relation or negative samples) will be filled in accordingly.
* Once all N slots of a big-batch are filled, that big-batch will be skipped in further filling.
* Given the s and t entities, at most M paths between them will be randomly chosen.
* If all the paths obtained from all the s entities are used up, ObtainPathsByDynamicProgramming will be called again with new randomly chosen s and corresponding t and paths. 
* Do this until all the slots in all big-batches are filled.
* In every epoch, the big-batches will be re-filled.

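Then, in the training, we will use negative sampling: In each batch (actual batch, not the big-batch), we will include K true output relation embeddings and K randomly selected output relation embeddings. In the code below the labels are stored as class indices, 1.0 for a true relation and 0.0 for a negative sample, which corresponds to the one-hot targets [0,1] and [1,0] of the two-unit softmax output.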

In [18]:
#we create a connection dictionary: key is each path, 
#value is a set containing all relations that occurred with this path.
#That is: if X, r1,r2...rn, Y is a path from X to Y, and X,r,Y is a triple,
#then in this dictionary, we have, (r1,r2...rn): {..., r, ...}
#we build this dictionary to avoid false negative when generating negative samples

##############################################
#function to filling the connect_dict by enough amount of iterations
def fill_in_connect_dictionary(connect_dict, iterations, lower_bd, upper_bd, Class_2, one_hop, s_t_r, 
                               relation2id, entity2id, id2relation, id2entity):
    """Record, for every path found, the direct relations seen between its end points.

    For `iterations` rounds a source entity is taken from the keys of one_hop in
    round-robin order, paths to its direct neighbours are searched with
    Class_2.obtain_paths, and for each target that has at least one direct relation
    in s_t_r all those relations are merged into connect_dict[path].

    connect_dict and s_t_r are indexed with keys that may be missing, so both are
    expected to create missing entries themselves (e.g. defaultdict(set)).
    relation2id, entity2id, id2relation and id2entity are accepted but not used here.
    Progress is printed every 1000 iterations.
    """
    #for validation case: not all entities in entity2id are in one_hop,
    #so only the entities that actually occur in one_hop are used as sources
    existing_ids = list({entity for entity in one_hop})

    for iteration in range(iterations):

        #walk through the sources in round-robin order
        source_id = existing_ids[iteration % len(existing_ids)]

        result, length_dict = Class_2.obtain_paths('direct_neighbour', source_id, 
                                                   'not_specified', lower_bd, upper_bd, one_hop)

        for target_id, paths in result.items():

            direct_relations = s_t_r[(source_id, target_id)]

            #targets without a direct relation contribute nothing
            if len(direct_relations) == 0:
                continue

            for path in paths:
                connect_dict[path] = connect_dict[path].union(direct_relations)

        if iteration % 1000 == 0:
            print('filling connect_dict', iteration, iterations)

In [36]:
#function for filling big-batch lists
def fill_in_big_batch_lists(mode, existing_ids, x_p_lists, x_r_lists, y_lists, len_record,
                            half_len, connect_dict, path_set, lower_bd, length, Class_2, one_hop, s_t_r, 
                            relation2id, entity2id, id2relation, id2entity, epoch):
    """Append positive or negative samples to the big-batch lists until `length` is full.

    Source entities are drawn at random from existing_ids and paths of length
    lower_bd..length are searched with Class_2.obtain_paths. Every path that is new
    for its length (not yet in path_set) and whose half-batch still has room is
    appended to x_p_lists, together with one relation id in x_r_lists and a label in
    y_lists:
      * mode 'positive': the relation is drawn from the direct relations between the
        source and the target of the path (s_t_r), the label is 1.
      * mode 'negative': the relation is drawn from all ids in id2relation except the
        direct relations between the source and the target, the label is 0.

    The loop ends when len_record[length] reaches half_len, or after
    3*len(entity2id) source entities have been tried. Shorter lengths are filled as a
    side product. All list/dict arguments are modified in place; nothing is returned.

    connect_dict, relation2id and id2entity are accepted but not used here.

    Raises:
        ValueError: if mode is neither 'positive' nor 'negative'.
    """
    if mode == 'positive':
        search_mode, label = 'direct_neighbour', 1.
    elif mode == 'negative':
        search_mode, label = 'any_target', 0.
    else:
        raise ValueError('undefined mode beyond positive batch generation and negative batch generation')

    #all relation ids, built once instead of once per path
    all_relation_ids = {i for i in id2relation}

    carry_on = True
    total_count = 0
    slot_checkpoint = -1

    while carry_on:

        source_id = random.choice(existing_ids)

        result, length_dict = Class_2.obtain_paths(search_mode, source_id, 
                                                   'not_specified', lower_bd, length, one_hop)

        for target_id in result:

            if not carry_on:
                break

            #relations that may be paired with the paths of this (source, target) pair.
            #They are derived freshly for every target, so a target without direct
            #relations can never inherit the relations of the previous target.
            pair = (source_id, target_id)

            if mode == 'positive':
                candidates = list(s_t_r[pair]) if pair in s_t_r else []
            elif pair in s_t_r:
                candidates = list(all_relation_ids.difference(s_t_r[pair]))
            else:
                candidates = list(all_relation_ids)

            for path in result[target_id]:

                if not carry_on:
                    break

                path_len = len(path)

                if (path_len >= lower_bd) and (len_record[path_len] < half_len) and (
                    path not in path_set[path_len]):

                    #without any candidate relation the path cannot be labelled, skip it
                    if candidates:

                        x_p_lists[path_len].append(path)
                        x_r_lists[path_len].append(random.choice(candidates))
                        y_lists[path_len].append(label)

                        len_record[path_len] += 1
                        path_set[path_len].add(path)

                #here we use the slot_checkpoint to avoid repetitive printing of the counting result
                if len_record[length] % 1000 == 0 and len_record[length] != slot_checkpoint:

                    print('generating the', mode, 'half', length, len_record[length], half_len, 'for epoch', epoch)

                    slot_checkpoint = len_record[length]

                if len_record[length] >= half_len:

                    carry_on = False

        total_count += 1

        #safety stop: give up after trying three times as many sources as there are entities
        if total_count >= 3*len(entity2id):

            carry_on = False

In [37]:
#function to build all the big batches
def build_big_batches(holder_len, connect_dict, lower_bd, upper_bd, Class_2, one_hop, s_t_r, 
                      relation2id, entity2id, id2relation, id2entity, epoch):
    """Build one big-batch per path length and split each into train and validation arrays.

    For every length in lower_bd..upper_bd, holder_len samples are generated with
    fill_in_big_batch_lists: half_len positive ones and half_len negative ones.
    The samples are shuffled; the first 90% go to the train arrays and the
    remaining 10% to the validation arrays.

    Args:
        holder_len: size of each big-batch, must be a multiple of 10.
        connect_dict: passed through to fill_in_big_batch_lists.
        lower_bd, upper_bd: inclusive range of path lengths.
        Class_2: path finder offering obtain_paths.
        one_hop, s_t_r, relation2id, entity2id, id2relation, id2entity: graph
            containers, passed through to fill_in_big_batch_lists.
        epoch: only used in progress messages.

    Returns:
        (x_p_train, x_r_train, y_train, x_p_valid, x_r_valid, y_valid, pos_record,
        neg_record). The first six are dicts keyed by path length holding numpy
        arrays of paths (n, length), relation ids (n, 1) and labels (n,);
        pos_record / neg_record map a length to the number of positive / negative
        samples generated.

    Raises:
        ValueError: if holder_len is not a multiple of 10, if the sample lists of
            one length disagree in size, or if fewer than holder_len samples could
            be generated for a length.
    """
    if holder_len % 10 != 0:
        raise ValueError('We would like to take 10X as a big-batch size')

    #we first append the positive half, then the negative half. After that we shuffle the list
    half_len = int(holder_len/2)

    #90% to be train, 10% to be validation
    train_len = 9*int(holder_len/10)
    valid_len = int(holder_len/10)

    lengths = range(lower_bd, upper_bd+1)

    #train holder arrays
    x_p_train = {length: np.zeros((train_len, length), dtype=int) for length in lengths}
    x_r_train = {length: np.zeros((train_len, 1), dtype=int) for length in lengths}
    y_train = {length: np.zeros((train_len,)) for length in lengths}

    #validation holder arrays
    x_p_valid = {length: np.zeros((valid_len, length), dtype=int) for length in lengths}
    x_r_valid = {length: np.zeros((valid_len, 1), dtype=int) for length in lengths}
    y_valid = {length: np.zeros((valid_len,)) for length in lengths}

    #the samples are first collected in lists and copied into the arrays afterwards
    x_p_lists = {length: [] for length in lengths}
    x_r_lists = {length: [] for length in lengths}
    y_lists = {length: [] for length in lengths}

    #number of samples generated so far for each half batch (pos/neg)
    pos_record = {length: 0 for length in lengths}
    neg_record = {length: 0 for length in lengths}

    #for validation case: not all entities in entity2id are in one_hop,
    #so only the entities that actually occur in one_hop are used as sources
    existing_ids = list({s_1 for s_1 in one_hop})

    #we want to remove duplicated paths in the big-batch, so use path_set to keep unique ones
    path_set = defaultdict(set)

    #Start from the longest length and go down: a search for long paths also yields
    #shorter ones, so the lower lengths are pre-filled by the earlier iterations
    #and the overall filling is much faster.
    for length in range(upper_bd, lower_bd-1, -1):

        fill_in_big_batch_lists('positive', existing_ids, x_p_lists, x_r_lists, y_lists, pos_record,
                                half_len, connect_dict, path_set, lower_bd, length, Class_2, one_hop, s_t_r, 
                                relation2id, entity2id, id2relation, id2entity, epoch)

        fill_in_big_batch_lists('negative', existing_ids, x_p_lists, x_r_lists, y_lists, neg_record,
                                half_len, connect_dict, path_set, lower_bd, length, Class_2, one_hop, s_t_r, 
                                relation2id, entity2id, id2relation, id2entity, epoch)

    #shuffle the lists and fill in the arrays
    for length in range(upper_bd, lower_bd-1, -1):

        filled = len(x_p_lists[length])

        if (filled != len(x_r_lists[length])) or (
            len(x_r_lists[length]) != len(y_lists[length])) or (
            filled != pos_record[length] + neg_record[length]):

            raise ValueError('list of the same path-length does not in the same size')

        #the filling gives up after a bounded number of tries, so a length may stay
        #under-filled; report that clearly instead of failing with an IndexError below
        if filled < holder_len:

            raise ValueError(
                'only {} of {} samples could be generated for path length {}'.format(
                    filled, holder_len, length))

        id_list = list(range(holder_len))

        random.shuffle(id_list)

        #first train_len shuffled samples -> train arrays
        for i in range(train_len):

            ID = id_list[i]

            x_p_train[length][i] = np.asarray(x_p_lists[length][ID])
            x_r_train[length][i] = x_r_lists[length][ID]
            y_train[length][i] = y_lists[length][ID]

        #remaining valid_len shuffled samples -> validation arrays
        for i in range(valid_len):

            ID = id_list[train_len + i]

            x_p_valid[length][i] = np.asarray(x_p_lists[length][ID])
            x_r_valid[length][i] = x_r_lists[length][ID]
            y_valid[length][i] = y_lists[length][ID]

    return (x_p_train, x_r_train, y_train, x_p_valid, x_r_valid, y_valid, pos_record, neg_record)

In [39]:
#path -> set of direct relations seen together with that path
connect_dict = defaultdict(set)

#iterations is 0 here, so the filling loop does not run and connect_dict stays empty
fill_in_connect_dictionary(
    connect_dict, 0, 2, 4, Class_2, one_hop, s_t_r, 
    relation2id, entity2id, id2relation, id2entity)

In [40]:
#build big-batches of 1000 samples for every path length from 2 to 7 (epoch index 0)
x_p_train, x_r_train, y_train, x_p_valid, x_r_valid, y_valid, pos_record, neg_record = build_big_batches(
    1000, connect_dict, 2, 7, Class_2, one_hop, s_t_r, 
    relation2id, entity2id, id2relation, id2entity, 0)

generating the negative half 7 0 500 for epoch 0


In [41]:
pos_record

{2: 500, 3: 500, 4: 500, 5: 500, 6: 500, 7: 500}

In [42]:
neg_record

{2: 500, 3: 500, 4: 500, 5: 500, 6: 500, 7: 500}

In [43]:
#count the distinct paths in one train big-batch; both numbers must refer to the
#same path length, otherwise the comparison says nothing about duplicates
check_length = 7
temp = {tuple(ele) for ele in x_p_train[check_length]}

len(x_p_train[check_length]), len(temp)

(900, 900)

In [34]:
temp

{(2, 37, 68, 21),
 (2, 37, 68, 69),
 (2, 37, 68, 247),
 (2, 110, 93, 51),
 (2, 110, 175, 61),
 (2, 110, 214, 329),
 (2, 245, 111, 200),
 (2, 245, 111, 362),
 (2, 245, 111, 363),
 (2, 245, 207, 1),
 (24, 33, 1, 25),
 (24, 33, 32, 25),
 (24, 33, 42, 43),
 (24, 33, 44, 45),
 (24, 33, 94, 69),
 (24, 33, 94, 95),
 (24, 33, 94, 247),
 (24, 33, 108, 201),
 (24, 33, 108, 307),
 (24, 60, 61, 25),
 (24, 60, 61, 50),
 (24, 60, 61, 60),
 (24, 60, 61, 372),
 (24, 60, 138, 24),
 (24, 60, 138, 51),
 (24, 60, 138, 204),
 (24, 60, 138, 260),
 (24, 60, 138, 306),
 (28, 29, 24, 33),
 (28, 29, 91, 112),
 (28, 29, 91, 113),
 (28, 29, 91, 127),
 (28, 29, 91, 366),
 (28, 29, 144, 159),
 (28, 29, 161, 96),
 (28, 29, 161, 126),
 (28, 29, 203, 34),
 (28, 29, 203, 56),
 (28, 425, 118, 24),
 (28, 425, 118, 34),
 (28, 425, 118, 51),
 (28, 425, 118, 56),
 (28, 425, 198, 34),
 (31, 30, 24, 25),
 (31, 30, 24, 50),
 (31, 30, 28, 29),
 (31, 30, 34, 158),
 (31, 30, 36, 51),
 (31, 30, 36, 110),
 (31, 30, 36, 357),
 (31, 

In [47]:
#check the correctness for positive
def check_correctness_3_positive(one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_train, neg_train):
    """Verify the positive samples of the length-3 train big-batch by brute force.

    A positive sample (label 1.0) is correct if at least one entity pair (s, t) that
    is directly connected in one_hop is also connected by the sample's 3-relation
    path and has the sample's output relation in s_t_r[(s, t)].

    All rows of y_train[3] are checked. pos_train and neg_train are accepted but not
    used. A progress line is printed for positive rows whose index is a multiple
    of 100.

    Returns:
        (total_error, num_pos): number of positive samples without a supporting
        entity pair, and number of positive samples checked.
    """
    total_error = 0
    num_pos = 0

    #use the actual number of rows instead of a fixed batch size
    for i in range(len(y_train[3])):

        Tuple = tuple(x_p_train[3][i])
        r_0 = x_r_train[3][i][0]
        l = y_train[3][i]

        if l == 1.0:
            num_pos += 1

            #stays True until a supporting entity pair is found
            error = True
            for s in one_hop:
                if not error:
                    break
                for r in one_hop[s]:
                    if not error:
                        break
                    for t in one_hop[s][r]:

                        Set = set()
                        check_len_3(s, Tuple[0], Tuple[1], Tuple[2], t, one_hop, Set)

                        #an (s,t) pair with the len-3 connection that also has r_0 between s and t
                        if 'true' in Set and r_0 in s_t_r[(s,t)]:
                            error = False
                            break
            if error:
                total_error += 1
            if i % 100 == 0:
                print('checking big batch correctness', i)

    return (total_error, num_pos)

#run the brute-force check; total_error is the number of positive samples that failed it
total_error, num_pos = check_correctness_3_positive(one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_record, neg_record)
print(total_error, num_pos)

checking big batch correctness 100
checking big batch correctness 200
checking big batch correctness 500
checking big batch correctness 700
0 457


In [48]:
#check the correctness for negative
temp_record = defaultdict(set)
def check_correctness_3_negative(temp_record, one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_train, neg_train,
                                 num_samples=900):
    """Count negative length-3 samples that the graph actually supports.

    A sample with label 0.0 counts as an error when some (s, t) pair
    enumerated from one_hop is connected by the sample's three relations
    (check_len_3 adds 'true' to the set it is given) and the sample's
    relation is listed in s_t_r for that pair. Each sample is counted at
    most once; the search stops at the first such pair.

    Args:
        temp_record: mapping filled in place; for every erroneous sample the
            relation is added to the set stored under the 3-tuple of path
            relations. A plain dict works as well as a defaultdict(set).
        one_hop: nested mapping iterated as one_hop[s][r] -> iterable of t.
        s_t_r: mapping (s, t) -> collection of relations. It is only read;
            lookups go through .get so a defaultdict is not grown.
        x_p_train: x_p_train[3][i] holds the three relations of sample i.
        x_r_train: x_r_train[3][i][0] holds the relation of sample i.
        y_train: y_train[3][i] holds the label of sample i.
        pos_train, neg_train: not used; kept so existing calls still work.
        num_samples: how many leading samples to inspect. Defaults to 900,
            the value that used to be hard-coded; it is clamped to the
            number of labels available.

    Returns:
        (total_error, num_neg): the number of negative samples that are in
        fact supported, and the number of negative samples inspected.
    """
    path_len = 3

    def has_supporting_pair(path, relation):
        #stop at the first (s, t) that is linked by the path and carries the relation
        for s in one_hop:
            for r in one_hop[s]:
                for t in one_hop[s][r]:
                    found = set()
                    check_len_3(s, path[0], path[1], path[2], t, one_hop, found)
                    if 'true' in found and relation in s_t_r.get((s, t), ()):
                        return True
        return False

    labels = y_train[path_len]
    limit = min(num_samples, len(labels))

    total_error = 0
    num_neg = 0
    for i in range(limit):

        if labels[i] == 0.0:
            num_neg += 1
            path = tuple(x_p_train[path_len][i])
            relation = x_r_train[path_len][i][0]
            if has_supporting_pair(path, relation):
                total_error += 1
                temp_record.setdefault((path[0], path[1], path[2]), set()).add(relation)

        #report progress for every 100th sample, whatever its label is
        if i % 100 == 0:
            print('checking big batch correctness', i)

    return(total_error, num_neg)

#run the negative check; temp_record is filled in place with path -> relations of the offending samples
#prints: number of negative samples that are actually supported by the graph, number of negative samples
total_error, num_neg = check_correctness_3_negative(temp_record, one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_record, neg_record)
print(total_error, num_neg)

checking big batch correctness 0
checking big batch correctness 300
checking big batch correctness 400
checking big batch correctness 600
checking big batch correctness 800
4 443


In [49]:
#check the correctness for negative
temp_record = defaultdict(set)
def check_correctness_3_negative(temp_record, one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_train, neg_train,
                                 num_samples=900):
    """Tally, per connection, how often negative length-3 samples are contradicted.

    This definition replaces the earlier function of the same name once the
    cell is run, and it returns three values instead of two. For every
    sample with label 0.0, all (s, r, t) entries of one_hop are visited;
    whenever check_len_3 reports that s and t are linked by the sample's
    three relations, the entry is tallied as an error if the sample's
    relation is listed in s_t_r for (s, t), and as correct otherwise.
    Counts are per visited entry, so they can exceed the number of samples.

    Args:
        temp_record: mapping filled in place; the relation of every
            contradicted sample is added to the set stored under the
            3-tuple of path relations. A plain dict works as well as a
            defaultdict(set).
        one_hop: nested mapping iterated as one_hop[s][r] -> iterable of t.
        s_t_r: mapping (s, t) -> collection of relations. It is only read;
            lookups go through .get so a defaultdict is not grown.
        x_p_train: x_p_train[3][i] holds the three relations of sample i.
        x_r_train: x_r_train[3][i][0] holds the relation of sample i.
        y_train: y_train[3][i] holds the label of sample i.
        pos_train, neg_train: not used; kept so existing calls still work.
        num_samples: how many leading samples to inspect. Defaults to 900,
            the value that used to be hard-coded; it is clamped to the
            number of labels available.

    Returns:
        (total_error, total_corre, num_neg)
    """
    path_len = 3
    labels = y_train[path_len]
    limit = min(num_samples, len(labels))

    total_error = 0
    total_corre = 0
    num_neg = 0
    for i in range(limit):

        if labels[i] == 0.0:
            num_neg += 1
            path = tuple(x_p_train[path_len][i])
            relation = x_r_train[path_len][i][0]
            record_key = (path[0], path[1], path[2])

            for s in one_hop:
                for r in one_hop[s]:
                    for t in one_hop[s][r]:

                        found = set()
                        check_len_3(s, path[0], path[1], path[2], t, one_hop, found)

                        #only (s, t) pairs linked by the path are tallied
                        if 'true' not in found:
                            continue

                        if relation in s_t_r.get((s, t), ()):
                            total_error += 1
                            temp_record.setdefault(record_key, set()).add(relation)
                        else:
                            total_corre += 1

        if i % 100 == 0:
            print('checking big batch correctness', i)

    return(total_error, total_corre, num_neg)

#run the per-connection variant of the negative check; temp_record is filled in place
#total_error and total_corre are tallied once per matching (s, r, t) entry, so they can exceed num_neg
total_error, total_corre, num_neg = check_correctness_3_negative(temp_record, one_hop, s_t_r, x_p_train, x_r_train, y_train, pos_record, neg_record)
print(total_error, total_corre, num_neg)

checking big batch correctness 0
checking big batch correctness 100
checking big batch correctness 200
checking big batch correctness 300
checking big batch correctness 400
checking big batch correctness 500
checking big batch correctness 600
checking big batch correctness 700
checking big batch correctness 800
72 28183 443


In [185]:
temp_record

defaultdict(set,
            {(1, 1, 3): {3, 6, 7},
             (1, 2, 3): {3},
             (2, 1, 1): {6},
             (1, 2, 2): {2},
             (3, 0, 1): {9},
             (1, 1, 0): {8, 9},
             (1, 0, 1): {8, 9},
             (0, 0, 3): {7},
             (1, 0, 2): {8},
             (2, 0, 0): {6, 7},
             (2, 1, 0): {6},
             (1, 0, 0): {8, 9},
             (0, 8, 3): {1, 8},
             (1, 1, 1): {9},
             (2, 0, 1): {6},
             (0, 1, 3): {6, 7},
             (10, 11, 2): {6},
             (0, 1, 0): {8, 9},
             (0, 0, 1): {8},
             (3, 1, 2): {2},
             (1, 1, 2): {8},
             (0, 1, 1): {9},
             (2, 0, 3): {3},
             (2, 3, 0): {2},
             (0, 8, 2): {8},
             (0, 0, 6): {3},
             (2, 6, 2): {7},
             (3, 1, 0): {9}})

In [186]:
#compare temp_record against connect_dict:
#no_key   - keys of temp_record that connect_dict does not contain
#no_value - values recorded under a shared key that connect_dict lacks for that key
total = 0
no_key = sum(1 for key in temp_record if key not in connect_dict)
no_value = sum(
    1
    for key in temp_record
    if key in connect_dict
    for value in temp_record[key]
    if value not in connect_dict[key]
)

In [187]:
no_key

0

In [188]:
no_value

37

In [241]:
connect_dict

defaultdict(set,
            {(2, 2, 3, 1): {0, 1, 3, 8},
             (0, 1, 2, 3): {2, 4, 5, 6, 8, 10, 11},
             (0, 2, 0, 2): {2, 3, 7},
             (0, 2, 1, 2): {0, 1, 2, 3, 7},
             (0, 2, 2, 3): {0, 1},
             (2, 1, 2, 3): {0, 1, 8, 12},
             (1, 2, 3, 3): {0, 1, 2, 9},
             (2, 0, 2, 3): {0, 1, 8, 12},
             (0, 1, 0): {0, 1, 2, 3, 6, 7},
             (0, 1, 1): {0, 1, 2, 3, 6, 7, 8},
             (0, 1, 2): {0, 1, 2, 3, 4, 6, 7, 10, 11},
             (1, 2, 2, 1): {0, 1, 2, 3, 5, 7},
             (1, 2, 1, 7): {2, 3, 6, 7},
             (3, 0, 0): {0, 1, 2, 3, 5, 6, 7, 10, 11},
             (3, 0, 1): {0, 1, 2, 3, 5, 6, 7, 10, 11},
             (3, 1, 0): {0, 1, 2, 3, 5, 6, 7, 10, 11},
             (3, 6): {3},
             (3, 6, 0, 3): {1},
             (3, 1, 1): {0, 1, 2, 3, 5, 6, 7, 10, 11},
             (3, 7): {2, 3, 6, 7},
             (3, 1, 0, 2): {0, 1, 2, 3, 4, 5, 10, 11},
             (3, 0, 0, 2): {0, 1, 2, 3, 4, 5, 

In [None]:
#check the correctness of big-batch building for validation
#build the validation big-batches once, outside the training loop, so they can be inspected
#validation uses its own one_hop_valid / s_t_r_valid but the shared relation2id / entity2id
#NOTE(review): entire_epoch is only assigned by the training loops in other cells, so this
#cell depends on leftover kernel state - confirm before relying on it
x_p_valid, x_r_valid, y_valid, len_valid = build_big_batches(
holder_len_valid, path_l_bd, Class_2, one_hop_valid, s_t_r_valid, 
    relation2id, entity2id,
    'filling the validation data', entire_epoch)

### Start Training: load the KG and call classes

Here, we use the validation set to monitor how well training is going. That is, we use the validation set to check whether the true relation between entities can be predicted from paths.

The trick is: in validation, we have to use the same relation IDs and entity IDs as in training, but we no longer want to use the training links. That is, in validation we keep using (and update if necessary) entity2id, id2entity, relation2id and id2relation, but we use a new one_hop, data, data_ and s_t_r built from the validation set. Path-finding is then also based on the new one_hop.


In [82]:
#Load the training KG, then the validation KG.
#The id dictionaries (entity2id, id2entity, relation2id, id2relation) are shared by both
#loads; one_hop, data, data_ini and s_t_r get separate *_valid containers for validation.

#define the dictionaries and sets for load KG
one_hop = dict() 
data = set()
data_ini = set()
entity2id = dict()
id2entity = dict()
relation2id = dict()
id2relation = dict()
s_t_r = defaultdict(set)

#read the training KG triples
Class_1 = LoadKG()
Class_2 = ObtainPathsByDynamicProgramming()

#fill in the sets and dicts
#NOTE(review): the LoadKG.load_train_data shown at the top of the notebook takes
#(data_path, one_hop, data, s_t_r, entity2id, id2entity, relation2id, id2relation);
#this call passes nine arguments with s_t_r last, so presumably a later redefinition
#of LoadKG is in effect - confirm which class definition this cell runs against
Class_1.load_train_data('../train.txt', one_hop, data, data_ini, 
                        entity2id, id2entity, relation2id, id2relation, s_t_r)

#fresh link containers for validation; the id dictionaries are reused on purpose
one_hop_valid = dict() 
data_valid = set()
data_ini_valid = set()
s_t_r_valid = defaultdict(set)

#remember the vocabulary sizes before the validation triples are read
len_0 = len(relation2id)
size_0 = len(entity2id)

Class_1.load_train_data('../valid.txt', one_hop_valid, data_valid, data_ini_valid, 
                        entity2id, id2entity, relation2id, id2relation, s_t_r_valid)

len_1 = len(relation2id)
size_1 = len(entity2id)

#a grown relation2id means validation introduced a relation unseen in training
#size_0 / size_1 (entity counts) are recorded but not compared in this cell
if len_0 != len_1:
    raise ValueError('unseen relation!')

In [83]:
#Alternate between rebuilding the big-batches and fitting the model.
#holder_len_* are handed to build_big_batches as the first argument, path_l_bd as the second.
holder_len_train = 100
holder_len_valid = 20
path_l_bd = 10
entire_epochs = 10

for entire_epoch in range(entire_epochs):
    
    #######################################
    ###build the train big-batches#########
    
    #fill in the training array list
    x_p_train, x_r_train, y_train, len_train = build_big_batches(
        holder_len_train, path_l_bd, Class_2, one_hop, s_t_r,
        relation2id, entity2id,
        'filling the training data', entire_epoch)

    
    ########################################
    ###build the validation big-batches#####
        
    #fill in the validation array list
    #validation shares the id dictionaries of training (relation2id, entity2id);
    #no relation2id_valid / entity2id_valid is defined by the loading cell, only
    #one_hop_valid and s_t_r_valid are validation-specific
    x_p_valid, x_r_valid, y_valid, len_valid = build_big_batches(
        holder_len_valid, path_l_bd, Class_2, one_hop_valid, s_t_r_valid,
        relation2id, entity2id,
        'filling the validation data', entire_epoch)
   

    #######################################
    ###do the training#####################
    for path_len in len_train:
        
        #report the outer epoch counter (entire_epoch); the name `epochs` is not defined
        print('training on length', path_len, path_l_bd, 'for epoch', entire_epoch)
        
        model.fit([x_p_train[path_len], x_r_train[path_len]], y_train[path_len], 
                  validation_data=([x_p_valid[path_len], x_r_valid[path_len]], 
                                    y_valid[path_len]),
                  batch_size=16, epochs=10)

KeyError: 8710

In [None]:
    #sometimes in validation, paths of a specific length may be very scarce
    #we detect this situation by comparing the total number of fill attempts to the filled amount of a length
    #if the big-batch of one length is still not full after a threshold, we give it up.
    #Then, we record the abandoned big-batches,
    #so that they are skipped, in training or testing (validation)
    fill_threshold = 200*holder_len*path_l_bd
    fill_count = 0
    abandon_len = {length: False for length in range(2, path_l_bd+1)}
    
    carry_on = True

In [None]:
    ########################################
    ###build the validation big-batches#####
    
    #load the knowledge graph
#    Class = LoadKG('../valid.txt')
#    one_hop, data, entity_dict, inverse_entity_dict, relation_dict, inverse_relation_dict, s_t_r = Class.load_train_data()

    #call the path-finding class function
#    Class_2 = ObtainPathsByDynamicProgramming(one_hop, data, entity_dict, inverse_entity_dict, relation_dict, inverse_relation_dict)
    
    #fill in the training array list
#    x_p_valid, x_r_valid, y_valid, len_valid = build_big_batches(
#    holder_len_valid, path_l_bd, Class_2, s_t_r, relation_dict, 'filling the validation data', epoch)
    
#    del(Class, Class_2, one_hop, data, entity_dict, inverse_entity_dict, relation_dict, inverse_relation_dict, s_t_r)


In [96]:
source_entity

14071

In [75]:
#one zeroed counter per path length from 2 up to and including path_l_bd
len_record = dict.fromkeys(range(2, path_l_bd+1), 0)
len_record

{2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}

In [None]:
    for length in range(2, path_l_bd + 1):
        
        #build the big-batch
        x_holder = np.zeros((holder_len, length))
        y_holder = np.zeros((holder_len, 2))
        
        #one half of every batch holds the true samples, while the other half holds negative samples.
        for i in range(holder_len):
            
            remain = i % batch_size
            
            if remain < half_batch_size:
                
            else:

In [52]:
#scratch: assigning a scalar to one row of a (10, 1) zero array
test = np.zeros(shape=(10, 1))
test[2, :] = 5
test

array([[0.],
       [0.],
       [5.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [43]:
#scratch: draw one element uniformly from a fixed sequence
temp = random.choice((1, 2, 3, 4, 5))
print(temp)

3


In [236]:
#scratch: summing the values of a dict
test = dict(zip((1, 2, 3), (10, 15, 5)))
test_2 = dict(test)
print(sum(test.values()))

30


In [237]:
#drop the scratch variables again
del test, test_2

In [None]:
#scratch: feed a tiny hand-made pair of inputs through the model and print the first output of each row
list_a = [[10,12],[3,5],[6,7]]
list_b = [[3],[5],[6]]

input_a = np.asarray(list_a)
input_b = np.asarray(list_b)

pred = model.predict([input_a, input_b])

for i in range(len(pred)):
    print(pred[i][0])

In [26]:
#scratch: collecting the keys of a dict yields a set
test_1 = dict.fromkeys((1, 2, 3), 0)
test_2 = set(test_1)
type(test_2)

set

In [None]:
#Alternate between rebuilding the big-batches and fitting the model.
holder_len_train = 10000
holder_len_valid = 1000
lower_bd = 2
upper_bd = 10
entire_epochs = 10

for entire_epoch in range(entire_epochs):

    #rebuild the training big-batches from the training graph
    x_p_train, x_r_train, y_train, len_train = build_big_batches(
        holder_len_train, lower_bd, upper_bd, Class_2, one_hop, s_t_r,
        relation2id, entity2id,
        'filling the training data', entire_epoch)

    #rebuild the validation big-batches from the validation graph,
    #with the same id dictionaries as training
    x_p_valid, x_r_valid, y_valid, len_valid = build_big_batches(
        holder_len_valid, lower_bd, upper_bd, Class_2, one_hop_valid, s_t_r_valid,
        relation2id, entity2id,
        'filling the validation data', entire_epoch)

    #fit once per path length
    for path_len in len_train:

        #skip a length unless both of its recorded sizes equal the requested holder lengths
        #(presumably this means the big-batch was filled completely - confirm in build_big_batches)
        if len_train[path_len] != holder_len_train:
            continue
        if len_valid[path_len] != holder_len_valid:
            continue

        print('training on length', path_len, upper_bd, 'for epoch', entire_epoch)

        model.fit([x_p_train[path_len], x_r_train[path_len]], y_train[path_len],
                  validation_data=([x_p_valid[path_len], x_r_valid[path_len]],
                                   y_valid[path_len]),
                  batch_size=16, epochs=2)