## Download and check rdf2vec

In [1]:
VECTOR_SIZE=100

## Download Dataset


In [2]:
# download .nt dataset from my drive
! wget -q -nc --no-check-certificate 'https://docs.google.com/uc?export=download&id=1pBnn8bjI2VkVvBR33DnvpeyocfDhMCFA' -O fb15k-237_nt.zip

! unzip -n fb15k-237_nt.zip

/bin/bash: /home/otautz/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
wget: /home/otautz/miniconda3/lib/libuuid.so.1: no version information available (required by wget)
/bin/bash: /home/otautz/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Archive:  fb15k-237_nt.zip


In [3]:
import numpy as np
import gc

def get_entities(graphs):
    # get subjects and objects
    entities = []
    
    for g in graphs:
        entities = entities + list(g.subjects(unique=True)) + list(g.objects(unique=True))

    # pythons stupid version of nub
    entities = list(dict.fromkeys(entities))
    return entities

def get_all_corrupted_triples_fast(triple,entities,position = 'object'):
    # not faster ...

    s,p,o = triple

    object_augmented = [(x,y,z) for  (x,y), z in itertools.product([triple[0:2]],entities)]
    subject_augmented =[(x,y,z) for  x, (y,z) in itertools.product(entities,[triple[1:3]])]
    
    
    return itertools.chain(object_augmented , subject_augmented)

def get_all_corrupted_triples(triple,entities):
    #too slow ....
    
    s,p,o = triple
    subject_corrupted = [(s_corr,p,o) for s_corr in entities if s_corr != s]
    object_corrupted = [(s,p,o_corr)   for o_corr in entities if o_corr != o]

    return subject_corrupted + object_corrupted


    

def choose_many_multiple(arrs,n):
    l = len(arrs[0])
    for a in arrs:
        assert len(a) == l, 'Arres not of same length ! :('
        
    
    ix = np.random.choice(range(len(a)),n)
    
    return [np.array(a)[ix] for a in arrs]
    
def choose_many(a,n):
    ix = np.random.choice(range(len(a)),n)
    return np.array(a)[ix]
    
def choose(a):

    L = len(a)

    i = np.random.randint(0,L)

    return a[i]

def get_random_corrupted_triple(triple,entities, corrupt='object'):
    """
    corrupt = one of 'subject', 'object', 'both'
    
    return corrupted triple with random entity
    """

    s,p,o = triple
    
    # set up as the same
    s_corr = s
    o_corr = o
    
    if corrupt == 'subject':  
        # corrupt only the subject
        while s_corr == s:
            s_corr = choose(entities)  
    elif corrupt == 'object':
        # corrupt only the object
        while o_corr == o:
            o_corr = choose(entities)  
    elif corrupt == 'random':
        # corrupt one or both randomly
        ch = np.random.randint(3)
        
        if ch == 0:
            while s_corr == s:
                s_corr = choose(entities)  
        if ch == 1 :
            while o_corr == o:
                o_corr = choose(entities)  
        if ch == 2:
            while s_corr == s or o_corr == o:
                s_corr = choose(entities)  
                o_corr = choose(entities) 
    else:
        while s_corr == s or o_corr == o:
            s_corr = choose(entities)  
            o_corr = choose(entities) 
            
    
    return (s_corr,p,o_corr)
    
def merge_historires(history_list):
    h = {}
    for key in history_list[0].history.keys():
        h[key] = [h.history[key][0] for h in histories]
    return h    


def clean_graph(graph,wv):
    """
    clean graph such that all triples have word vectors present in wv
    
    """
    no_removed = 0 
    for t in graph:
        s,p,o = t
        if not str(s) in wv.key_to_index.keys() or not str(p) in wv.key_to_index.keys() or not str(o) in wv.key_to_index.keys():
            graph.remove(t)
            no_removed+=1
    return no_removed
    
    
def get_vectors_fast(triples,entity_vec_mapping,vector_size=VECTOR_SIZE):
    # ~20-30% faster
    X = np.array(triples)
    X = word_vectors[X.flatten()].reshape(len(triples),vector_size*3)
    
    return X    

def get_vectors(triples,entity_vec_mapping,vector_size=200):
    X = np.array(triples)
    X = [(entity_vec_mapping(x[0]), entity_vec_mapping(x[1]),entity_vec_mapping(x[2])) for x in X]
    X = [np.concatenate(x) for x in X]
    X = np.vstack(X).astype(np.float64)
    
    return X

def get_1_1_dataset(graph, entities,entity_vec_mapping,corrupt='random'):
    
    original_triple_len = len(graph)
    # get triples
    X = list(graph)
    no_t = len(X)
    

    
    corrupted_triples = [get_random_corrupted_triple(x,entities,corrupt=corrupt) for x in X]
    X = X + corrupted_triples
    
    

    # convert uris to strings
    
    X = get_vectors_fast(X,entity_vec_mapping)
    
    # stack them

    Y = np.concatenate((np.ones(no_t),np.zeros(no_t))).astype(np.uint8)
    
    return X, Y

def test_sklearn_model(model,X,Y,x_test,y_test,subset=10000):
    

  
    
    ix = np.random.choice(range(len(X)),size=subset)
    
    scaler = preprocessing.StandardScaler().fit(X)
    
    X_scaled = scaler.transform(X[ix])
    model.fit(X_scaled,Y[ix])

    print(f'train_score ={model.score(scaler.transform(X),Y)}')    
    print(f'test_score ={model.score(scaler.transform(x_test),y_test)}')

def scale_and_predict(model,x):
    x = preprocessing.StandardScaler().fit_transform(x)
    return model.predict(x)

## Parse Graph

In [37]:
from rdflib import Graph, URIRef
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph




g_train = Graph()
g_val = Graph()
g_test = Graph()

g_train = g_train.parse('FB15k-237/train.nt', format='nt')
g_val   = g_val.parse('FB15k-237/valid.nt', format='nt')
g_test  = g_test.parse('FB15k-237/test.nt', format='nt')


In [38]:
from gensim.models import Word2Vec


word_vectors = Word2Vec.load('walks/model').wv

In [39]:
def map_keyed_vectors(word_vectors, iterable):
    """
    for some reason faster than native call :O
    """
    return np.array(list(word_vectors.get_vector(x) for x in iterable))

In [40]:
# pytorch model
import torchmetrics
import torch
from torch import nn 


class ClassifierSimple(torch.nn.Module):
    def __init__(self,input_dim=300,hidden_size=64):
        super(ClassifierSimple, self).__init__()
        
        
        self.layers = nn.Sequential(
                # flatten input if necessary
                nn.Flatten(),
                nn.Linear(input_dim,hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size,1)
        )
        
        self.output_activation = nn.Sigmoid()
                
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
        
    
    def forward(self,x):        
        
        return self.layers(x)
    def predict(self,x):
        x.to(self.device)
        
        return self.output_activation(self.layers(x))
    def predict_numpy(self,x):
        x = torch.tensor(x)
        x.to(self.device)
        return self.output_activation(self.layers(x)).detach().cpu().numpy()
        
    


In [41]:
from pathlib import Path
import pandas as pd
model = ClassifierSimple()    

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if Path('rdf2vecClassfier.pth').is_file():
    print('found trained model! Loading :)')
    model.load_state_dict(torch.load('rdf2vecClassfier.pth'))
    history = pd.read_csv('log.csv')
    model = model.to(device)
else:
    print('model not found. Train it with ''train_rdf2vec_classifier.ipynb')

found trained model! Loading :)


In [42]:
import time
from tqdm import tqdm
from collections import defaultdict
from itertools import chain
import pickle


In [43]:
def fast_concat(se,pe,oe):
    
    assert se.shape == pe.shape, "Error! fast_concat with differing shapes"
    assert se.shape == oe.shape, "Error! fast_concat with differing shapes"
    
    
    x = np.empty((se.shape[0],se.shape[1]*3),dtype=np.float32)
    x[:,0:100] =se
    x[:,100:200] = pe
    x[:,200:] = oe
    
    return x

In [44]:
def get_all_s_o_tuples(entities):
    return [(s,o) for s in entities for o in entities] + [(o,s) for s in entities for o in entities]
    
    

In [45]:
def test_if_row_in_array(row,arr):
    return np.any(np.sum(arr == row,axis=1) == 3)

In [46]:
def get_filter(triples,graphs):
    # Too slow ... parallelize?! What else? Faster lookup? How?
    graphs = [np.array(g) for g in graphs]

    graphs = np.concatenate(graphs,axis=0)
    
    known_ix = []
    
    for i,tp in enumerate(triples):
        if test_if_row_in_array(tp,graphs):
            known_ix.append(i)
    return known_ix
    
    

In [47]:
def map_keyed_vectors(word_vectors, iterable):
    """
    for some reason faster than native call :O
    """
    return np.array(list(word_vectors.get_vector(x) for x in iterable))

In [48]:
import time
from sklearn.utils.extmath import cartesian

def evaluate_link_pred_fast(score_f,graph,entity_vec_mapping,entities,vector_size = 100, max_triples=100, plot = False, filter_by=None,verbose = True):
    
    
    
    stats = {'preprocessing_time' : -1,
        'embeddings_time': [],
        'rank_time': [],
        'find_rank_time':[],
        'ranks':[]
        }
    
    start_timer = time.perf_counter()
    predicates = np.array(list(set(graph.predicates())))
    
    graph = np.array(graph)
    
    
    print(f"evaluate LP on graph with {len(graph)} triples, {len(entities)} entities and {len(predicates)} predicates!")
    
    print(f"Starting preprocessing")
    embeddings_scores = defaultdict(lambda: defaultdict(lambda :None))
    
    entity_array = np.array(entities)
    entity_mapping = entity_vec_mapping(np.array(entity_array))
    
    no_entitites= len(entities)
    ix = list(range(no_entitites))
    
    s_o_combinations = cartesian((ix,ix))   

    subjects_ix = s_o_combinations[:,0] # sorted(s_o_combinations[:,0])
    subject_embeddings = entity_mapping[subjects_ix]
                                 

    
    objects_ix = s_o_combinations[:,1]  #sorted(s_o_combinations[:,1])
    object_embeddings = entity_mapping[objects_ix]
    
                           
    no_triples_per_predicate=len(subjects_ix)
    
    
    
    preprocessing_timer = time.perf_counter()
    
    stats['preprocessing_time'] = preprocessing_timer - start_timer
    
    print(f"Finished preprocessing")
    
    # del is very slow :(
    # del s_o_combinations
    s_o_combinations = None
    #gc.collect()
    
    
    for p in tqdm(predicates):
        predicate_start_timer = time.perf_counter()
        
        
             
        predicate_embedding = entity_vec_mapping([p])
        predicate_column = np.repeat(predicate_embedding,no_triples_per_predicate).reshape(no_triples_per_predicate,vector_size)
        
        #return subject_embeddings,predicate_column,object_embeddings

        triple_embeddings = fast_concat(subject_embeddings,predicate_column,object_embeddings)
        
        
        

        predicate_embeddings_timer = time.perf_counter()
        
        #del predicate_column
        predicate_column = None

        

        scores = np.squeeze(score_f(triple_embeddings))
        
        #del triple_embeddings
        triple_embeddings = None
        #gc.collect()
        
        
        
        sorted_ix = np.flip(np.argsort(scores))     
        
        subjects = entity_array[subjects_ix] 
        objects = entity_array[objects_ix]
    
        scored_triples = np.stack([subjects,np.repeat(p,no_triples_per_predicate),objects]).T   
        scored_triples = scored_triples[sorted_ix]
        
        
        predicate_rank_timer = time.perf_counter()
        
        ranks = []
    

        predicate_subgraph = graph[graph[:,1] == p]
    
    
        
        for triple in predicate_subgraph:
            try:
                rank = np.where(np.sum(scored_triples ==triple,axis=1) == 3)[0][0]
                print(rank)
                ranks.append(rank)
            except:
                print(triple)
                print(np.where(np.sum(scored_triples ==triple,axis=1)))
                print('unknown entity or relation!')
                
        
        
        predicate_find_rank = time.perf_counter()
        
            
        stats['embeddings_time'].append(predicate_embeddings_timer-predicate_start_timer)
        stats['rank_time'].append(predicate_rank_timer - predicate_embeddings_timer)
        stats['find_rank_time'].append(predicate_find_rank - predicate_rank_timer)
        stats['ranks'].extend(ranks)
        
       
        

    return  stats

In [63]:
from rdflib import Graph, URIRef
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph



g_train = Graph()
g_val = Graph()
g_test = Graph()

g_train = g_train.parse('FB15k-237/train.nt', format='nt')
g_val   = g_val.parse('FB15k-237/valid.nt', format='nt')
g_test  = g_test.parse('FB15k-237/test.nt', format='nt')


# clean graphs 
# number of triples removed should be low, a few hundred
print(f"removed {clean_graph(g_train,word_vectors)} triples from training set")
print(f"removed {clean_graph(g_val,word_vectors)} triples from validation set")
print(f"removed {clean_graph(g_test,word_vectors)} triples from test set")

entities = get_entities((g_train,g_val,g_test))

removed 270 triples from training set
removed 35 triples from validation set
removed 61 triples from test set


In [64]:
def compute_rank(scores,ix,mask=None):
    if mask == None:    
        optimistic_rank =(scores > scores[ix]).sum()+1
        pessimistic_rank = (scores >= scores[ix]).sum()

        
    else:
        
        optimistic_rank = ((scores > scores[ix]).index_fill(0,mask,False)).sum()+1
        pessimistic_rank = ((scores >= scores[ix]).index_fill(0,mask,False)).sum()
        
    rank = (optimistic_rank+pessimistic_rank)*0.5
        
    return rank

def parse_rdflib_to_torch(graph):
    entities = get_entities([graph])
    
    entities=np.array(entities)
    entity_vecs= torch.tensor(word_vectors[np.array(entities)])
    entities = dict(zip(entities,range(len(entities))))
    
    predicates = np.array(list(set(graph.predicates())))
    predicate_vecs = torch.tensor(word_vectors[predicates])
    predicates = dict(zip(predicates,range(len(predicates))))
    
    edges = []
    predicate_ix = []
    for s,p,o in np.array(graph):
        try:
            edge = (entities[s],entities[o])
            edges.append(edge)
            predicate_ix.append(predicates[p])
        except:
            print(f"Unknown entities encountered! ({s},{o})")

    edges = torch.tensor(edges)
    predicate_ix = torch.tensor(predicate_ix)
    
    return edges, predicate_ix, entities, predicates,entity_vecs,predicate_vecs

def get_comb_ix(edge,no_entities):
    return edge[0]*no_entities+edge[1]

def create_mask(length,ix,reverse=False):
    if not reverse:
        mask = torch.ones(length)
        mask = mask.index_fill(0,ix,0)
    else:
        mask = torch.zeros(length)
        mask = mask.index_fill(0,ix,1)
    
    
    return mask

In [65]:
from timer import PerfTimer
from torch.utils.data import DataLoader

def eval_ranks(model,graph,filtered = True, batchsize=None, vecsize=100,force_cpu=False):
# get data from graph
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if force_cpu:
        device = torch.device('cpu')
    
    model = model.to(device)
    
    perfTimer = PerfTimer()
    
    perfTimer.start()
    
    edges, predicate_ix, entity_ix_mapping, predicate_ix_mapping,entity_vecs,predicate_vecs = parse_rdflib_to_torch(graph)
    
    ranks = []
    ix_predicate_mapping = {k:v for (v,k) in predicate_ix_mapping.items()}
    ix_entity_mapping = {k:v for (v,k) in entity_ix_mapping.items()}
    
    no_entities  = len(entity_ix_mapping.keys())
    
    perfTimer.track('preprocessing')    
    
    for pred_ix in tqdm(ix_predicate_mapping.keys()):
        

        current_edges = edges[predicate_ix == pred_ix]
        number_of_real_edges = len(current_edges)
        
        perfTimer.track('subgraph')
        # could also try torch.cross and torch.combinations if needed
        # all possible s_o_combinations
        s_o_combs = torch.tensor(cartesian((range(no_entities),range(no_entities))))
    


        edge_ix = torch.tensor([get_comb_ix(x,no_entities) for x in current_edges])
        perfTimer.track('edges')
        
        if batchsize:
            dl = DataLoader(s_o_combs, batch_size=batchsize, shuffle=False)
            perfTimer.track('dl')
            predicate_embedding = predicate_vecs[pred_ix]
            #use expand as memory of rows is shared. This may cause bugs ... investigate!
            predicate_embedding = predicate_embedding.reshape(1,vecsize).expand(batchsize,vecsize)
            perfTimer.track('expand_pediacte')
            
            scores = []

            for batch in dl:
                                #print(s_o_combs[:,0].type())
                #print(batch[:,0].type())
                subject_embeddings = entity_vecs[batch[:,0]]
                object_embeddings = entity_vecs[batch[:,1]]
                perfTimer.track('collect_embeddings_batch')
                
                if len(batch) != batchsize:
                    predicate_embedding = predicate_vecs[pred_ix]
                    predicate_embedding = predicate_embedding.reshape(1,vecsize).expand(len(batch),vecsize)
                
                to_score_embeddings = torch.hstack((subject_embeddings,predicate_embedding,object_embeddings))
                perfTimer.track('stack_batch')
                to_score_embeddings = to_score_embeddings.to(device)
                perfTimer.track('to_device')
                batch_scores = model(to_score_embeddings)
                perfTimer.track('predict_batch')
                scores.append(batch_scores)
                
            scores = torch.vstack(scores).squeeze()
            perfTimer.track('stack_all')
            
        else:
            
            subject_embeddings = entity_vecs[s_o_combs[:,0]]
            object_embeddings = entity_vecs[s_o_combs[:,1]]

            predicate_embedding = predicate_vecs[pred_ix]
            #use expand as memory of rows is shared. This may cause bugs ... investigate!
            predicate_embedding = predicate_embedding.reshape(1,100).expand(len(subject_embeddings),100)
            
            perfTimer.track('collect_embeddings')

            to_score_embeddings = torch.hstack((subject_embeddings,predicate_embedding,object_embeddings))
            perfTimer.track('stack_embeddings')

            
            to_score_embeddings = to_score_embeddings.to(device)
            perfTimer.track('to_device')
            scores = model(to_score_embeddings).squeeze()
            perfTimer.track('score_embeddings')
            
        #sorted_ix = scor

        

        
        for ix in edge_ix:
            ranks.append(compute_rank(scores,ix,edge_ix.to(device)))
        perfTimer.track('rank_embeddings')
        
    return torch.tensor(ranks), perfTimer

    

In [76]:
import gc
gc.collect()

2441

In [70]:
231.26/35.76

6.467002237136465

In [71]:
len(entities)**2/10

20692822.5

In [77]:
with torch.no_grad():
    ranks, pt = eval_ranks(model,g_train,batchsize=20692822,force_cpu = True)
    print(ranks.mean())
    print((1/ranks).mean())
gc.collect()
#pt.stats()

  0%|          | 1/237 [59:00<232:07:35, 3540.91s/it]


KeyboardInterrupt: 

In [None]:
# tested on cpu. full graph, batched.
{key: sum(value) for key,value in pt.stats().items()}

In [61]:
# tested on cpu
{key: sum(value) for key,value in pt.stats().items()}

{'preprocessing': 0.054919965798035264,
 'subgraph': 0.6861647220794111,
 'edges': 15.424857352627441,
 'collect_embeddings': 22.007430262397975,
 'stack_embeddings': 43.36002808366902,
 'to_device': 0.026266970904543996,
 'score_embeddings': 50.359028440434486,
 'rank_embeddings': 184.76945111202076}