In [11]:
import sys
# Extend the module search path with the sibling ``../code`` directory before the
# project-local imports below (presumably where ``settings`` and ``utils_graph``
# live - confirm against the repository layout). The path is relative, so it
# resolves against the kernel's working directory.
sys.path.append('../code')

# NOTE(review): VECTOR_SIZE, CLASSIFIER_EPOCHS and parse_rdflib_to_torch are not
# referenced by any of the cells visible in this notebook.
from settings import VECTOR_SIZE,  CLASSIFIER_EPOCHS
from utils_graph import parse_rdflib_to_torch

In [2]:
from gensim.models import Word2Vec

# Load the saved Word2Vec model and keep only its keyed vectors (``.wv``);
# later cells index ``word_vectors`` to look up embeddings.
w2v_model_path = 'walks/model'
word_vectors = Word2Vec.load(w2v_model_path).wv


In [3]:
from pathlib import Path
import pandas as pd
from models import ClassifierSimple
import torch

# Restore the trained classifier from its checkpoint and move it to the best
# available device. Every later cell evaluates ``model``, so a missing
# checkpoint is a hard error: continuing would score (and save CSVs for) a
# randomly initialised network.
checkpoint_path = Path('rdf2vecClassfier.pth')

model = ClassifierSimple()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if not checkpoint_path.is_file():
    raise FileNotFoundError(
        'model not found. Train it with train_rdf2vec_classifier.ipynb'
    )

print('found trained model! Loading :)')
# map_location remaps the stored tensors onto ``device`` so a checkpoint that
# was saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
# Training log written alongside the checkpoint (not used by the cells shown here).
history = pd.read_csv('log.csv')
model = model.to(device)
# Inference mode: disables dropout / batch-norm updates if the model has any.
# NOTE(review): harmless if eval_model already does this - confirm.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


found trained model! Loading :)


In [4]:
from rdflib import Graph, URIRef
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
from utils_graph import clean_graph, get_entities

# Parse the three FB15k-237 splits from their N-Triples files.
# Graph.parse returns the populated graph, so each split can be built in one step.
g_train = Graph().parse('FB15k-237/train.nt', format='nt')
g_val = Graph().parse('FB15k-237/valid.nt', format='nt')
g_test = Graph().parse('FB15k-237/test.nt', format='nt')

# Clean every split against the loaded word vectors and report how many triples
# were dropped. The count is expected to be small (a few hundred at most).
# NOTE(review): clean_graph appears to modify the graph in place, so re-running
# this cell without re-parsing would report different counts - confirm.
print(f"removed {clean_graph(g_train,word_vectors)} triples from training set")
print(f"removed {clean_graph(g_val,word_vectors)} triples from validation set")
print(f"removed {clean_graph(g_test,word_vectors)} triples from test set")

# Entities collected across all three cleaned splits.
all_splits = (g_train, g_val, g_test)
entities = get_entities(all_splits)

removed 270 triples from training set
removed 35 triples from validation set
removed 61 triples from test set


In [5]:
from utils_eval import eval_ranks, mean_rank, mean_reciprocal_rank, hitsAt

In [6]:
import pandas as pd

In [7]:
# Evaluate the classifier on the training split.
# Scoring was only tested on CPU, hence the explicit torch thread count below.
import gc
from utils_eval import eval_model

torch.set_num_threads(32)

# eval_model returns the metrics dict plus a timer object; embeddings are
# looked up by indexing the loaded word vectors.
stats, pt = eval_model(model, g_train, lambda x: word_vectors[x])

for k, v in stats.items():
    print(f"{k}\t{v}")

# Persist the training-split metrics.
train_scores = pd.DataFrame(stats)
train_scores.to_csv('rdf2vec_train_scores.csv')

# Last expression of the cell: displays the per-stage timing summary.
pt.stats_sum()

100%|██████████| 237/237 [1:07:31<00:00, 17.09s/it]


MR	[100.34046936035156]
MRR	[0.6825782656669617]
Hits@1	[0.28289833664894104]
Hits@3	[0.4160054326057434]
Hits@10	[0.5686512589454651]
MR_head	[67.42181396484375]
MRR_head	[0.7664385437965393]
Hits@1_head	[0.3181592524051666]
Hits@3_head	[0.4730563461780548]
Hits@10_head	[0.633588969707489]
MR_tail	[133.25912475585938]
MRR_tail	[0.598717987537384]
Hits@1_tail	[0.24763743579387665]
Hits@3_tail	[0.3589545488357544]
Hits@10_tail	[0.5037134885787964]


{'preprocessing': 102.14475320791826,
 'subgraph': 1.1769551234319806,
 'collect_embeddings': 0.02276387019082904,
 'copy embeddings into array': 720.8355533811264,
 'score_embeddings': 3226.3337846915238,
 'rank_embeddings': 103.31181812100112}

In [8]:
# Evaluate the classifier on the validation split; embeddings are looked up by
# indexing the loaded word vectors.
stats, pt = eval_model(model,g_val,lambda x: word_vectors[x])

for k,v in stats.items():
    print(f"{k}\t{v}")

# Persist the validation-split metrics.
pd.DataFrame(stats).to_csv('rdf2vec_val_scores.csv')

# Keep the timing summary as the cell's last expression so the notebook displays
# it, matching the training-split cell. Called mid-cell, its return value was
# silently discarded.
pt.stats_sum()

100%|██████████| 223/223 [12:56<00:00,  3.48s/it]


MR	[329.9128723144531]
MRR	[0.33454519510269165]
Hits@1	[0.1241428554058075]
Hits@3	[0.21882857382297516]
Hits@10	[0.3336285650730133]
MR_head	[185.71273803710938]
MRR_head	[0.43780481815338135]
Hits@1_head	[0.1638857126235962]
Hits@3_head	[0.29280000925064087]
Hits@10_head	[0.4277714192867279]
MR_tail	[474.1130065917969]
MRR_tail	[0.23128560185432434]
Hits@1_tail	[0.0843999981880188]
Hits@3_tail	[0.14485713839530945]
Hits@10_tail	[0.2394857108592987]


In [9]:
# Evaluate the classifier on the test split; embeddings are looked up by
# indexing the loaded word vectors.
stats, pt = eval_model(model,g_test,lambda x: word_vectors[x])

for k,v in stats.items():
    print(f"{k}\t{v}")

# Persist the test-split metrics.
pd.DataFrame(stats).to_csv('rdf2vec_test_scores.csv')

# Keep the timing summary as the cell's last expression so the notebook displays
# it, matching the training-split cell. Called mid-cell, its return value was
# silently discarded.
pt.stats_sum()

100%|██████████| 224/224 [13:52<00:00,  3.72s/it]


MR	[344.34423828125]
MRR	[0.339802086353302]
Hits@1	[0.12793433666229248]
Hits@3	[0.2197255641222]
Hits@10	[0.3298211097717285]
MR_head	[196.85157775878906]
MRR_head	[0.46707841753959656]
Hits@1_head	[0.18142612278461456]
Hits@3_head	[0.3021318316459656]
Hits@10_head	[0.42822837829589844]
MR_tail	[491.8369140625]
MRR_tail	[0.21252577006816864]
Hits@1_tail	[0.0744425356388092]
Hits@3_tail	[0.13731928169727325]
Hits@10_tail	[0.23141387104988098]


In [10]:
# Display the metrics dict from the most recent eval_model call
# (the test split when the cells are run in order).
stats

{'MR': [344.34423828125],
 'MRR': [0.339802086353302],
 'Hits@1': [0.12793433666229248],
 'Hits@3': [0.2197255641222],
 'Hits@10': [0.3298211097717285],
 'MR_head': [196.85157775878906],
 'MRR_head': [0.46707841753959656],
 'Hits@1_head': [0.18142612278461456],
 'Hits@3_head': [0.3021318316459656],
 'Hits@10_head': [0.42822837829589844],
 'MR_tail': [491.8369140625],
 'MRR_tail': [0.21252577006816864],
 'Hits@1_tail': [0.0744425356388092],
 'Hits@3_tail': [0.13731928169727325],
 'Hits@10_tail': [0.23141387104988098]}