Preparing an environment, importing required libs

In [1]:
!pip install "tensorflow-gpu>=1.15.2,<2.0" ampligraph

Collecting numpy<1.19.0,>=1.16.0
  Using cached numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.6
[31mERROR: Cannot uninstall numpy 1.21.6, RECORD file not found. You might be able to recover from this via: 'pip install --force-reinstall --no-deps numpy==1.21.6'.[0m
You should consider upgrading via the '/home/dell/f1-knowledge-base/F1-knowledge-base/graph-embeddings/venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import ampligraph

print(tf.version.VERSION)
ampligraph.__version__

1.15.5


'1.4.0'

At first, we have to create pandas dataframe from triples extracted from ontology populated with individuals


In [3]:
from rdflib import Graph, URIRef


ONTOLOGY_IRI = "https://github.com/RogoGit/F1-knowledge-base/f1-ontology"
ONTOLOGY_PREFIX = "f1"
POPULATED_ONTOLOGY_PATH  = '../ontology-with-individuals.owl'


f1_graph = Graph().parse(POPULATED_ONTOLOGY_PATH, format="turtle")
triples_list = []

for subject, predicate, triple_object in f1_graph.triples((None, None, None)):
    if predicate.startswith(URIRef(ONTOLOGY_IRI)):
        triples_list.append([ent.replace(ONTOLOGY_IRI + "#", ONTOLOGY_PREFIX + ":") for ent in [subject, predicate, triple_object]])

f1_df = pd.DataFrame(triples_list, columns = ['Subject', 'Predicate', 'Object'])
print(f1_df)


                                                  Subject  \
0       f1:qualifying_result_1994_san_marino_grand_pri...   
1          f1:race_result_2010_british_grand_prix_rosberg   
2                                    f1:driver_gene_force   
3          f1:race_result_1982_italian_grand_prix_angelis   
4            f1:race_result_2004_british_grand_prix_klien   
...                                                   ...   
362467      f1:race_result_2020_turkish_grand_prix_norris   
362468         f1:race_result_1974_monaco_grand_prix_ickx   
362469     f1:race_result_2019_french_grand_prix_hamilton   
362470      f1:race_result_1971_canadian_grand_prix_galli   
362471    f1:race_result_1974_canadian_grand_prix_wietzes   

                            Predicate                              Object  
0                   f1:driverPosition                                  13  
1                             f1:grid                                   5  
2          f1:hasDriverStandingResult  

Next step is to create train and test samples for graph embedding training

In [4]:
from ampligraph.evaluation import train_test_split_no_unseen 

X_train, X_test = train_test_split_no_unseen(np.array(triples_list), test_size=0.10, seed=0)

print('Train set size: ', X_train.shape)
print('Test set size: ', X_test.shape)

Train set size:  (326225, 3)
Test set size:  (36247, 3)


Now it is time to define ComplEx model and train model with train sample

In [6]:
import tensorflow.contrib
from ampligraph.latent_features import ComplEx, save_model

model = ComplEx(batches_count=100,
                epochs=300,
                k=100,
                eta=20,
                optimizer='adam',
                optimizer_params={'lr':1e-4},
                loss='multiclass_nll',
                regularizer='LP',
                regularizer_params={'p':3, 'lambda':1e-5},
                seed=0,
                verbose=True)

model.fit(X_train)
save_model(model, './embedding_model.pkl')

Average ComplEx Loss:   0.110975: 100%|██████████| 300/300 [3:23:58<00:00, 40.79s/epoch]  


In order to use already trained model we can run:

In [5]:
from ampligraph.latent_features import restore_model

model = restore_model('./embedding_model.pkl')

Next step is to ensure the model can be trained and evaluated correctly. The first of these is defining the filter that will be used to ensure that no negative statements generated by the corruption procedure are actually positives.

In [None]:
from ampligraph.evaluation import evaluate_performance

filter_triples = np.concatenate((X_train, X_test))
ranks = evaluate_performance(X_test,
                             model=model,
                             filter_triples=filter_triples,
                             use_default_protocol=True,
                             verbose=True)

Now let's use mrr_score (mean reciprocal rank) and hits_at_n_score functions

In [7]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

mr = mr_score(ranks)
mrr = mrr_score(ranks)

print("MRR: %.2f" % mrr)
print("MR: %.2f" % mr)

hits_10 = hits_at_n_score(ranks, n=10)
print("Hits@10: %.2f" % hits_10)
hits_3 = hits_at_n_score(ranks, n=3)
print("Hits@3: %.2f" % hits_3)
hits_1 = hits_at_n_score(ranks, n=1)
print("Hits@1: %.2f" % hits_1)

MRR: 0.83
MR: 388.64
Hits@10: 0.93
Hits@3: 0.88
Hits@1: 0.76
