In [1]:
import pandas as pd
import rdflib

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

from pprint import pprint

SEED = 42

# Note: this is needed for reproducibility. Makes the 'random' processes within this notebook deterministic
%env PYTHONHASHSEED=$SEED

env: PYTHONHASHSEED=42


In [29]:
data = pd.read_csv("taxonomy_ranks/rank_0.csv", sep=",",)
entities = [entity[1][0] for entity in data[data.columns[1:]].iterrows()]
print(entities)

['http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C20181', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C1908', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C17828', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C22188', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C14250', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C97325', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C12218', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C43431', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C3910', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C28428', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C20189', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C16612', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C20047', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C7057', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C26548', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C12913', 'http://ncicb.nci.nih.gov/

KeyError: 0

In [2]:
# Read a CSV file containing the entities we want to classify.
tax_and_sub = rdflib.Graph()
tax_and_sub.parse("./data/tax_NCIT.ttl")

nodes_result = list(tax_and_sub.query(
        'SELECT DISTINCT ?s WHERE { ?s ?p ?o. }'
        ))
nodes = [n[0].toPython() for n in nodes_result]
nodes.append('http://www.w3.org/2002/07/owl#Thing') # This it the top node, which is not retrieved by the query




In [7]:
nodes[:5]

['http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C19213',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C12692',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C49304',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C154328',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C168768']

In [None]:
# Define our knowledge graph (here: DBPedia SPARQL endpoint).
knowledge_graph = KG(
    "./data/tax_and_subset_NCIT.ttl",
)
# Create our transformer, setting the embedding & walking strategy.
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10, workers=1),
    walkers=[RandomWalker(4, 10, with_reverse=True, n_jobs=6, random_state=SEED)],
    # verbose=1
)
# Get our embeddings.
embeddings, literals = transformer.fit_transform(knowledge_graph, nodes)
pprint(embeddings)
# [
#     array([ 1.5737595e-04,  1.1333118e-03, -2.9838676e-04,  ..., -5.3064007e-04,
#             4.3192197e-04,  1.4529384e-03], dtype=float32),
#     array([-5.9027621e-04,  6.1689125e-04, -1.1987977e-03,  ...,  1.1066757e-03,
#            -1.0603866e-05,  6.6087965e-04], dtype=float32),
#     array([ 7.9996325e-04,  7.2907173e-04, -1.9482171e-04,  ...,  5.6251377e-04,
#             4.1435464e-04,  1.4478950e-04], dtype=float32)
# ]


In [19]:
embeddings[:5]

array([[-2.48261187e-02,  2.68637924e-03,  8.18032101e-02,
         9.50704589e-02,  2.15978641e-02, -6.57694694e-03,
         2.78740022e-02,  4.60822880e-02, -8.37882757e-02,
         3.74557450e-02, -5.25668077e-02, -7.88679123e-02,
         4.59200926e-02,  5.90843409e-02, -3.26629952e-02,
        -2.96652783e-02,  5.52065531e-03,  4.68672626e-02,
        -4.40943316e-02,  8.22621211e-03,  6.92985728e-02,
        -1.87141616e-02,  4.10387293e-02,  3.19273546e-02,
        -1.20328879e-02,  2.85324026e-02,  4.48823615e-04,
         1.04119927e-02, -9.69174318e-03, -4.92214411e-02,
         6.86163176e-03, -4.59534712e-02, -7.97690731e-03,
         4.89961216e-03, -6.04640618e-02,  9.43439305e-02,
         1.01081446e-01, -3.84250805e-02, -3.54930311e-02,
         5.91934733e-02,  4.64259796e-02,  9.06899348e-02,
        -1.11045443e-01, -1.05426960e-01, -3.43118235e-02,
         5.55916503e-02, -6.30977936e-03, -1.50623396e-02,
         3.11171450e-02,  1.58575960e-02, -2.13227924e-0

In [4]:
df = pd.DataFrame(embeddings)

In [5]:
df.to_pickle('./data/embeddings/test2.pkl')

In [None]:
# Define our knowledge graph (here: DBPedia SPARQL endpoint).
knowledge_graph = KG(
    "./data/tax_and_subset_NCIT.ttl",
)
# Create our transformer, setting the embedding & walking strategy.
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[RandomWalker(4, 10, with_reverse=False, n_jobs=3)],
    # verbose=1
)
# Get our embeddings.
embeddings, literals = transformer.fit_transform(knowledge_graph, nodes)
pprint(embeddings)
# [
#     array([ 1.5737595e-04,  1.1333118e-03, -2.9838676e-04,  ..., -5.3064007e-04,
#             4.3192197e-04,  1.4529384e-03], dtype=float32),
#     array([-5.9027621e-04,  6.1689125e-04, -1.1987977e-03,  ...,  1.1066757e-03,
#            -1.0603866e-05,  6.6087965e-04], dtype=float32),
#     array([ 7.9996325e-04,  7.2907173e-04, -1.9482171e-04,  ...,  5.6251377e-04,
#             4.1435464e-04,  1.4478950e-04], dtype=float32)
# ]

pprint(literals)

In [32]:
len(embeddings[0])

100