In [1]:
!pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2
[0m

In [2]:
import rdflib
from collections import deque


class RDFGraph:
    def __init__(self, urlList: str) -> None:
        self.entities = set()
        self.relations = set()
        self.neighbours = dict()
        for url in urlList:
            graphVariable = rdflib.Graph()
            resultGraph = graphVariable.parse(url)
            for subjectItem, predicate, objectItem in resultGraph.triples((None, None, None)):
                if type(objectItem) != rdflib.term.URIRef:
                    continue
                self.entities.add(subjectItem)
                self.relations.add(predicate)
                self.entities.add(objectItem)
                existingNeighbours = self.neighbours.get(subjectItem, None)
                neighbour = tuple([predicate, objectItem])
                if existingNeighbours is None:
                    self.neighbours[subjectItem] = set([neighbour])
                else:
                    self.neighbours[subjectItem].add(neighbour)

    def _bfs(self, startNode, maxDepth) -> list:
        visited = set()
        sequences = []
        for reverseSequence in self._bfs_recursive(startNode, visited, maxDepth):
            sequences.append(list(reversed(reverseSequence)))
        return sequences
        
    
    def _bfs_recursive(self, startNode, visited, maxDepth) -> list:
        visited.add(startNode)
        neighbours = self.neighbours.get(startNode, None)
        if maxDepth == 0 or neighbours is None:
            return [[startNode]]
        
        sequences = []
        for neighbour in neighbours:
            neighbourRelation, neighbourNode = neighbour
            if neighbourNode not in visited:
                partialSequences = self._bfs_recursive(neighbourNode, visited, maxDepth - 1)
                for partialSequence in partialSequences:
                    sequences.append(partialSequence + [neighbourRelation, startNode])
        return sequences

    def generateSequences(self, maxDepth: int, entitiesCount: int) -> list:
        sequences = []
        for entity in self.entities:
            sequenceEntity = self._bfs(entity, maxDepth)
            sequences.extend(sequenceEntity)
            entitiesCount -= 1
            if entitiesCount == 0:
                break
        self.sequences = sequences
        return sequences


In [3]:
filePath = [
    "/kaggle/input/bgs-dataset/625KGeologyMap_all.nt",
    "/kaggle/input/bgs-dataset/dataholdings.nt",
    "/kaggle/input/bgs-dataset/earth-material-class.nt",
    "/kaggle/input/bgs-dataset/geochronology.nt",
    "/kaggle/input/bgs-dataset/lexicon-named-rock-unit.nt"
]
graph = RDFGraph(filePath)

In [4]:
maxDepth = 5  # Depth of bfs search
maxEntities = len(graph.entities)
sequences = graph.generateSequences(maxDepth, maxEntities)

In [5]:
import numpy as np
np.random.seed(42)

print("Total Entities : ", len(graph.entities))
print("Total Relations : ", len(graph.relations))

totalPaths = len(sequences)
print("Total paths : ", totalPaths)
print("A random path")

randomIndex = np.random.randint(totalPaths)
for node in sequences[randomIndex]:
    print(node)

Total Entities :  44740
Total Relations :  22
Total paths :  6288246
A random path
http://data.bgs.ac.uk/id/625KGeologyMap/Unit/bgsn_digmap20111213000021047_625k
http://data.bgs.ac.uk/ref/625KGeologyMap/hasLexicon
http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/ARGY
http://www.w3.org/2004/02/skos/core#broader
http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/DALN
http://www.w3.org/2004/02/skos/core#narrower
http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/GRAM
http://www.w3.org/2004/02/skos/core#narrower
http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/GRGS
http://www.w3.org/2004/02/skos/core#narrower
http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/EILDE


In [6]:
from gensim.models import Word2Vec

# Set the parameters for the Word2Vec model
size = 100  # Dimensionality of the word vectors
window = 5  # Context window size
min_count = 1  # Minimum frequency of words to consider
workers = 16  # Number of worker threads to use during training

model = Word2Vec(
    sequences, vector_size=size, window=window, min_count=min_count, workers=workers
)

In [7]:
wordVectors = model.wv
vocabulory = list(wordVectors.index_to_key)
vocabulorySize = len(vocabulory)
print("Entities embedded : ", vocabulorySize)
wordVectors.save_word2vec_format('embeddings.bin', binary=True)

Entities embedded :  44762


In [8]:
vocabulorySet = set(vocabulory)
entitySet = set(graph.entities)
vocabulorySet - entitySet

{rdflib.term.URIRef('http://data.bgs.ac.uk/ref/625KGeologyMap/hasEarthMaterialClass'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/625KGeologyMap/hasLexicon'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/625KGeologyMap/hasRank'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasBroaderPredominantAge'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasLateralEquivalent'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasObsoleteEquivalent'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasOldestAge'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasPredominantAge'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasRockUnitRank'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/hasYoungestAge'),
 rdflib.term.URIRef('http://data.bgs.ac.uk/ref/Lexicon/isObsoleteEquivalentOf'),
 rdflib.term.URIRef('http://purl.org/dc/terms/creator'),
 rdflib.term.URIRef('http://purl.org/dc/terms/publisher'),
 rdflib.term.URIRef('http://www

In [9]:
randomIndex = np.random.randint(vocabulorySize)
entity = vocabulory[randomIndex]
print(entity)
print(wordVectors[entity])

http://data.bgs.ac.uk/id/EarthMaterialClass/RockName/MTFMUD
[ 0.05051223 -0.00269511  0.05014386  0.05552666  0.03134386 -0.00833599
  0.02676786  0.08093793  0.00064853 -0.05209844 -0.01720972 -0.01213085
  0.02983971  0.02463203 -0.07036081  0.00903369 -0.0922478  -0.02551756
 -0.05268109 -0.04348169 -0.00510987  0.02686541 -0.00180825  0.02355583
 -0.01883297  0.0142932   0.04022423 -0.00457735 -0.03259301 -0.01344212
  0.02388262 -0.02227396 -0.01604591 -0.01576144  0.04005769  0.05544152
 -0.00677864 -0.0499577  -0.13268031  0.01241753  0.04449821 -0.05538702
 -0.04407227  0.03115947  0.09876963  0.03627842  0.00419507  0.00235987
  0.0238176   0.03689527 -0.00019717 -0.02697445 -0.05858924 -0.00171807
  0.05674632  0.00892013 -0.06397377  0.01252981 -0.10085972 -0.03364418
  0.03748882  0.03596101 -0.00332245 -0.02342508 -0.00198667  0.04544734
 -0.02647015  0.01009224  0.06952563  0.03690182  0.07049017  0.04660826
  0.00770841  0.00265805 -0.00318021  0.00313425  0.04185511 -0.