In [1]:
!pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2
[0m

In [2]:
import rdflib
import numpy as np
import random


class TriplesDataset:
    def __init__(self, url_list: str) -> None:
        datapoints = []
        labels = []
        entities = set()
        relations = set()
        
        for url in url_list:
            graph_variable = rdflib.Graph()
            resultGraph = graph_variable.parse(url)
            for subject_item, predicate, object_item in resultGraph.triples((None, None, None)):
                if type(object_item) != rdflib.term.URIRef:
                    continue
                
                # add them to entities and relations
                entities.add(str(subject_item))
                entities.add(str(object_item))
                relations.add(str(predicate))
                
                # add them to datapoints
                datapoints.append(
                    tuple([str(subject_item), str(predicate), str(object_item)])
                )
                labels.append(predicate)
        
        self.entities = list(entities)
        self.relations = list(relations)
        self.datapoints = datapoints
        
    def construct_hash(self):
        entity_hash = dict()
        for index, entity in enumerate(self.entities):
            entity_hash[entity] = index
        
        relation_hash = dict()
        for index, relation in enumerate(self.relations):
            relation_hash[relation] = index
        return entity_hash, relation_hash

In [3]:
data_file_path = [
    "/kaggle/input/bgs-dataset/625KGeologyMap_all.nt",
    "/kaggle/input/bgs-dataset/dataholdings.nt",
    "/kaggle/input/bgs-dataset/earth-material-class.nt",
    "/kaggle/input/bgs-dataset/geochronology.nt",
    "/kaggle/input/bgs-dataset/lexicon-named-rock-unit.nt"
]
dataset = TriplesDataset(data_file_path)

In [4]:
 def generate_negative_samples(dataset, positive_samples):
    entities = set(dataset.entities)
    entities_count = len(entities)
    links_set = set(dataset.datapoints)
    negative_samples = set()
    
    heads, predicates, tails = positive_samples
    for triple in zip(heads, predicates, tails):
        subject_item, predicate, object_item = triple
        choice = random.sample([0, 1], 1)[0]
        if choice == 0:
            while True:
                index = random.sample(range(entities_count), 1)[0]
                corrupted_triple = tuple([dataset.entities[index], predicate, object_item])
                if corrupted_triple not in links_set and corrupted_triple not in negative_samples:
                    negative_samples.add(corrupted_triple)
                    break
        else:
            while True:
                index = random.sample(range(entities_count), 1)[0]
                corrupted_triple = tuple([subject_item, predicate, dataset.entities[index]])
                if corrupted_triple not in links_set and corrupted_triple not in negative_samples:
                    negative_samples.add(corrupted_triple)
                    break
    return list(negative_samples)

In [5]:
import torch
import torch.nn as nn

class TransE(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim, hashing, margin):
        super(TransE, self).__init__()
        self.num_entities = num_entities
        self.num_relations = num_relations
        self.embedding_dim = embedding_dim
        self.entity_hash = hashing[0]
        self.relation_hash = hashing[1]
        self.margin = margin
        
        # define entity and relation embeddings
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)
        
        # initialize embeddings
        nn.init.xavier_uniform_(self.entity_embeddings.weight.data)
        nn.init.xavier_uniform_(self.relation_embeddings.weight.data)
    
    def _calculate_loss(self, triples):
        score = []
        for triple in triples:
            head, relation, tail = triple
            head_embedding = self.entity_embeddings(torch.tensor(self.entity_hash[head]))
            relation_embedding = self.relation_embeddings(torch.tensor(self.relation_hash[relation]))
            tail_embedding = self.entity_embeddings(torch.tensor(self.entity_hash[tail]))
            distance = torch.linalg.norm(head_embedding + relation_embedding - tail_embedding)
            score.append(distance)
        return torch.tensor(score)
    
    def forward(self, positive_triples, negative_triples):
        heads, predicates, tails = positive_triples
        positive_score = self._calculate_loss(zip(heads, predicates, tails))
        negative_score = self._calculate_loss(negative_triples)
#         positive_head = self.entity_embeddings(positive_triplets[:, 0])
#         positive_relation = self.relation_embeddings(positive_triplets[:, 1])
#         positive_tail = self.entity_embeddings(positive_triplets[:, 2])
#         positive_score = torch.norm(positive_head + positive_relation - positive_tail, dim=1, p=2)
        
#         negative_head = self.entity_embeddings(negative_triplets[:, 0])
#         negative_relation = self.relation_embeddings(negative_triplets[:, 1])
#         negative_tail = self.entity_embeddings(negative_triplets[:, 2])
#         negative_score = torch.norm(negative_head + negative_relation - negative_tail, dim=1, p=2)
        
        # calculate loss
        loss = torch.mean(torch.max(
            torch.zeros_like(negative_score), 
            self.margin + positive_score - negative_score
        ))
        loss.requires_grad = True
        return loss


In [6]:
def train(epochs, model, dataset, loader, optimizer, device):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())
    for epoch in range(epochs):
        running_loss = 0.0
        for positive_triples in loader:
            negative_triples = generate_negative_samples(dataset, positive_triples)
            optimizer.zero_grad()
            loss = model(positive_triples, negative_triples)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()

        epoch_loss = running_loss / len(loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

In [7]:
import numpy as np
from torch.utils.data import DataLoader

positive_samples = dataset.datapoints
loader = DataLoader(positive_samples, batch_size=4096, shuffle=True)

In [8]:
num_entities = len(dataset.entities)
num_relations = len(dataset.relations)
embedding_dim = 100
hashing = dataset.construct_hash()
margin = 1
model = TransE(num_entities, num_relations, embedding_dim, hashing, margin)


In [9]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TransE(
  (entity_embeddings): Embedding(44740, 100)
  (relation_embeddings): Embedding(22, 100)
)

In [10]:
train(50, model, dataset, loader, optimizer, device)

Epoch 1/50, Loss: 0.9998
Epoch 2/50, Loss: 0.9998
Epoch 3/50, Loss: 0.9998
Epoch 4/50, Loss: 0.9998
Epoch 5/50, Loss: 0.9998
Epoch 6/50, Loss: 0.9998
Epoch 7/50, Loss: 0.9998
Epoch 8/50, Loss: 0.9998
Epoch 9/50, Loss: 0.9998
Epoch 10/50, Loss: 0.9998
Epoch 11/50, Loss: 0.9998
Epoch 12/50, Loss: 0.9998
Epoch 13/50, Loss: 0.9998
Epoch 14/50, Loss: 0.9998
Epoch 15/50, Loss: 0.9998
Epoch 16/50, Loss: 0.9998
Epoch 17/50, Loss: 0.9998
Epoch 18/50, Loss: 0.9998
Epoch 19/50, Loss: 0.9998
Epoch 20/50, Loss: 0.9998
Epoch 21/50, Loss: 0.9998
Epoch 22/50, Loss: 0.9998
Epoch 23/50, Loss: 0.9998
Epoch 24/50, Loss: 0.9998
Epoch 25/50, Loss: 0.9998
Epoch 26/50, Loss: 0.9998
Epoch 27/50, Loss: 0.9998
Epoch 28/50, Loss: 0.9998
Epoch 29/50, Loss: 0.9998
Epoch 30/50, Loss: 0.9998
Epoch 31/50, Loss: 0.9998
Epoch 32/50, Loss: 0.9998
Epoch 33/50, Loss: 0.9998
Epoch 34/50, Loss: 0.9998
Epoch 35/50, Loss: 0.9998
Epoch 36/50, Loss: 0.9998
Epoch 37/50, Loss: 0.9998
Epoch 38/50, Loss: 0.9998
Epoch 39/50, Loss: 0.

In [11]:
torch.save(model.state_dict(), 'model_weights.pth')