In [1]:
!pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2
[0m

# Create Dataset

In [2]:
import rdflib
import numpy as np
import random


class TriplesDataset:
    def __init__(self, url_list: str) -> None:
        datapoints = []
        labels = []
        entities = set()
        relations = set()
        
        for url in url_list:
            graph_variable = rdflib.Graph()
            resultGraph = graph_variable.parse(url)
            for subject_item, predicate, object_item in resultGraph.triples((None, None, None)):
                if type(object_item) != rdflib.term.URIRef:
                    continue
                
                # add them to entities and relations
                entities.add(str(subject_item))
                entities.add(str(object_item))
                relations.add(str(predicate))
                
                # add them to datapoints
                datapoints.append(
                    tuple([str(subject_item), str(predicate), str(object_item)])
                )
                labels.append(predicate)
        
        self.entities = list(entities)
        self.relations = list(relations)
        self.datapoints = datapoints
        
    def construct_hash(self):
        entity_hash = dict()
        for index, entity in enumerate(self.entities):
            entity_hash[entity] = index
        
        relation_hash = dict()
        for index, relation in enumerate(self.relations):
            relation_hash[relation] = index
        return entity_hash, relation_hash

In [3]:
data_file_path = [
    "/kaggle/input/bgs-dataset/625KGeologyMap_all.nt",
    "/kaggle/input/bgs-dataset/dataholdings.nt",
    "/kaggle/input/bgs-dataset/earth-material-class.nt",
    "/kaggle/input/bgs-dataset/geochronology.nt",
    "/kaggle/input/bgs-dataset/lexicon-named-rock-unit.nt"
]
dataset = TriplesDataset(data_file_path)

# Generate Negative Samples for training #

In [4]:
 def generate_negative_samples(dataset, positive_samples):
    entities = set(dataset.entities)
    entities_count = len(entities)
    links_set = set(dataset.datapoints)
    negative_samples = set()
    
    heads, predicates, tails = positive_samples
    for triple in zip(heads, predicates, tails):
        subject_item, predicate, object_item = triple
        choice = random.sample([0, 1], 1)[0]
        if choice == 0:
            while True:
                index = random.sample(range(entities_count), 1)[0]
                corrupted_triple = tuple([dataset.entities[index], predicate, object_item])
                if corrupted_triple not in links_set and corrupted_triple not in negative_samples:
                    negative_samples.add(corrupted_triple)
                    break
        else:
            while True:
                index = random.sample(range(entities_count), 1)[0]
                corrupted_triple = tuple([subject_item, predicate, dataset.entities[index]])
                if corrupted_triple not in links_set and corrupted_triple not in negative_samples:
                    negative_samples.add(corrupted_triple)
                    break
    return list(negative_samples)

# TransE model

In [5]:
import torch
import torch.nn as nn

class TransE(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim, hashing, margin):
        super(TransE, self).__init__()
        self.num_entities = num_entities
        self.num_relations = num_relations
        self.embedding_dim = embedding_dim
        self.entity_hash = hashing[0]
        self.relation_hash = hashing[1]
        self.margin = margin
        
        # define entity and relation embeddings
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)
        
        # initialize embeddings
        nn.init.xavier_uniform_(self.entity_embeddings.weight.data)
        nn.init.xavier_uniform_(self.relation_embeddings.weight.data)
    
    def _calculate_loss(self, triples):
        score = []
        for triple in triples:
            head, relation, tail = triple
            head_embedding = self.entity_embeddings(torch.tensor(self.entity_hash[head]))
            relation_embedding = self.relation_embeddings(torch.tensor(self.relation_hash[relation]))
            tail_embedding = self.entity_embeddings(torch.tensor(self.entity_hash[tail]))
            distance = torch.linalg.norm(head_embedding + relation_embedding - tail_embedding)
            score.append(distance)
        return torch.tensor(score)
    
    def forward(self, positive_triples, negative_triples):
        heads, predicates, tails = positive_triples
        positive_score = self._calculate_loss(zip(heads, predicates, tails))
        negative_score = self._calculate_loss(negative_triples)
        
        # calculate loss
        loss = torch.mean(torch.max(
            torch.zeros_like(negative_score), 
            self.margin + positive_score - negative_score
        ))
        loss.requires_grad = True
        return loss


In [6]:
def train(model, optimizer, train_loader):
    running_loss = 0.0
    model.train()
    for positive_triples in train_loader:
        negative_triples = generate_negative_samples(dataset, positive_triples)
        optimizer.zero_grad()
        loss = model(positive_triples, negative_triples)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    return running_loss

In [7]:
def train_and_evaluate(model, dataset, loader, optimizer):
    optimizer = optim.Adam(model.parameters())
    
    patience = 5
    best_loss = float('inf')
    epochs_without_improvement = 0
    for epoch in range(100):
        running_loss = train(model, optimizer, loader)
        epoch_loss = running_loss / len(loader)
        
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.6f}")
        
        # Save the model
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), "transeEmbeddings.pth")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == patience:
                print(f'Early stopping after {epoch + 1} epochs')
                break

In [8]:
import numpy as np
from torch.utils.data import DataLoader, random_split

positive_samples = dataset.datapoints

# make dataloaders
loader = DataLoader(positive_samples, batch_size=4096, shuffle=True)

In [9]:
import torch.optim as optim

num_entities = len(dataset.entities)
num_relations = len(dataset.relations)
embedding_dim = 100
hashing = dataset.construct_hash()
margin = 0.4
model = TransE(num_entities, num_relations, embedding_dim, hashing, margin)
optimizer = optim.Adam(model.parameters())

In [10]:
train_and_evaluate(model, dataset, loader, optimizer)

Epoch 1, Loss: 0.400072
Epoch 2, Loss: 0.400062
Epoch 3, Loss: 0.400057
Epoch 4, Loss: 0.400082
Epoch 5, Loss: 0.400087
Epoch 6, Loss: 0.400063
Epoch 7, Loss: 0.400068
Epoch 8, Loss: 0.400052
Epoch 9, Loss: 0.400028
Epoch 10, Loss: 0.400078
Epoch 11, Loss: 0.400045
Epoch 12, Loss: 0.400071
Epoch 13, Loss: 0.400073
Epoch 14, Loss: 0.400076
Early stopping after 14 epochs
