In [1]:
!pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2
[0m

## Generate Dataset
This class creates the dataset from a list of file links. A variation of this class is used in all the notebooks in order to create the dataset.

In [2]:
import rdflib
import numpy as np
import random


class TriplesDataset:
    """Link-prediction dataset assembled from one or more RDF sources.

    Every triple whose object is a URI reference contributes one
    ``(subject, object)`` datapoint and one label (its predicate).

    Attributes:
        entities: sorted list of the distinct subject/object strings.
        relations: sorted list of the distinct predicate strings.
        datapoints: list of ``(head, tail)`` string tuples, in parse order.
        labels: list of predicates, aligned with ``datapoints``.
    """

    def __init__(self, url_list: "list[str]") -> None:
        """Parse every source in ``url_list`` and collect its triples.

        Args:
            url_list: iterable of paths/URLs that ``rdflib.Graph.parse`` can
                read. A single string is accepted and treated as one source.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(url_list, str):
            url_list = [url_list]

        datapoints = []
        labels = []
        entities = set()
        relations = set()

        for url in url_list:
            graph = rdflib.Graph()
            graph.parse(url)
            for subject_item, predicate, object_item in graph.triples((None, None, None)):
                # Only keep entity-to-entity links; skip literals and blank-node objects.
                if not isinstance(object_item, rdflib.term.URIRef):
                    continue

                head, tail = str(subject_item), str(object_item)

                # add them to entities and relations
                entities.add(head)
                entities.add(tail)
                relations.add(str(predicate))

                # add them to datapoints
                datapoints.append((head, tail))
                # NOTE(review): labels keep the rdflib term as parsed, whereas
                # `relations` holds plain strings -- confirm what consumers expect.
                labels.append(predicate)

        # Sort so that the index assigned to every entity/relation by
        # construct_hash() is identical on every run; the iteration order of a
        # set of strings changes between interpreter sessions.
        self.entities = sorted(entities)
        self.relations = sorted(relations)
        self.datapoints = datapoints
        self.labels = labels

        print(f"Entites : {len(self.entities)}")
        print(f"Datapoints shape : {len(self.datapoints)}")
        print(f"Labels = {len(self.labels)}")

    def generate_negative_samples(self, count):
        """Draw ``count`` distinct ``(head, tail)`` pairs that are not datapoints.

        Pairs are built from two different entities picked uniformly at random
        and are rejected when they already appear in ``self.datapoints``.

        Args:
            count: number of negative pairs to return.

        Returns:
            A list of ``count`` unique ``(head, tail)`` tuples, in the order
            in which they were drawn.

        Raises:
            ValueError: if ``count`` is negative or larger than the number of
                unlinked ordered entity pairs (the sampling loop could never
                finish in that case).
        """
        if count < 0:
            raise ValueError(f"count must be non-negative, got {count}")
        if count == 0:
            return []

        links_set = set(self.datapoints)
        entities = self.entities
        entities_count = len(entities)

        # Number of ordered pairs of distinct entities that are already linked.
        known_entities = set(entities)
        linked_pairs = sum(
            1
            for head, tail in links_set
            if head != tail and head in known_entities and tail in known_entities
        )
        available = entities_count * (entities_count - 1) - linked_pairs
        if count > available:
            raise ValueError(
                f"Cannot draw {count} negative samples: "
                f"only {available} unlinked ordered entity pairs exist"
            )

        # A dict is used as an insertion-ordered set so that, for a fixed
        # random seed, the returned list is the same on every run.
        negative_samples = {}
        while len(negative_samples) < count:
            head_index, tail_index = random.sample(range(entities_count), 2)
            candidate = (entities[head_index], entities[tail_index])
            if candidate not in links_set:
                negative_samples[candidate] = None
        return list(negative_samples)

    def construct_hash(self):
        """Return ``(entity -> index, relation -> index)`` lookup dictionaries.

        Indices follow the positions in ``self.entities`` / ``self.relations``.
        """
        entity_hash = {entity: index for index, entity in enumerate(self.entities)}
        relation_hash = {relation: index for index, relation in enumerate(self.relations)}
        return entity_hash, relation_hash

# Create model
Create the base model that will be used to evaluate all the embeddings. The exact model does not matter: as long as the model is kept fixed, we expect better embeddings (those that capture the semantics of the underlying data better) to perform better. Here we use a classifier to perform link prediction, that is, given the embeddings of two nodes, the model tries to predict whether some link exists between them. We use the simplest possible model that can make such a prediction, so that the baseline stays as low as possible.

In [3]:
import torch
import torch.nn as nn

# Define the model
class Classifier(nn.Module):
    """One linear layer followed by a sigmoid.

    Maps an input of size ``dimension`` to ``outputs`` values in (0, 1).
    """

    def __init__(self, dimension, outputs):
        super().__init__()
        self.fc = nn.Linear(in_features=dimension, out_features=outputs)

    def forward(self, x):
        """Return the sigmoid of the linear projection of ``x``."""
        logits = self.fc(x)
        return torch.sigmoid(logits)

In [4]:
import torch
import torch.nn as nn

class TransE(nn.Module):
    """TransE knowledge-graph embedding model with a margin ranking loss.

    A triple ``(head, relation, tail)`` is scored by the Euclidean norm of
    ``head + relation - tail``; smaller means more plausible.

    Args:
        num_entities: number of rows in the entity embedding table.
        num_relations: number of rows in the relation embedding table.
        embedding_dim: size of every embedding vector.
        hashing: pair ``(entity -> index, relation -> index)`` of dictionaries
            used to translate triple members into embedding rows.
        margin: margin of the hinge loss computed in ``forward``.
    """

    def __init__(self, num_entities, num_relations, embedding_dim, hashing, margin):
        super().__init__()
        self.num_entities = num_entities
        self.num_relations = num_relations
        self.embedding_dim = embedding_dim
        self.entity_hash = hashing[0]
        self.relation_hash = hashing[1]
        self.margin = margin

        # define entity and relation embeddings
        self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)

        # initialize embeddings
        nn.init.xavier_uniform_(self.entity_embeddings.weight.data)
        nn.init.xavier_uniform_(self.relation_embeddings.weight.data)

    def _calculate_loss(self, triples):
        """Return the TransE distance of every triple as a 1-D tensor.

        Args:
            triples: iterable of ``(head, relation, tail)`` keys that are
                present in the entity/relation hashes.

        Returns:
            Tensor of shape ``(len(triples),)``. It stays attached to the
            autograd graph so that a loss built from it can update the
            embedding tables.
        """
        triples = list(triples)
        # Build the indices on the same device as the embedding tables.
        device = self.entity_embeddings.weight.device
        if not triples:
            return torch.zeros(0, device=device)

        heads, relations, tails = zip(*triples)
        head_index = torch.tensor(
            [self.entity_hash[head] for head in heads], dtype=torch.long, device=device
        )
        relation_index = torch.tensor(
            [self.relation_hash[relation] for relation in relations],
            dtype=torch.long,
            device=device,
        )
        tail_index = torch.tensor(
            [self.entity_hash[tail] for tail in tails], dtype=torch.long, device=device
        )

        translated = (
            self.entity_embeddings(head_index)
            + self.relation_embeddings(relation_index)
            - self.entity_embeddings(tail_index)
        )
        # One Euclidean norm per triple.
        return torch.linalg.norm(translated, dim=-1)

    def forward(self, positive_triples, negative_triples):
        """Compute the mean margin ranking loss.

        Args:
            positive_triples: three parallel sequences
                ``(heads, predicates, tails)`` of the true triples.
            negative_triples: iterable of corrupted ``(head, relation, tail)``
                triples, one per positive triple.

        Returns:
            Scalar tensor ``mean(max(0, margin + d(positive) - d(negative)))``
            that is differentiable with respect to both embedding tables.
        """
        heads, predicates, tails = positive_triples
        positive_score = self._calculate_loss(zip(heads, predicates, tails))
        negative_score = self._calculate_loss(negative_triples)

        # Hinge: only pairs that violate the margin contribute to the loss.
        return torch.clamp(self.margin + positive_score - negative_score, min=0).mean()

## Training and evaluation loop

In [5]:
import torch.optim as optim

# Define the training loop
def train(model, criterion, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    The model is switched to training mode; every batch is moved to
    ``device``, scored with ``criterion`` and used for one optimizer step.
    """
    model.train()

    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Drop the gradients left over from the previous batch.
        optimizer.zero_grad()

        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

In [6]:
def evaluate(model, val_loader):
    """Return the classification accuracy of ``model`` on ``val_loader``.

    Outputs greater than or equal to 0.5 count as the positive class.

    Args:
        model: module that maps a batch of inputs to probabilities.
        val_loader: iterable of ``(inputs, labels)`` batches that exposes the
            underlying data as ``val_loader.dataset``.

    Returns:
        0-dim float tensor (on the CPU) holding the fraction of correct
        predictions.
    """
    model.eval()

    # Run on whatever device the model lives on, so that evaluation also
    # works after the model has been moved to the GPU.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")

    correct = torch.zeros((), dtype=torch.float32)
    with torch.no_grad():
        for test_input, label in val_loader:
            test_input, label = test_input.to(device), label.to(device)
            output = model(test_input)  # use the model to make predictions on the test data
            # Threshold without re-wrapping a tensor in torch.tensor(), which
            # emits a copy-construct UserWarning on every batch.
            predictions = (output >= 0.5).to(label.dtype)
            correct += torch.sum(predictions == label).cpu()
    return correct / len(val_loader.dataset)

In [7]:
def embed(embeddings, datapoint, indicator):
    """Return the concatenated head and tail embeddings of ``datapoint``.

    With ``indicator == "graph_walk"`` the vectors are read by indexing
    ``embeddings`` with the entity itself. Otherwise ``embeddings`` must
    expose ``entity_hash`` and ``entity_embeddings``, and the vectors are
    looked up by entity index and converted to NumPy arrays.
    """
    head, tail = datapoint

    if indicator == "graph_walk":
        return np.concatenate([embeddings[head], embeddings[tail]])

    head_id = embeddings.entity_hash[head]
    tail_id = embeddings.entity_hash[tail]
    lookup = embeddings.entity_embeddings
    vectors = [
        lookup(torch.tensor(entity_id)).detach().numpy()
        for entity_id in (head_id, tail_id)
    ]
    return np.concatenate(vectors)

In [8]:
# Set the random seed for every generator the notebook draws from: negative
# sampling uses Python's `random`, the train/validation split and the loaders
# use PyTorch. torch.manual_seed stays last so the cell output is unchanged.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7878d1ee12d0>

In [9]:
from torch.utils.data import TensorDataset, DataLoader, random_split

def generate_loaders(dataset, embeddings, indicator, batch_size=128, train_fraction=0.8):
    """Build train/validation loaders for the link-prediction classifier.

    Every datapoint of ``dataset`` is a positive example (label 1); an equal
    number of negative pairs (label 0) is drawn with
    ``dataset.generate_negative_samples``. Each pair is turned into a feature
    vector with ``embed``.

    Args:
        dataset: object exposing ``datapoints`` and
            ``generate_negative_samples``.
        embeddings: embedding source forwarded to ``embed``.
        indicator: embedding kind forwarded to ``embed``.
        batch_size: batch size of both loaders.
        train_fraction: share of the samples used for training; the rest is
            used for validation.

    Returns:
        ``(train_loader, val_loader)``.
    """
    # Define the training data
    positive_samples = dataset.datapoints
    negative_samples = dataset.generate_negative_samples(len(positive_samples))
    all_samples = positive_samples + negative_samples
    print("All samples size : ", len(all_samples))

    features = np.array([embed(embeddings, sample, indicator) for sample in all_samples])
    x_train = torch.tensor(features)
    print("Training dataset size", x_train.shape)

    # Labels follow the order of `all_samples`: positives first, then negatives.
    # They are built directly as float tensors, so nothing is re-wrapped in
    # torch.tensor() (which warns when handed an existing tensor).
    y_train = torch.cat([
        torch.ones(len(positive_samples)),
        torch.zeros(len(negative_samples)),
    ]).unsqueeze(1)
    print("Training labels size : ", y_train.shape)

    full_dataset = TensorDataset(x_train, y_train)

    # Define the sizes of the train and validation sets
    train_size = int(train_fraction * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # make dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, val_loader

In [10]:
def train_and_evaluate(dataset, embeddings, embeddings_length, indicator, max_epochs=100, patience=5):
    """Train the link-prediction classifier on ``embeddings`` and report accuracy.

    A fresh ``Classifier`` is trained with binary cross-entropy and Adam.
    Validation accuracy is printed after every epoch, the best weights are
    saved to ``model_state_{indicator}.pth``, and training stops early once
    the validation accuracy has not improved for ``patience`` epochs in a row.

    Args:
        dataset: dataset forwarded to ``generate_loaders``.
        embeddings: embedding source forwarded to ``generate_loaders``.
        embeddings_length: size of a single entity embedding; the classifier
            input is twice that (head and tail concatenated).
        indicator: embedding kind; also used in the checkpoint file name.
        max_epochs: upper bound on the number of training epochs.
        patience: number of consecutive non-improving epochs tolerated.
    """
    # Define the input and output dimensions
    input_dimension = 2 * embeddings_length
    output_dimension = 1

    # Define the model, criterion, optimizer, and device
    model = Classifier(input_dimension, output_dimension)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the device
    model.to(device)

    # generate data loaders
    train_loader, val_loader = generate_loaders(dataset, embeddings, indicator)

    # Train the model
    best_valid_accuracy = 0
    epochs_without_improvement = 0
    for epoch in range(max_epochs):
        train(model, criterion, optimizer, train_loader, device)
        val_accuracy = evaluate(model, val_loader)
        print(f"Epoch {epoch+1} completed Validation accuracy : {int(val_accuracy * 1e4) / 1e2}")

        # Save the model
        if val_accuracy > best_valid_accuracy:
            best_valid_accuracy = val_accuracy
            epochs_without_improvement = 0
            torch.save(model.state_dict(), f"model_state_{indicator}.pth")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == patience:
                # `epoch` is 0-based, so epoch + 1 epochs have been completed.
                print(f'Early stopping after {epoch + 1} epochs')
                break
    print(f"Best Validation Accuracy : {best_valid_accuracy}")
    

# Create dataset

In [11]:
# `.nt` files from the Kaggle `bgs-dataset` input; all of them are parsed into
# a single TriplesDataset, which prints its entity/datapoint/label counts.
dataFilePath = [
    "/kaggle/input/bgs-dataset/625KGeologyMap_all.nt",
    "/kaggle/input/bgs-dataset/dataholdings.nt",
    "/kaggle/input/bgs-dataset/earth-material-class.nt",
    "/kaggle/input/bgs-dataset/geochronology.nt",
    "/kaggle/input/bgs-dataset/lexicon-named-rock-unit.nt"
]
dataset = TriplesDataset(dataFilePath)

Entites : 44740
Datapoints shape : 297285
Labels = 297285


## Evaluate random embeddings
The TransE model starts with random embeddings for each entity. By using these raw, untrained embeddings, we can effectively test how well the classifier model works on random embeddings and thereby set a baseline for the trained embeddings.

In [12]:
# An untrained TransE model: its freshly initialised entity table provides the
# random-embedding baseline.
embedding_dim, margin = 100, 0.5
num_entities, num_relations = len(dataset.entities), len(dataset.relations)
hashing = dataset.construct_hash()
randomEmbeddings = TransE(
    num_entities=num_entities,
    num_relations=num_relations,
    embedding_dim=embedding_dim,
    hashing=hashing,
    margin=margin,
)

In [13]:
# Baseline: classifier trained on the untrained (random) TransE embeddings.
train_and_evaluate(
    dataset=dataset,
    embeddings=randomEmbeddings,
    embeddings_length=embedding_dim,
    indicator="random",
)

All samples size :  594570
Training dataset size torch.Size([594570, 200])
Training labels size :  torch.Size([594570, 1])


  


Epoch 1 completed Validation accuracy : 65.59
Epoch 2 completed Validation accuracy : 65.98
Epoch 3 completed Validation accuracy : 66.37
Epoch 4 completed Validation accuracy : 67.24
Epoch 5 completed Validation accuracy : 68.15
Epoch 6 completed Validation accuracy : 68.87
Epoch 7 completed Validation accuracy : 69.35
Epoch 8 completed Validation accuracy : 69.72
Epoch 9 completed Validation accuracy : 69.87
Epoch 10 completed Validation accuracy : 70.03
Epoch 11 completed Validation accuracy : 70.18
Epoch 12 completed Validation accuracy : 70.33
Epoch 13 completed Validation accuracy : 70.43
Epoch 14 completed Validation accuracy : 70.6
Epoch 15 completed Validation accuracy : 70.6
Epoch 16 completed Validation accuracy : 70.54
Epoch 17 completed Validation accuracy : 70.59
Epoch 18 completed Validation accuracy : 70.65
Epoch 19 completed Validation accuracy : 70.68
Epoch 20 completed Validation accuracy : 70.72
Epoch 21 completed Validation accuracy : 70.79
Epoch 22 completed Valid

## Evaluate the TransE Embeddings

In [14]:
# Rebuild a TransE model with the same table sizes as the checkpoint and load
# the trained weights into it.
# NOTE(review): the checkpoint rows only line up with the right entities if
# `hashing` assigns the same indices as the notebook that trained it -- verify.
num_entities = len(dataset.entities)
num_relations = len(dataset.relations)
embedding_dim = 100
hashing = dataset.construct_hash()
margin = 0.5
transeEmbeddings = TransE(num_entities, num_relations, embedding_dim, hashing, margin)
# map_location="cpu": `embed` converts the vectors to NumPy, which requires CPU
# tensors, and the load must also work on a kernel without a GPU.
state_dict = torch.load(
    "/kaggle/input/knowledgegraphembeddings/transeEmbeddings.pth",
    map_location="cpu",
)
transeEmbeddings.load_state_dict(state_dict)

<All keys matched successfully>

In [15]:
# Classifier trained on the embeddings loaded from the TransE checkpoint.
train_and_evaluate(
    dataset=dataset,
    embeddings=transeEmbeddings,
    embeddings_length=embedding_dim,
    indicator="transE",
)

All samples size :  594570
Training dataset size torch.Size([594570, 200])
Training labels size :  torch.Size([594570, 1])


  


Epoch 1 completed Validation accuracy : 67.01
Epoch 2 completed Validation accuracy : 68.23
Epoch 3 completed Validation accuracy : 69.03
Epoch 4 completed Validation accuracy : 69.29
Epoch 5 completed Validation accuracy : 69.6
Epoch 6 completed Validation accuracy : 69.91
Epoch 7 completed Validation accuracy : 70.17
Epoch 8 completed Validation accuracy : 70.3
Epoch 9 completed Validation accuracy : 70.55
Epoch 10 completed Validation accuracy : 70.76
Epoch 11 completed Validation accuracy : 70.97
Epoch 12 completed Validation accuracy : 71.03
Epoch 13 completed Validation accuracy : 71.27
Epoch 14 completed Validation accuracy : 71.39
Epoch 15 completed Validation accuracy : 71.5
Epoch 16 completed Validation accuracy : 71.56
Epoch 17 completed Validation accuracy : 71.55
Epoch 18 completed Validation accuracy : 71.62
Epoch 19 completed Validation accuracy : 71.72
Epoch 20 completed Validation accuracy : 71.75
Epoch 21 completed Validation accuracy : 71.82
Epoch 22 completed Valida

## Evaluate the Rdf2Vec Embeddings

In [16]:
from gensim.models import KeyedVectors

# Node embeddings stored in binary word2vec format; the resulting KeyedVectors
# object is indexed by entity in `embed` when the indicator is "graph_walk".
# The file name suggests 100-dimensional vectors -- presumably; the width
# actually used is read from `embeddings.vectors` in the next cell.
filePath = '/kaggle/input/knowledgegraphembeddings/nodeEmbeddings100.bin'
embeddings = KeyedVectors.load_word2vec_format(filePath, binary=True)

In [17]:
# Classifier trained on the vectors loaded with gensim; the embedding width is
# taken from the loaded matrix itself.
train_and_evaluate(
    dataset=dataset,
    embeddings=embeddings,
    embeddings_length=embeddings.vectors.shape[1],
    indicator="graph_walk",
)

All samples size :  594570
Training dataset size torch.Size([594570, 200])
Training labels size :  torch.Size([594570, 1])


  


Epoch 1 completed Validation accuracy : 92.44
Epoch 2 completed Validation accuracy : 92.64
Epoch 3 completed Validation accuracy : 92.73
Epoch 4 completed Validation accuracy : 92.76
Epoch 5 completed Validation accuracy : 92.8
Epoch 6 completed Validation accuracy : 92.74
Epoch 7 completed Validation accuracy : 92.84
Epoch 8 completed Validation accuracy : 92.88
Epoch 9 completed Validation accuracy : 92.86
Epoch 10 completed Validation accuracy : 92.86
Epoch 11 completed Validation accuracy : 92.85
Epoch 12 completed Validation accuracy : 92.9
Epoch 13 completed Validation accuracy : 92.89
Epoch 14 completed Validation accuracy : 92.82
Epoch 15 completed Validation accuracy : 92.9
Epoch 16 completed Validation accuracy : 92.88
Epoch 17 completed Validation accuracy : 92.89
Early stopping after 16 epochs
Best Validation Accuracy : 0.9290663599967957
