In [1]:
!pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2
[0m

In [2]:
import rdflib
import numpy as np
import random


class TriplesDataset:
    def __init__(self, url_list: str) -> None:
        datapoints = []
        labels = []
        entities = set()
        relations = set()
        
        for url in url_list:
            graph_variable = rdflib.Graph()
            resultGraph = graph_variable.parse(url)
            for subject_item, predicate, object_item in resultGraph.triples((None, None, None)):
                if type(object_item) != rdflib.term.URIRef:
                    continue
                
                # add them to entities and relations
                entities.add(str(subject_item))
                entities.add(str(object_item))
                relations.add(str(predicate))
                
                # add them to datapoints
                datapoints.append(
                    tuple([str(subject_item), str(object_item)])
                )
                labels.append(predicate)
        
        self.entities = list(entities)
        self.relations = list(relations)
        self.datapoints = datapoints
        self.labels = labels
        
        print(f"Entites : {len(self.entities)}")
        print(f"Datapoints shape : {len(self.datapoints)}")
        print(f"Labels = {len(self.labels)}")
    
    def generate_negative_samples(self, count):
        links_set = set(self.datapoints)
        entities_set = self.entities
        entities_count = len(entities_set)
        
        negative_samples = set()
        while len(negative_samples) != count:
            head_index, tail_index = random.sample(range(entities_count), 2)
            head = entities_set[head_index]
            tail = entities_set[tail_index]
            possible_sample = tuple([head, tail])
            if possible_sample not in links_set:
                negative_samples.add(possible_sample)
        return list(negative_samples)

In [3]:
dataFilePath = [
    "/kaggle/input/bgs-dataset/625KGeologyMap_all.nt",
    "/kaggle/input/bgs-dataset/dataholdings.nt",
    "/kaggle/input/bgs-dataset/earth-material-class.nt",
    "/kaggle/input/bgs-dataset/geochronology.nt",
    "/kaggle/input/bgs-dataset/lexicon-named-rock-unit.nt"
]
dataset = TriplesDataset(dataFilePath)

Entites : 44740
Datapoints shape : 297285
Labels = 297285


In [4]:
from gensim.models import KeyedVectors

filePath = '/kaggle/input/knowledgegraphembeddings/nodeEmbeddings100.bin'
embeddings = KeyedVectors.load_word2vec_format(filePath, binary=True)

In [5]:
import torch
import torch.nn as nn

# Define the model
class Classifier(nn.Module):
    def __init__(self, dimension, outputs):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(dimension, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, outputs)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [6]:
import torch.optim as optim

# Define the training loop
def train(model, criterion, optimizer, train_loader, device):
    # Set the model to training mode
    model.train()

    # Loop over the training data
    for data, target in train_loader:
        # Move the data to the device
        data, target = data.to(device), target.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Compute the loss
        loss = criterion(output, target)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

In [7]:
def embed(embeddings, datapoint):
    head, tail = datapoint
    head_embedding = embeddings[head]
    tail_embedding = embeddings[tail]
    return np.concatenate([head_embedding, tail_embedding])

In [8]:
# Set the random seed
torch.manual_seed(0)

# Define the training data
positive_samples = dataset.datapoints
negative_samples = dataset.generate_negative_samples(len(positive_samples))
all_samples = positive_samples + negative_samples
print(len(all_samples))
x = map(lambda sample : embed(embeddings, sample), all_samples)
x_train = torch.tensor(np.array(list(x)))

print(x_train.shape)


594570
torch.Size([594570, 200])


In [9]:
torch_labels = torch.tensor([1 for label in dataset.labels])
positive_labels = torch.ones_like(torch_labels)
negative_labels = torch.zeros_like(torch_labels)
all_labels = torch.cat([positive_labels, negative_labels])
y_train = torch.tensor(all_labels).float().unsqueeze(1)

print(y_train.shape)


torch.Size([594570, 1])


  """


In [10]:
from torch.utils.data import TensorDataset, DataLoader, random_split

dataset = torch.utils.data.TensorDataset(x_train, y_train)

# Define the sizes of the train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# make dataloaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [11]:
# Define the input and output dimensions
input_dimension = 2 * embeddings.vectors.shape[1]
output_dimension = 1

# Define the model, criterion, optimizer, and device
model = Classifier(input_dimension, output_dimension)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
model.to(device)

# Train the model
for epoch in range(50):
    train(model, criterion, optimizer, train_loader, device)
    print(f"Epoch {epoch+1} completed")

# Save the model
torch.save(model.state_dict(), "model.pth")


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed
Epoch 10 completed
Epoch 11 completed
Epoch 12 completed
Epoch 13 completed
Epoch 14 completed
Epoch 15 completed
Epoch 16 completed
Epoch 17 completed
Epoch 18 completed
Epoch 19 completed
Epoch 20 completed
Epoch 21 completed
Epoch 22 completed
Epoch 23 completed
Epoch 24 completed
Epoch 25 completed
Epoch 26 completed
Epoch 27 completed
Epoch 28 completed
Epoch 29 completed
Epoch 30 completed
Epoch 31 completed
Epoch 32 completed
Epoch 33 completed
Epoch 34 completed
Epoch 35 completed
Epoch 36 completed
Epoch 37 completed
Epoch 38 completed
Epoch 39 completed
Epoch 40 completed
Epoch 41 completed
Epoch 42 completed
Epoch 43 completed
Epoch 44 completed
Epoch 45 completed
Epoch 46 completed
Epoch 47 completed
Epoch 48 completed
Epoch 49 completed
Epoch 50 completed


In [12]:
# Load the saved model
state_dict = torch.load(
    "/kaggle/working/model.pth",
    map_location=device
)
model = Classifier(input_dimension, output_dimension)
model.load_state_dict(state_dict)

# Evaluate the model on some test data
with torch.no_grad():
    val_accuracy = 0.0
    for test_input, label in val_loader:
        output = model(test_input)  # use the model to make predictions on the test data
        predictions = torch.tensor(output >= 0.5, dtype=float)
        val_accuracy += torch.sum(predictions == label)
    val_accuracy /= len(val_loader.dataset)
print(val_accuracy)

  


tensor(0.9858)
