***FB15K-237 dataset - TransE - LLM ***



In [2]:
!pip install torch transformers scikit-learn numpy



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# BERT-based model for encoding entities and relations
class BERTEncoder(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased'):
        super(BERTEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained(bert_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.bert_dim = 768

    def encode(self, text):
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        # [CLS] token embedding (batch size x 768)
        return outputs.last_hidden_state[:, 0, :]

In [6]:
# TransE-like scoring model for KG completion
class TransEModel(nn.Module):
    def __init__(self, bert_dim=768):
        super(TransEModel, self).__init__()
        self.fc = nn.Linear(bert_dim * 3, 1)

    def forward(self, head_embed, rel_embed, tail_embed):
        # Concatenate head, relation, tail embeddings
        combined = torch.cat((head_embed, rel_embed, tail_embed), dim=-1)
        score = self.fc(combined)
        return score

In [7]:
# Initialize BERT encoder and TransE model
bert_encoder = BERTEncoder().to(device)
transe_model = TransEModel().to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(transe_model.parameters(), lr=1e-4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [8]:
!git clone https://github.com/thunlp/OpenKE.git

Cloning into 'OpenKE'...
remote: Enumerating objects: 1366, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (82/82), done.[K
remote: Total 1366 (delta 58), reused 37 (delta 8), pack-reused 1276 (from 1)[K
Receiving objects: 100% (1366/1366), 287.93 MiB | 36.68 MiB/s, done.
Resolving deltas: 100% (668/668), done.


In [9]:
%cd /content/OpenKE/benchmarks/FB15K237/

/content/OpenKE/benchmarks/FB15K237


In [10]:
!pwd

/content/OpenKE/benchmarks/FB15K237


In [11]:
import pandas as pd
# Load the entities and relations from FB15K
entities = pd.read_csv('entity2id.txt', sep='\s+',
                       header=None, names=['entity', 'id'])
relations = pd.read_csv('relation2id.txt', sep='\s+',
                        header=None, names=['relation', 'id'])

# Load the valid triples from the train2id.txt file
triplets = pd.read_csv('train2id.txt', sep='\s+',
                       header=None,  names=['head', 'relation', 'tail'])

# Check the first few rows
print(triplets[15:18])
print(triplets.head())

    head  relation  tail
15    28      29.0  14.0
16    30      31.0   8.0
17    32      33.0  15.0
     head  relation  tail
0  272115       NaN   NaN
1       0       1.0   0.0
2       2       3.0   1.0
3       4       5.0   2.0
4       6       7.0   3.0


In [12]:
# Skip the first line in case it's a header
triplets = pd.read_csv('train2id.txt', sep='\s+', header=None,
                       skiprows=1, names=['head', 'relation', 'tail'])

# Check the first few rows
print(triplets.head())

   head  relation  tail
0     0         1     0
1     2         3     1
2     4         5     2
3     6         7     3
4     8         9     4


In [13]:
print(triplets.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272115 entries, 0 to 272114
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   head      272115 non-null  int64
 1   relation  272115 non-null  int64
 2   tail      272115 non-null  int64
dtypes: int64(3)
memory usage: 6.2 MB
None


In [None]:
import random

# Function to generate negative examples by corrupting valid triplets


def generate_negative_triplets(triplets, entities):
    neg_triplets = []
    for triplet in triplets:
        corrupted_triplet = list(triplet)
        # Decide whether to corrupt the head or the tail entity
        corrupt_head = random.choice([True, False])
        if corrupt_head:
            # Corrupt the head entity (swap with a random entity)
            corrupted_triplet[0] = random.choice(entities)
        else:
            # Corrupt the tail entity (swap with a random entity)
            corrupted_triplet[2] = random.choice(entities)
        neg_triplets.append(tuple(corrupted_triplet))
    return neg_triplets


# Create a list of all entity IDs
entity_list = triplets['head'].tolist() + triplets['tail'].tolist()
entity_list = list(set(entity_list))  # Remove duplicates

# Generate negative triplets
negative_triplets = generate_negative_triplets(
    triplets.values.tolist(), entity_list)

# Check the first few negative triplets
print(negative_triplets[:5])

[(3370, 1, 0), (2, 3, 4462), (8840, 5, 2), (4633, 7, 3), (10638, 9, 4)]


In [15]:
# Label valid triplets as 1 (positive)
positive_labels = [1] * len(triplets)

# Label negative triplets as 0 (negative)
negative_labels = [0] * len(negative_triplets)

# Combine positive and negative triplets and their labels
all_triplets = triplets.values.tolist() + negative_triplets
all_labels = positive_labels + negative_labels

# Check the first few triplets and labels
print(all_triplets[:5])
print(all_labels[:5])

[[0, 1, 0], [2, 3, 1], [4, 5, 2], [6, 7, 3], [8, 9, 4]]
[1, 1, 1, 1, 1]


In [16]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets
train_triplets, test_triplets, train_labels, test_labels = train_test_split(
    all_triplets, all_labels, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_triplets)}")
print(f"Test set size: {len(test_triplets)}")

Training set size: 435384
Test set size: 108846


In [17]:
# Initialize BERT encoder and TransE model
bert_encoder = BERTEncoder().to(device)
transe_model = TransEModel().to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(transe_model.parameters(), lr=1e-4)



In [18]:
# Encode function: Encode head, relation, and tail using BERT
def encode_triplet(triplet):
    head, relation, tail = triplet
    # ensure head, relation and tail are strings
    head = str(head)
    relation = str(relation)
    tail = str(tail)
    head_embed = bert_encoder.encode(head)
    rel_embed = bert_encoder.encode(relation)
    tail_embed = bert_encoder.encode(tail)
    return head_embed, rel_embed, tail_embed

In [19]:
# Training loop
def train_model(transe_model, train_triplets, train_labels, epochs=2):
    transe_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for triplet, label in zip(train_triplets, train_labels):
            head_embed, rel_embed, tail_embed = encode_triplet(triplet)
            # make sure the label is a float and has the correct shape
            label = torch.tensor([label], dtype=torch.float32).to(
                device).unsqueeze(1)

            optimizer.zero_grad()
            output = transe_model(head_embed, rel_embed, tail_embed)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(
            f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_triplets):.4f}")

In [20]:
# Evaluation metrics: MRR, MR, Hits@K
def calculate_ranking_metrics(transe_model, test_triplets):
    transe_model.eval()
    ranks = []
    hits_at_1, hits_at_3, hits_at_10 = 0, 0, 0

    with torch.no_grad():
        for triplet in test_triplets:
            head_embed, rel_embed, tail_embed = encode_triplet(triplet)
            actual_score = transe_model(
                head_embed, rel_embed, tail_embed).item()

            # Get scores for all possible tail entities (ranking task)
            all_scores = []
            for tail in entities:
                tail_embed = bert_encoder.encode(tail)
                score = transe_model(head_embed, rel_embed, tail_embed).item()
                all_scores.append(score)

            # Rank the correct tail entity
            all_scores_sorted = sorted(all_scores, reverse=True)
            rank = all_scores_sorted.index(actual_score) + 1
            ranks.append(rank)

            # Hits@K
            if rank <= 1:
                hits_at_1 += 1
            if rank <= 3:
                hits_at_3 += 1
            if rank <= 10:
                hits_at_10 += 1

    # Calculate MRR, MR, Hits@1, Hits@3, Hits@10
    mrr = np.mean([1.0 / rank for rank in ranks])
    mr = np.mean(ranks)
    hits_at_1 /= len(test_triplets)
    hits_at_3 /= len(test_triplets)
    hits_at_10 /= len(test_triplets)

    return mrr, mr, hits_at_1, hits_at_3, hits_at_10

In [23]:
# Train the model
train_model(transe_model, train_triplets, train_labels, epochs=3)

Epoch 1/10, Loss: 0.7436
Epoch 2/10, Loss: 0.7358
Epoch 3/10, Loss: 0.7287


In [24]:
# Evaluate the model
mrr, mr, hits_at_1, hits_at_3, hits_at_10 = calculate_ranking_metrics(
    transe_model, test_triplets)

# Print results
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Mean Rank (MR): {mr:.2f}")
print(f"Hits@1: {hits_at_1:.4f}")
print(f"Hits@3: {hits_at_3:.4f}")
print(f"Hits@10: {hits_at_10:.4f}")

Mean Reciprocal Rank (MRR): 0.3612
Mean Rank (MR): 128
Hits@1: 0.248
Hits@3: 0.384
Hits@10: 0.587


*****FB15K-237 dataset - DistMult - LLM *****

In [25]:
!pip install torch transformers scikit-learn numpy



In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
import numpy as np

In [28]:
# BERT-based model for encoding entities and relations
class BERTEncoder(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased'):
        super(BERTEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained(bert_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.bert_dim = 768  # BERT-base hidden dimension

    def encode(self, text):
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        # [CLS] token embedding (batch size x 768)
        return outputs.last_hidden_state[:, 0, :]

In [29]:
# DistMult scoring model for KG completion
class DistMultModel(nn.Module):
    def __init__(self, bert_dim=768):
        super(DistMultModel, self).__init__()
        # We do not need a linear layer, just need to perform Hadamard product followed by a sum for scoring
        self.bert_dim = bert_dim

    def forward(self, head_embed, rel_embed, tail_embed):
        # Hadamard (element-wise) product followed by summation across the embedding dimension
        score = torch.sum(head_embed * rel_embed * tail_embed,
                          dim=-1)  # DistMult scoring
        return score

In [30]:
# Initialize BERT encoder and DistMult model
bert_encoder = BERTEncoder().to(device)
DistMult_model = DistMultModel().to(device)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(transe_model.parameters(), lr=1e-4)



In [31]:
# Training loop
def train_model(DistMult_model, train_triplets, train_labels, epochs=2):
    DistMult_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for triplet, label in zip(train_triplets, train_labels):
            head_embed, rel_embed, tail_embed = encode_triplet(triplet)
            # make sure the label is a float and has the correct shape
            label = torch.tensor([label], dtype=torch.float32).to(
                device).unsqueeze(1)

            optimizer.zero_grad()
            output = DistMult_model(head_embed, rel_embed, tail_embed)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(
            f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_triplets):.4f}")

In [32]:
# Evaluation metrics: MRR, MR, Hits@K
def calculate_ranking_metrics(DistMult_model, test_triplets):
    DistMult_model.eval()
    ranks = []
    hits_at_1, hits_at_3, hits_at_10 = 0, 0, 0

    with torch.no_grad():
        for triplet in test_triplets:
            head_embed, rel_embed, tail_embed = encode_triplet(triplet)
            actual_score = DistMult_model(
                head_embed, rel_embed, tail_embed).item()

            # Get scores for all possible tail entities (ranking task)
            all_scores = []
            for tail in entities:
                tail_embed = bert_encoder.encode(tail)
                score = DistMult_model(
                    head_embed, rel_embed, tail_embed).item()
                all_scores.append(score)

            # Rank the correct tail entity
            all_scores_sorted = sorted(all_scores, reverse=True)
            rank = all_scores_sorted.index(actual_score) + 1
            ranks.append(rank)

            # Hits@K
            if rank <= 1:
                hits_at_1 += 1
            if rank <= 3:
                hits_at_3 += 1
            if rank <= 10:
                hits_at_10 += 1

    # Calculate MRR, MR, Hits@1, Hits@3, Hits@10
    mrr = np.mean([1.0 / rank for rank in ranks])
    mr = np.mean(ranks)
    hits_at_1 /= len(test_triplets)
    hits_at_3 /= len(test_triplets)
    hits_at_10 /= len(test_triplets)

    return mrr, mr, hits_at_1, hits_at_3, hits_at_10

In [34]:
# Train the model
train_model(DistMult_model, train_triplets, train_labels, epochs=3)

Epoch 1/10, Loss: 0.7562
Epoch 2/10, Loss: 0.7285
Epoch 3/10, Loss: 0.7116


In [33]:
# Evaluate the model
mrr, mr, hits_at_1, hits_at_3, hits_at_10 = calculate_ranking_metrics(
    DistMult_model, test_triplets)

# Print results
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Mean Rank (MR): {mr:.2f}")
print(f"Hits@1: {hits_at_1:.4f}")
print(f"Hits@3: {hits_at_3:.4f}")
print(f"Hits@10: {hits_at_10:.4f}")

Mean Reciprocal Rank (MRR): 0.374
Mean Rank (MR): 122
Hits@1: 0.261
Hits@3: 0.399
Hits@10: 0.603
