In [None]:
!wget https://huggingface.co/datasets/VLyb/WN18RR
!wget https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip



In [2]:
!unzip /content/FB15K-237.2.zip -d /content/FB15K-237

Archive:  /content/FB15K-237.2.zip
  inflating: /content/FB15K-237/Release/MSR-LA_Data_Full Rights_FB15K-237 Knowledge Base Completion Dataset (2650).docx  
  inflating: /content/FB15K-237/Release/README.txt  
  inflating: /content/FB15K-237/Release/test.txt  
  inflating: /content/FB15K-237/Release/text_cvsc.txt  
  inflating: /content/FB15K-237/Release/text_emnlp.txt  
  inflating: /content/FB15K-237/Release/train.txt  
  inflating: /content/FB15K-237/Release/valid.txt  


In [6]:
fb15kPathTrain = "/content/FB15K-237/Release/train.txt"
fb15kPathTest = "/content/FB15K-237/Release/test.txt"
fb15kPathVal = "/content/FB15K-237/Release/valid.txt"

#Complex

In [None]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.1.0+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt


def visualize_graph(G, color):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos=nx.spring_layout(G, seed=42), with_labels=False,
                     node_color=color, cmap="Set2")
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    h = h.detach().cpu().numpy()
    plt.scatter(h[:, 0], h[:, 1], s=140, c=color, cmap="Set2")
    if epoch is not None and loss is not None:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

In [None]:
from torch_geometric.datasets import FB15k_237
from torch_geometric.nn import ComplEx, DistMult, RotatE, TransE
import torch.optim as optim

import os

device = 'cuda' if torch.cuda.is_available() else 'cpu'
path = os.path.join('data', 'FB15k')


In [None]:
train_data = FB15k_237(path, split='train')[0].to(device)
val_data = FB15k_237(path, split='val')[0].to(device)
test_data = FB15k_237(path, split='test')[0].to(device)

model = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations=train_data.num_edge_types,
    hidden_channels=50,
).to(device)

loader = model.loader(
    head_index=train_data.edge_index[0],
    rel_type=train_data.edge_type,
    tail_index=train_data.edge_index[1],
    batch_size=1000,
    shuffle=True,
)

optimizer = optim.Adagrad(model.parameters(), lr=0.001, weight_decay=1e-6)


Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/train.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/valid.txt
Downloading https://raw.githubusercontent.com/villmow/datasets_knowledge_embedding/master/FB15k-237/test.txt
Processing...
Done!


In [None]:
def train():
    model.train()
    total_loss = total_examples = 0
    for head_index, rel_type, tail_index in loader:
        optimizer.zero_grad()
        loss = model.loss(head_index, rel_type, tail_index)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * head_index.numel()
        total_examples += head_index.numel()
    return total_loss / total_examples

@torch.no_grad()
def test(data):
    model.eval()
    return model.test(
        head_index=data.edge_index[0],
        rel_type=data.edge_type,
        tail_index=data.edge_index[1],
        batch_size=20000,
        k=10,
    )


In [None]:
for epoch in range(1, 501):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
    # if epoch % 25 == 0:
    #     rank, mrr, hits = test(val_data)
    #     print(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, '
    #           f'Val MRR: {mrr:.4f}, Val Hits@10: {hits:.4f}')

rank, mrr, hits_at_10 = test(test_data)
print(f'Test Mean Rank: {rank:.2f}, Test MRR: {mrr:.4f}, '
      f'Test Hits@10: {hits_at_10:.4f}')

#Transformers

In [3]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchtext.vocab import build_vocab_from_iterator
import torch.optim as optim
import torch.nn.functional as F


In [80]:
class Transformer(nn.Module):
  def __init__(self, vocab_size:int, input_dim:int, num_heads:int, num_encoder_layers: int, ff_dim:int, dropout: float):
    super().__init__()
    # self.posEncoder = None # Check without pos encoding for now
    self.embedding = nn.Embedding(vocab_size, input_dim)
    encoderLayer = TransformerEncoderLayer(input_dim, num_heads, ff_dim, dropout, )
    self.encoder = TransformerEncoder(encoderLayer, num_encoder_layers)
    self.maskedGen = nn.Linear(input_dim, vocab_size)
    self.score = nn.Linear(input_dim, 1)

  def forward(self, x, mask, method):
    embeds = self.embedding(x)
    if(method == 0):
      return self.score(self.encoder(embeds))
    else:
      return self.maskedGen(self.encoder(embeds))

def read_data(path):
  return pd.read_csv(path, delimiter="\t", names=["s", "r", "o"])

class CustomDataset(Dataset):
  def __init__(self, dataPath, vocab, transform=None, method = 1):

    self.dataset = read_data(dataPath)
    self.transform = transform
    self.vocab = vocab
    self.method = method

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    point = self.dataset.iloc[idx]
    sample = None
    if(self.method == 1): # masked generation

      sample = {
          "x": torch.tensor(vocab(["<cls>", point["s"], "<sep1>" ,point["r"],"<sep2>" ,"<mask>", "<end>"])),
          "y":  F.one_hot(torch.tensor(vocab([point["o"]])[0]), len(vocab))
      }
    if(self.transform):
      sample = self.transform(sample)
    return sample


In [82]:
config2 = {
    "train": fb15kPathTrain,
    "test": fb15kPathTest,
    "val": fb15kPathVal,
}

config = config2

def yieldTokens(data_iter):
  for i,row in data_iter.iterrows():
    yield [row["s"], row["r"], row["o"]]

def yieldSubjectsObjects(data_iter, vocab):
  for i,row in data_iter.iterrows():
    yield vocab([row["s"], row["o"]])

data_iter = read_data(config["train"])
vocab = build_vocab_from_iterator(yieldTokens(data_iter), specials=["<cls>", "<sep1>", "<sep2>", "<mask>", "<end>", "<unk>"])
vocab.set_default_index(vocab['<unk>'])
subObj = yieldSubjectsObjects(data_iter)
subjects = subObj[0]
objects = subObj[1]

In [83]:
train_dataset = CustomDataset(config["train"],vocab)
val_dataset = CustomDataset(config["val"],vocab)
test_dataset = CustomDataset(config["test"],vocab)
train_dataloader = DataLoader(train_dataset, batch_size = 16, num_workers=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = 16, num_workers=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = 16, num_workers=2, shuffle=True)

In [84]:

model = Transformer(len(vocab), 32, 2, 2, 32, 0)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.001, weight_decay=1e-6)




In [86]:
def train(model, loader, method):
  model.train()
  total_loss = total_examples = 0
  for batchIndex, data in enumerate(loader):
    x = data["x"]
    y = data["y"]
    out = model(x, None, method)
    loss = criterion(out, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += float(loss.item())
    total_examples += 1

    if(batchIndex%50 == 0):
      print(f"epoch {epoch} | {batchIndex}/{len(loader)} | loss: {loss.item()}")
        # print(f"epoch {epoch} | {total_examples}/{len(loader)} | loss: {total_loss / total_examples}")
  # return total_loss / total_examples

@torch.no_grad()
def val(model,loader):
  model.eval()
  total_loss = total_examples = 0
  for batchIndex, data in enumerate(loader):
    x = data["x"]
    y = data["y"]
    out = model(x, None, method)
    loss = criterion(out, y)

    total_loss += float(loss.item())
    total_examples += 1
    if(batchIndex%50 == 0):
      print(f"val epoch {epoch} | {batchIndex}/{len(loader)} | loss: {loss.item()}")
  return total_loss


In [87]:
epochs = 3
method = 1
for epoch in range(epochs):
  train(model, train_dataloader, method)
  val(model, val_dataloader)
  print('-' * 89)


epoch 0 | 0/17008 | loss: 2.0697970390319824
epoch 0 | 50/17008 | loss: 1.4791767597198486
epoch 0 | 100/17008 | loss: 1.1832345724105835
epoch 0 | 150/17008 | loss: 0.9659128189086914
epoch 0 | 200/17008 | loss: 0.8294748663902283
epoch 0 | 250/17008 | loss: 0.7142818570137024
epoch 0 | 300/17008 | loss: 0.629608154296875
epoch 0 | 350/17008 | loss: 0.5630539059638977
epoch 0 | 400/17008 | loss: 0.5139343738555908
epoch 0 | 450/17008 | loss: 0.4645530581474304
epoch 0 | 500/17008 | loss: 0.4176269471645355
epoch 0 | 550/17008 | loss: 0.38573575019836426
epoch 0 | 600/17008 | loss: 0.3586159646511078
epoch 0 | 650/17008 | loss: 0.32957229018211365
epoch 0 | 700/17008 | loss: 0.3071689009666443
epoch 0 | 750/17008 | loss: 0.2885400950908661
epoch 0 | 800/17008 | loss: 0.271639347076416
epoch 0 | 850/17008 | loss: 0.2541411817073822
epoch 0 | 900/17008 | loss: 0.2382342666387558
epoch 0 | 950/17008 | loss: 0.22699777781963348
epoch 0 | 1000/17008 | loss: 0.21522490680217743
epoch 0 | 105

KeyboardInterrupt: 

In [None]:
  # emb size mismatch
# train loop
# loss maybe

# method 0

def hits():
  for batch in test_dataloader:
