In [None]:
%%capture
# Download the raw benchmark files; the %%capture cell magic silences wget's output.
# NOTE(review): the first URL is the dataset's landing page on the Hugging Face hub,
# so wget presumably stores an HTML page rather than the WN18RR triples — confirm.
!wget https://huggingface.co/datasets/VLyb/WN18RR
!wget https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip


In [None]:
# Extract the FB15K-237 archive into /content/FB15K-237.
!unzip /content/FB15K-237.2.zip -d /content/FB15K-237

In [None]:
# Tab-separated FB15K-237 split files inside the extracted archive.
fb15kPathTrain, fb15kPathTest, fb15kPathVal = (
    "/content/FB15K-237/Release/train.txt",
    "/content/FB15K-237/Release/test.txt",
    "/content/FB15K-237/Release/valid.txt",
)

# ComplEx

In [None]:
import os
import torch
# Export the installed torch version so the shell commands below can expand
# ${TORCH} inside the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# Install torch-scatter and torch-sparse from the wheel index selected by ${TORCH},
# then PyTorch Geometric itself straight from its GitHub repository.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt


def visualize_graph(G, color):
    """Draw ``G`` on a 7x7 figure with a seeded spring layout.

    Args:
        G: Graph handed to ``nx.spring_layout`` and ``nx.draw_networkx``.
        color: Forwarded as ``node_color`` (mapped through the "Set2" colormap).
    """
    plt.figure(figsize=(7, 7))
    # Hide both axes' ticks; only the graph itself is of interest.
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    # A fixed seed keeps the layout identical between runs.
    layout = nx.spring_layout(G, seed=42)
    nx.draw_networkx(
        G,
        pos=layout,
        with_labels=False,
        node_color=color,
        cmap="Set2",
    )
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    """Scatter-plot the first two columns of the embedding tensor ``h``.

    Args:
        h: Tensor; it is detached, moved to the CPU and converted to NumPy.
        color: Forwarded as the scatter ``c`` argument ("Set2" colormap).
        epoch: Optional epoch number for the caption.
        loss: Optional loss object exposing ``.item()`` for the caption.

    The caption is only drawn when both ``epoch`` and ``loss`` are given.
    """
    plt.figure(figsize=(7, 7))
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    points = h.detach().cpu().numpy()
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=140, c=color, cmap="Set2")
    has_caption = not (epoch is None or loss is None)
    if has_caption:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

In [None]:
from torch_geometric.datasets import FB15k_237, WordNet18RR
from torch_geometric.nn import ComplEx, DistMult, RotatE, TransE
from torch_geometric.transforms import RandomNodeSplit
import torch.optim as optim

import os

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Root directory passed to the FB15k_237 dataset constructor.
path = os.path.join('data', 'FB15k')


In [None]:
# Load the three FB15k-237 splits (first graph of each) and move them to `device`.
train_data, val_data, test_data = (
    FB15k_237(path, split=split_name)[0].to(device)
    for split_name in ('train', 'val', 'test')
)

# wordnet = WordNet18RR("./wordnet").to(device)
# train_data, val_data, test_data = RandomNodeSplit(
#         split= 'train_rest'
#     )(wordnet)

# ComplEx embedding model sized from the training graph.
model = ComplEx(
    num_nodes=train_data.num_nodes,
    num_relations=train_data.num_edge_types,
    hidden_channels=50,
)
model = model.to(device)

# Mini-batch loader over the training triples (head, relation, tail).
loader = model.loader(
    head_index=train_data.edge_index[0],
    rel_type=train_data.edge_type,
    tail_index=train_data.edge_index[1],
    batch_size=1000,
    shuffle=True,
)

optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-6,
)


In [None]:
def train():
    """Run one pass over the notebook-level ``loader`` and return the mean loss.

    Uses the notebook-level ``model`` and ``optimizer``. Each batch loss is
    weighted by the number of triples in the batch before averaging.
    """
    model.train()
    loss_sum = 0
    triples_seen = 0
    for heads, relations, tails in loader:
        optimizer.zero_grad()
        batch_loss = model.loss(heads, relations, tails)
        batch_loss.backward()
        optimizer.step()
        batch_triples = heads.numel()
        loss_sum += float(batch_loss) * batch_triples
        triples_seen += batch_triples
    return loss_sum / triples_seen

@torch.no_grad()
def test(data, k = 10):
    """Evaluate the notebook-level ``model`` on the triples stored in ``data``.

    Args:
        data: Object exposing ``edge_index`` (row 0 = heads, row 1 = tails)
            and ``edge_type``.
        k: Cut-off forwarded to ``model.test``.

    Returns:
        Whatever ``model.test`` returns; the callers in this notebook unpack
        it as ``(mean rank, MRR, Hits@k)``.
    """
    model.eval()
    heads = data.edge_index[0]
    tails = data.edge_index[1]
    evaluation = model.test(
        head_index=heads,
        rel_type=data.edge_type,
        tail_index=tails,
        batch_size=20000,
        k=k,
    )
    return evaluation


In [None]:
# Train for 500 epochs and report validation metrics every 25th epoch.
for epoch in range(1, 501):
    loss = train()
    if epoch % 25 != 0:
        continue
    rank, mrr, hits = test(val_data)
    print(
        f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, '
        f'Val MRR: {mrr:.4f}, Val Hits@10: {hits:.4f}'
    )



In [None]:
# Final ComplEx evaluation on the test split with k=10.
rank, mrr, hits_at_10 = test(data=test_data, k=10)
print(
    f'Test Mean Rank: {rank:.2f}, Test MRR: {mrr:.4f}, '
    f'Test Hits@10: {hits_at_10:.4f}'
)

In [None]:
# Same evaluation on the test split with k=1.
rank, mrr, hits_at_1 = test(data=test_data, k=1)
print(
    f'Test Mean Rank: {rank:.2f}, Test MRR: {mrr:.4f}, '
    f'Test Hits@1: {hits_at_1:.4f}'
)

# Transformers

In [None]:
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchtext.vocab import build_vocab_from_iterator
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np


In [None]:
# Assumption: a full contrastive loss over every entity was not what the
# assignment asked for, because it is extremely slow. Negatives are therefore
# drawn from a random sample of triples. An alternative is to project the
# relevant embedding (cls/mask) to vocabulary space, as in MLM/BERT.
def getInbatchNegative(batch, center, dataset=None, vocab=None, numSamples=1000):
  """Build negative token sequences for ``center`` by swapping in sampled objects.

  Args:
    batch: Unused; kept so existing call sites keep working.
    center: Mapping with keys ``"s"``, ``"r"`` and ``"o"`` (the positive triple).
    dataset: DataFrame with an ``"o"`` column to draw candidate objects from.
    vocab: Callable mapping a list of tokens to a list of integer ids.
    numSamples: Maximum number of rows to sample from ``dataset``.

  Returns:
    Tensor of shape ``[num_negatives, 7]`` holding the encoded sequences
    ``<cls> s <sep1> r <sep2> o' <end>`` for every sampled object ``o'`` that
    differs from ``center["o"]``. The first dimension is 0 when no negative
    was found.

  Raises:
    ValueError: If ``dataset`` or ``vocab`` is missing.
  """
  if dataset is None or vocab is None:
    raise ValueError("getInbatchNegative requires both `dataset` and `vocab`")

  # Never ask for more rows than exist; sampling without replacement would fail.
  sample = dataset.sample(n=min(numSamples, len(dataset)))

  # Iterate the object column: iterating a DataFrame directly yields column names.
  negatives = [
      torch.tensor(vocab(["<cls>", center["s"], "<sep1>", center["r"], "<sep2>", obj, "<end>"]))
      for obj in sample["o"]
      if obj != center["o"]
  ]
  if not negatives:
    return torch.empty((0, 7), dtype=torch.long)
  # torch.tensor cannot assemble a list of 1-D tensors; stack builds the 2-D batch.
  return torch.stack(negatives)


class Transformer(nn.Module):
  """Transformer encoder over token-id sequences with two linear output heads.

  Inputs are batch-first: ``x`` has shape ``[batch, seq_len]``. ``forward``
  returns logits over the vocabulary, taken either from the first position
  (``method == 0``, via ``self.score``) or from the second-to-last position
  (any other ``method``, via ``self.maskedGen``).
  """

  def __init__(self, vocab_size:int, input_dim:int, num_heads:int, num_encoder_layers: int, ff_dim:int, dropout: float):
    """
    Arguments:
        vocab_size: number of embedding rows and size of both output heads
        input_dim: embedding / model width
        num_heads: attention heads per encoder layer
        num_encoder_layers: number of stacked encoder layers
        ff_dim: feed-forward width inside each encoder layer
        dropout: dropout probability for the encoder layers and the positional encoder
    """
    super().__init__()
    self.input_dim = input_dim
    # Pass the configured dropout on, so that ``dropout=0`` disables it here too.
    self.pos_encoder = PositionalEncoding(input_dim, dropout)
    self.embedding = nn.Embedding(vocab_size, input_dim)
    # The embeddings are [batch, seq_len, dim]; without batch_first=True the
    # encoder would treat the batch axis as the sequence and attend across samples.
    encoderLayer = TransformerEncoderLayer(input_dim, num_heads, ff_dim, dropout, batch_first=True)
    self.encoder = TransformerEncoder(encoderLayer, num_encoder_layers)
    self.maskedGen = nn.Linear(input_dim, vocab_size)
    self.score = nn.Linear(input_dim, vocab_size)
    self.init_weights()

  def init_weights(self) -> None:
    """Initialise the embedding and both heads uniformly in [-0.1, 0.1], with zero biases."""
    initrange = 0.1
    self.embedding.weight.data.uniform_(-initrange, initrange)
    for head in (self.score, self.maskedGen):
      head.bias.data.zero_()
      head.weight.data.uniform_(-initrange, initrange)

  def forward(self, x, mask, method):
    """
    Arguments:
        x: token ids, shape ``[batch_size, seq_len]``
        mask: currently ignored; kept for interface compatibility
        method: ``0`` selects the first position and the ``score`` head,
            anything else selects position ``-2`` and the ``maskedGen`` head

    Returns:
        Tensor of shape ``[batch_size, vocab_size]``.
    """
    embeds = self.pos_encoder(self.embedding(x) * np.sqrt(self.input_dim))
    contextualEmbeds = self.encoder(embeds)
    if method == 0:
      return self.score(contextualEmbeds[:, 0])
    return self.maskedGen(contextualEmbeds[:, -2])


class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to batch-first embeddings."""

  def __init__(self, input_dim: int, dropout: float = 0.1, max_len: int = 7):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    position = torch.arange(max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, input_dim, 2) * (-np.log(10000.0) / input_dim))
    pe = torch.zeros(max_len, input_dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    # For an odd input_dim there is one cosine column fewer than sine columns.
    pe[:, 1::2] = torch.cos(position * div_term[: input_dim // 2])
    self.register_buffer('pe', pe)

  def forward(self, x: Tensor) -> Tensor:
    """
    Arguments:
        x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``

    Raises:
        ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(0):
      raise ValueError(f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}")
    # Slice by sequence length (dim 1); the [seq_len, dim] table broadcasts over the batch.
    x = x + self.pe[:seq_len]
    return self.dropout(x)


In [None]:
def read_data(path):
  """Read a headerless tab-separated triple file into a DataFrame with columns s, r, o."""
  column_names = ["s", "r", "o"]
  triples = pd.read_csv(path, sep="\t", names=column_names)
  return triples

class CustomDataset(Dataset):
  """Torch dataset that encodes (s, r, o) rows of a DataFrame as 7-token sequences.

  Each item is a dict with
    ``"x"``: ids of ``<cls> s <sep1> r <sep2> OBJ <end>``, where ``OBJ`` is
             ``<mask>`` when ``method == 1`` and the row's object otherwise;
    ``"y"``: float one-hot vector (length ``len(vocab)``) of the row's object.
  """

  def __init__(self, dataset, vocab, transform=None, method = 1):
    """
    Arguments:
        dataset: DataFrame with columns ``s``, ``r`` and ``o``
        vocab: callable mapping a list of tokens to ids; must support ``len()``
        transform: optional callable applied to every produced sample dict
        method: ``1`` masks the object in ``x``; any other value keeps it
    """
    self.dataset = dataset
    self.transform = transform
    self.vocab = vocab
    self.method = method

  def __len__(self):
    return len(self.dataset)

  def sample(self, numSamples):
    """Return up to ``numSamples`` randomly chosen rows of the underlying DataFrame."""
    # Clamp to the frame size: sampling without replacement cannot return more
    # rows than exist (the previous ``frac`` form raised in that case).
    count = min(int(round(numSamples)), len(self.dataset))
    return self.dataset.sample(n=count)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    point = self.dataset.iloc[idx]

    # Use the vocabulary this dataset was built with, not a notebook global.
    vocab = self.vocab
    objectToken = "<mask>" if self.method == 1 else point["o"]
    tokens = ["<cls>", point["s"], "<sep1>", point["r"], "<sep2>", objectToken, "<end>"]
    sample = {
        "x": torch.tensor(vocab(tokens)),
        "y": F.one_hot(torch.tensor(vocab([point["o"]])[0]), len(vocab)).float(),
    }

    if self.transform:
      sample = self.transform(sample)
    return sample

In [None]:
# Split name -> path of the corresponding FB15K-237 triple file.
config2 = dict(
    train=fb15kPathTrain,
    test=fb15kPathTest,
    val=fb15kPathVal,
)

# Active configuration used by the cells below.
config = config2

def yieldTokens(data_iter):
  """Yield ``[s, r, o]`` for every row of the DataFrame ``data_iter``.

  The columns are zipped directly instead of using ``iterrows()``, which
  builds a Series per row and is slow on a full training split.
  """
  for s, r, o in zip(data_iter["s"], data_iter["r"], data_iter["o"]):
    yield [s, r, o]

def yieldSubjectsObjects(data_iter, vocab):
  """Yield ``vocab([s, o])`` for every row of the DataFrame ``data_iter``.

  ``vocab`` is any callable that maps a list of tokens to their ids. The
  columns are zipped directly instead of using the slow ``iterrows()``.
  """
  for s, o in zip(data_iter["s"], data_iter["o"]):
    yield vocab([s, o])

# Build the vocabulary from the training triples, with the special tokens listed first.
data_iter = read_data(config["train"])
vocab = build_vocab_from_iterator(
    yieldTokens(data_iter),
    specials=["<cls>", "<sep1>", "<sep2>", "<mask>", "<end>", "<unk>"],
)
# Tokens missing from the vocabulary resolve to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])
# subObj = yieldSubjectsObjects(data_iter)
# subjects = subObj[0]
# objects = subObj[1]

In [None]:
# One dataset per split, all sharing the vocabulary built from the training data.
train_dataset = CustomDataset(read_data(config["train"]),vocab)
val_dataset = CustomDataset(read_data(config["val"]),vocab)
test_dataset = CustomDataset(read_data(config["test"]),vocab)

# Only the training loader is shuffled; the evaluation loaders keep file order
# so repeated evaluations see the examples in the same sequence.
train_dataloader = DataLoader(train_dataset, batch_size = 256, num_workers=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = 256, num_workers=2, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size = 256, num_workers=2, shuffle=False)

In [None]:

# Transformer for masked object prediction; this rebinds the notebook-level
# `model` and `optimizer` names.
model = Transformer(
    vocab_size=len(vocab),
    input_dim=64,
    num_heads=2,
    num_encoder_layers=4,
    ff_dim=64,
    dropout=0,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-6,
)


In [None]:
def train(model, loader, method, epoch=None):
  """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

  Uses the notebook-level ``criterion`` and ``optimizer``.

  Args:
    model: Module called as ``model(x, None, method)``.
    loader: Iterable of dicts with keys ``"x"`` (inputs) and ``"y"`` (targets).
    method: Forwarded to the model to select its output head.
    epoch: Optional epoch number for the progress lines. When omitted, the
      notebook-level ``epoch`` variable is used if it exists, otherwise ``"?"``,
      so the function no longer fails when called outside the epoch loop.

  Returns:
    Mean of the per-batch losses, or ``0.0`` when ``loader`` yields no batch.
  """
  model.train()
  epochLabel = epoch if epoch is not None else globals().get("epoch", "?")
  total_loss = 0.0
  total_batches = 0
  for batchIndex, data in enumerate(loader):
    x = data["x"]
    y = data["y"]
    optimizer.zero_grad()
    out = model(x, None, method)
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    total_batches += 1

    # Progress line with the running mean loss, every 50 batches.
    if batchIndex % 50 == 0:
      print(f"epoch {epochLabel} | {batchIndex}/{len(loader)} | loss: {total_loss / total_batches}")

  return total_loss / total_batches if total_batches else 0.0


In [None]:
# Train the Transformer with the masked-generation head (method 1) for 50 epochs.
epochs, method = 50, 1
for epoch in range(epochs):
  train(model, train_dataloader, method)
  # Visual separator between the progress output of consecutive epochs.
  print('-' * 89)


In [None]:
def generateSample(dataset, query, sampleSize = 2000, maskMode = "o"):
    """Build candidate triples for ``query`` by replacing one of its slots.

    Args:
        dataset: Object with a ``sample(n)`` method returning a DataFrame with
            columns ``s``, ``r`` and ``o``.
        query: Mapping or Series with keys ``s``, ``r`` and ``o``. It is not modified.
        sampleSize: Number of rows to draw from ``dataset``.
        maskMode: Which slot (``"s"``, ``"r"`` or ``"o"``) is replaced by the
            sampled values.

    Returns:
        Array of shape ``[num_sampled, 3]``; every row is a copy of the query
        with the ``maskMode`` slot taken from one sampled row.

    Raises:
        ValueError: If ``maskMode`` is not one of ``s``, ``r``, ``o``.
    """
    columns = ["s", "r", "o"]
    slot = columns.index(maskMode)
    sample = dataset.sample(sampleSize)
    # Read the query by label once, so the caller's object is never mutated.
    template = [query[column] for column in columns]
    ret = []
    for candidate in sample[maskMode]:
        # A fresh list per row: reusing one object made every row identical.
        row = list(template)
        row[slot] = candidate
        ret.append(row)
    # The reshape keeps the result two-dimensional even when nothing was sampled.
    return np.array(ret).reshape(-1, len(columns))

@torch.no_grad()
def getTopK(dataset, query, maskMode = "o", K=10, sampleSize = 2000):
    """Score sampled candidate triples for ``query`` and return the ``K`` best.

    Candidates come from ``generateSample``. Each one is encoded by
    ``CustomDataset`` in its default masked form and run through the
    notebook-level ``model`` with the masked-generation head. A candidate's
    score is the logit the model assigns to that candidate's object token.

    Args:
        dataset: Source of candidate rows, forwarded to ``generateSample``.
        query: Triple with keys ``s``, ``r`` and ``o``.
        maskMode: Slot that varies between candidates (``"o"`` or ``"s"``).
        K: Number of top-scoring candidates to return.
        sampleSize: Number of candidates to draw.

    Returns:
        ``(topKDocs, dataFrame)``: the best candidate rows in descending score
        order, and the DataFrame of all candidates.
    """
    model.eval()
    data = generateSample(dataset, query, sampleSize, maskMode)
    dataFrame = pd.DataFrame({'s': data[:, 0], 'r': data[:, 1], 'o': data[:, 2]})
    candidates = CustomDataset(dataFrame, vocab)
    # All candidates fit in one in-process batch; worker processes would only add start-up cost.
    loader = DataLoader(candidates, batch_size=max(1, sampleSize), num_workers=0, shuffle=False)

    batchScores = []
    for dataBatch in loader:
        out = model(dataBatch["x"], None, 1)
        # "y" is the one-hot encoding of the candidate's object, so its argmax is
        # the object's token id. The input itself holds <mask> in the object
        # position, which is why the id cannot be read back from "x".
        objectIds = dataBatch["y"].argmax(dim=1, keepdim=True)
        batchScores.append(out.gather(1, objectIds).squeeze(1))

    if not batchScores:
        return [], dataFrame

    scores = torch.cat(batchScores).cpu().numpy()
    # Stable sort so tied candidates keep their sampled order.
    topK = np.argsort(-scores, kind="stable")[:K]
    topKDocs = [data[docIndex] for docIndex in topK]
    return topKDocs, dataFrame

def getGoldLabels(dataset, query, sampledDataset, maskMode = "o"):
    """Label every candidate row as a correct (1) or incorrect (0) answer to ``query``.

    A candidate counts as correct when it agrees with the query on the two
    slots that are not masked and its masked slot is a known answer. Known
    answers are the query's own value plus the values of all ``dataset`` rows
    that share the query's unmasked slots.

    Args:
        dataset: DataFrame of true triples (columns ``s``, ``r``, ``o``), or an
            object exposing such a frame as ``.dataset``.
        query: Triple with keys ``s``, ``r`` and ``o``.
        sampledDataset: DataFrame of candidate triples to label.
        maskMode: The slot being predicted (``"s"``, ``"r"`` or ``"o"``).

    Returns:
        List of ``0``/``1`` ints, one per row of ``sampledDataset``.

    Raises:
        ValueError: If ``maskMode`` is not one of ``s``, ``r``, ``o``.
    """
    columns = ("s", "r", "o")
    if maskMode not in columns:
        raise ValueError(f"maskMode must be one of {columns}, got {maskMode!r}")
    first, second = [column for column in columns if column != maskMode]

    # Accept both a plain DataFrame and a wrapper that stores it as `.dataset`.
    triples = getattr(dataset, "dataset", dataset)
    sameContext = (triples[first] == query[first]) & (triples[second] == query[second])
    answers = set(triples.loc[sameContext, maskMode])
    answers.add(query[maskMode])

    isGold = (
        (sampledDataset[first] == query[first])
        & (sampledDataset[second] == query[second])
        & sampledDataset[maskMode].isin(list(answers))
    )
    return [int(flag) for flag in isGold]


def getPrecisionTillK(dataset, queries,  maskMode = "o", K=10):
    """Compute precision@1 .. precision@K for every query.

    Args:
        dataset: Forwarded to ``getTopK`` and ``getGoldLabels``.
        queries: DataFrame whose rows are the query triples.
        maskMode: Slot being predicted.
        K: Largest cut-off.

    Returns:
        List with one entry per query (in row order); entry ``[q][k - 1]`` is
        precision@k. Cut-offs beyond the number of returned candidates stay 0.
    """
    ret = [[0 for _ in range(K)] for _ in range(len(queries))]
    # enumerate gives the row position; the DataFrame index label of a sampled
    # frame is not a valid position into `ret`.
    for position, (_, query) in enumerate(queries.iterrows()):
        # Rank once with the largest cut-off: calling getTopK per k would draw a
        # different random candidate set for every k.
        topK, _ = getTopK(dataset, query, maskMode, K)
        if len(topK) == 0:
            continue
        ranked = pd.DataFrame(np.asarray(topK).reshape(-1, 3), columns=["s", "r", "o"])
        goldTopK = getGoldLabels(dataset, query, ranked, maskMode)
        goldSoFar = 0
        for k, isGold in enumerate(goldTopK, start=1):
            goldSoFar += isGold
            ret[position][k - 1] = goldSoFar / k
    return ret


def metrics(dataset, numQueries = 1000,  maskMode = "o", K=10, sampleSize = 2000):
    """Evaluate the model on randomly drawn queries and print Hits@K and MRR.

    For each query, ``getTopK`` ranks ``sampleSize`` sampled candidates and
    ``getGoldLabels`` marks which of the top ``K`` are correct. A query with no
    correct candidate in its top ``K`` contributes 0 to every metric.

    Args:
        dataset: Object with ``sample(n)`` returning a DataFrame of triples;
            also forwarded to ``getTopK`` and ``getGoldLabels``.
        numQueries: Number of query triples to draw.
        maskMode: ``"o"`` for ``sr?`` queries, otherwise reported as ``?ro``.
        K: Ranking cut-off.
        sampleSize: Number of candidates ranked per query.

    Returns:
        ``(hits, mrr, meanAP)``: fraction of queries with a correct candidate in
        the top ``K``, mean reciprocal rank of the first correct candidate, and
        mean average precision over the top ``K``.
    """
    queries = dataset.sample(numQueries)
    hitFlags = []
    reciprocalRanks = []
    averagePrecisions = []
    for _, query in queries.iterrows():
        topK, _ = getTopK(dataset, query, maskMode, K, sampleSize = sampleSize)
        if len(topK) == 0:
            goldTopK = []
        else:
            # getTopK returns candidate rows, so label those rows directly.
            ranked = pd.DataFrame(np.asarray(topK).reshape(-1, 3), columns=["s", "r", "o"])
            goldTopK = getGoldLabels(dataset, query, ranked, maskMode)

        # 1-based ranks of the correct candidates within the top K.
        goldRanks = [rank for rank, isGold in enumerate(goldTopK, start=1) if isGold]
        if goldRanks:
            hitFlags.append(1.0)
            reciprocalRanks.append(1.0 / goldRanks[0])
            # Average of precision@rank taken at every correct candidate's rank.
            precisionsAtGold = [(found + 1) / rank for found, rank in enumerate(goldRanks)]
            averagePrecisions.append(sum(precisionsAtGold) / len(precisionsAtGold))
        else:
            hitFlags.append(0.0)
            reciprocalRanks.append(0.0)
            averagePrecisions.append(0.0)

    hits = float(np.mean(hitFlags)) if hitFlags else 0.0
    mrr = float(np.mean(reciprocalRanks)) if reciprocalRanks else 0.0
    meanAP = float(np.mean(averagePrecisions)) if averagePrecisions else 0.0
    queryType = "sr?" if maskMode == "o" else "?ro"
    print(f"queryType: {queryType} K:{K} | hits: {hits} | mrr: {mrr}")
    return hits, mrr, meanAP


In [None]:
# Report the metrics for both cut-offs and both query directions.
for K in [1,10]:
  for maskMode in ["o", "s"]:
    # Pass the loop's K through; it was previously fixed at 10 for every iteration.
    t = metrics(test_dataset, numQueries = 1000, maskMode = maskMode, K = K, sampleSize = 2000)
