<a href="https://colab.research.google.com/github/SeokwonHan02/2023-DSAIL-internship/blob/main/cs471_bpr_linkpred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Modules

In [4]:
import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import ConfusionMatrixDisplay
from typing import Tuple, List
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset

In [2]:
SEED = 717
def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
set_seed(SEED)

# Load Dataset

In [5]:
from google.colab import files
f = files.upload()

Saving Cora.zip to Cora.zip


In [6]:
!unzip Cora.zip

Archive:  Cora.zip
  inflating: Cora/edge.pt            
  inflating: Cora/feat.pt            
  inflating: Cora/label.pt           


In [7]:
# 데이터셋 로드
features: torch.Tensor = torch.load('./Cora/feat.pt')
edges_raw: torch.Tensor = torch.load('./Cora/edge.pt')
labels: torch.Tensor = torch.load('./Cora/label.pt')

num_nodes = features.shape[0]

features = torch.eye(num_nodes)

feat_dim = features.shape[1]
num_edges = edges_raw.shape[1]
num_classes = labels.max().item()+1

print("Number of Total Nodes : ", num_nodes)
print("Number of Total Edges : ", num_edges)
print("Dimension of Node Features : ", feat_dim)
print("Number of Classes : ", num_classes)

Number of Total Nodes :  2708
Number of Total Edges :  10556
Dimension of Node Features :  2708
Number of Classes :  7


In [8]:
indices = torch.randperm(num_edges)
edges = edges_raw[:, indices]

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

num_train = int(train_ratio * num_edges)
num_val = int(val_ratio * num_edges)
num_test = num_edges - num_train - num_val

train_edges = edges[:, :num_train]
val_edges = edges[:, num_train:num_train + num_val]
test_edges = edges[:, num_train + num_val:]

print("Number of Training Edges: ", train_edges.shape[1])
print("Number of Validation Edges: ", val_edges.shape[1])
print("Number of Test Edges: ", test_edges.shape[1])

Number of Training Edges:  8444
Number of Validation Edges:  1055
Number of Test Edges:  1057


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [10]:
all_node = torch.arange(0, num_nodes)
num_neg = 50

def triplet(num_node, edges):
  D_S = []

  for node1 in range(num_node):
    pos_sample = edges[0][edges[1] == node1]
    mask = ~torch.isin(all_node, pos_sample)
    neg_node = all_node[mask]

    rand_neg = torch.randperm(neg_node.size(0))[:num_neg]
    neg_sample = neg_node[rand_neg]

    for pos_node in pos_sample:
      for neg_node in neg_sample:
        D_S.append([node1, pos_node, neg_node])

  return torch.tensor(D_S)

D_S = triplet(num_nodes, train_edges)
print(D_S.shape)

torch.Size([422200, 3])


In [11]:
def dataload(D_S, batch_size):
    node = torch.tensor([t[0] for t in D_S])
    pos_node = torch.tensor([t[1] for t in D_S])
    neg_node = torch.tensor([t[2] for t in D_S])

    dataset = TensorDataset(node, pos_node, neg_node)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

In [12]:
data_load = dataload(D_S, 1000)

In [22]:
# BPR 모델 및 임베딩 학습
class BPRModel(nn.Module):
    def __init__(self, num_nodes: int, emb_dim: int):
        super(BPRModel, self).__init__()
        self.embeddings = nn.Embedding(num_nodes, emb_dim)
        nn.init.xavier_uniform_(self.embeddings.weight.data)

    def forward(self, node_1 : int, node_2 : int):
        # 노드 쌍 인덱스에서 임베딩 추출

        node_1_embedding = self.embeddings(node_1)
        node_2_embedding = self.embeddings(node_2)

        prediction = torch.sum(node_1_embedding * node_2_embedding, dim = 1)

        return prediction


    def bpr_loss(self, node, pos_node, neg_node):
        node_embedding = self.embeddings(node)
        pos_embedding = self.embeddings(pos_node)
        neg_embedding = self.embeddings(neg_node)

        pos_scores = torch.sum(node_embedding * pos_embedding, dim=1)
        neg_scores = torch.sum(node_embedding * neg_embedding, dim=1)

        loss = -torch.mean(torch.log(torch.sigmoid(pos_scores - neg_scores)))

        return loss


# 훈련 함수에서 부정적인 샘플 생성 부분 수정
def train_bpr_model(model, data_load, lr=0.01):
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    total_loss = 0.0

    for batch in data_load:
        node, pos_node, neg_node = batch
        node = node.to(device)
        pos_node = pos_node.to(device)
        neg_node = neg_node.to(device)

        optimizer.zero_grad()
        loss = model.bpr_loss(node, pos_node, neg_node)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss

In [20]:
def compute_scores(out, edges):
        src = edges[0, :]
        dst = edges[1, :]
        scores = torch.sum(out[src] * out[dst], dim=1)
        return torch.sigmoid(scores).cpu().numpy()

def generate_neg_edges(pos_edges, num_nodes, num_neg_edges):
    existing_edges = set(tuple(edge) for edge in pos_edges.T)
    neg_edges = []

    while len(neg_edges) < num_neg_edges:
        node1 = np.random.randint(num_nodes)
        node2 = np.random.randint(num_nodes)

        if (node1, node2) not in existing_edges and (node2, node1) not in existing_edges and node1 != node2:
            neg_edges.append((node1, node2))

    return np.array(neg_edges).T

@torch.no_grad()
def test(model, features, train_edges, val_edges, test_edges):
    model.eval()
    out = model.embeddings.weight.data

    num_nodes = features.shape[0]
    num_neg_edges = len(train_edges[0])

    train_neg_edges = generate_neg_edges(train_edges, num_nodes, num_neg_edges)
    val_neg_edges = generate_neg_edges(np.hstack((train_edges.cpu(), val_edges.cpu())), num_nodes, num_neg_edges)
    test_neg_edges = generate_neg_edges(np.hstack((train_edges.cpu(), val_edges.cpu(), test_edges.cpu())), num_nodes, num_neg_edges)

    train_pos_scores = compute_scores(out, train_edges)
    train_neg_scores = compute_scores(out, train_neg_edges)

    val_pos_scores = compute_scores(out, val_edges)
    val_neg_scores = compute_scores(out, val_neg_edges)

    test_pos_scores = compute_scores(out, test_edges)
    test_neg_scores = compute_scores(out, test_neg_edges)

    train_labels = np.hstack([np.ones(len(train_pos_scores)), np.zeros(len(train_neg_scores))])
    train_scores = np.hstack([train_pos_scores, train_neg_scores])

    val_labels = np.hstack([np.ones(len(val_pos_scores)), np.zeros(len(val_neg_scores))])
    val_scores = np.hstack([val_pos_scores, val_neg_scores])

    test_labels = np.hstack([np.ones(len(test_pos_scores)), np.zeros(len(test_neg_scores))])
    test_scores = np.hstack([test_pos_scores, test_neg_scores])

    train_auc = roc_auc_score(train_labels, train_scores)
    val_auc = roc_auc_score(val_labels, val_scores)
    test_auc = roc_auc_score(test_labels, test_scores)

    return train_auc, val_auc, test_auc

def train(num_epochs, model, features, train_edges, val_edges, test_edges, data_load):

    best_val_auc = best_test_auc = 0
    best_epoch = 0
    train_aucs, valid_aucs, test_aucs = [], [], []

    with tqdm(range(1, num_epochs + 1), unit="epoch", desc="Training") as pbar:
        pbar.clear()
        for epoch in pbar:
            loss = train_bpr_model(model, data_load)  # Train BPRModel for 1 epoch
            train_auc, val_auc, test_auc = test(model, features, train_edges, val_edges, test_edges)

            train_aucs.append(train_auc)
            valid_aucs.append(val_auc)
            test_aucs.append(test_auc)

            if val_auc > best_val_auc:
                best_val_auc = val_auc
                best_test_auc = test_auc
                best_epoch = epoch
                torch.save(model.state_dict(), "best_model.pt")

            postfix_new = ", ".join([f"Auc. (Train): {train_auc:.4f}",
                                     f"Auc. (Val): {val_auc:.4f} (best: {best_val_auc:.4f})",
                                     f"Auc. (Test): {test_auc:.4f} (best: {best_test_auc:.4f})",
                                     f"loss: {loss:.4f}"])
            pbar.set_postfix_str(postfix_new)

    return best_epoch, best_val_auc, best_test_auc

In [23]:
set_seed(SEED)

model = BPRModel(num_nodes, 64).to(device)
features = features.to(device)
train_edges = train_edges.to(device)
val_edges = val_edges.to(device)
test_edges = test_edges.to(device)

num_epochs = 10
best_epoch, best_val_auc, best_test_aucs = train(num_epochs, model, features, train_edges, val_edges, test_edges, data_load)

Training: 100%|██████████| 10/10 [01:03<00:00,  6.31s/epoch, Auc. (Train): 0.9971, Auc. (Val): 0.9518 (best: 0.9603), Auc. (Test): 0.9642 (best: 0.9686), loss: 0.0375]
