In [None]:
# ðŸ“¦ Install dependencies (only run once per environment)
%pip install torch torch_geometric pandas scikit-learn tqdm --quiet

In [1]:
import os.path as osp
import os

import torch
from tqdm import tqdm

from torch_geometric.datasets import MovieLens100K
from torch_geometric.nn import LightGCN, GraphSAGE, GAT
from torch_geometric.utils import degree

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
print(f'Using device: {device}')

path = osp.join(os.getcwd(), '..', 'data', 'PyG-ML100K')
path = osp.abspath(path)  # Optional: gets the absolute normalized path

dataset = MovieLens100K(path)
data = dataset[0]
num_users = data['user'].num_nodes
num_movies = data['movie'].num_nodes # Consistent naming with MovieLens
data = data.to_homogeneous()
data = data.to(device)


# Use all message passing edges as training labels:
batch_size = 2048
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]
train_loader = torch.utils.data.DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=batch_size,
)

# Graph Recommendation Model: LightGCN, GraphSAGE, GAT.
model = LightGCN(
    num_nodes=data.num_nodes,
    embedding_dim=64,
    num_layers=2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def train():
    total_loss = total_examples = 0

    for index in tqdm(train_loader):
        # Sample positive and negative labels.
        pos_edge_label_index = train_edge_label_index[:, index]
        neg_edge_label_index = torch.stack([
            pos_edge_label_index[0],
            torch.randint(0, data.num_nodes, (index.numel(), ), device=device)
        ], dim=0)
        edge_label_index = torch.cat([
            pos_edge_label_index,
            neg_edge_label_index,
        ], dim=1)

        optimizer.zero_grad()
        pos_rank, neg_rank = model(data.edge_index, edge_label_index).chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    return total_loss / total_examples


@torch.no_grad()
def test(k: int):
    emb = model.get_embedding(data.edge_index)
    user_emb, book_emb = emb[:num_users], emb[num_users:]

    precision = recall = total_examples = 0
    for start in range(0, num_users, batch_size):
        end = start + batch_size
        logits = user_emb[start:end] @ book_emb.t()

        # Exclude training edges:
        mask = ((train_edge_label_index[0] >= start) &
                (train_edge_label_index[0] < end))
        logits[train_edge_label_index[0, mask] - start,
               train_edge_label_index[1, mask] - num_users] = float('-inf')

        # Computing precision and recall:
        ground_truth = torch.zeros_like(logits, dtype=torch.bool)
        mask = ((data.edge_label_index[0] >= start) &
                (data.edge_label_index[0] < end))
        ground_truth[data.edge_label_index[0, mask] - start,
                     data.edge_label_index[1, mask] - num_users] = True
        node_count = degree(data.edge_label_index[0, mask] - start,
                            num_nodes=logits.size(0))

        topk_index = logits.topk(k, dim=-1).indices
        isin_mat = ground_truth.gather(1, topk_index)

        precision += float((isin_mat.sum(dim=-1) / k).sum())
        recall += float((isin_mat.sum(dim=-1) / node_count.clamp(1e-6)).sum())
        total_examples += int((node_count > 0).sum())

    return precision / total_examples, recall / total_examples


for epoch in range(1, 101):
    loss = train()
    precision, recall = test(k=10)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Precision@10: '
          f'{precision:.4f}, Recall@10: {recall:.4f}')

Using device: cpu


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 40/40 [00:01<00:00, 33.17it/s]


IndexError: index 948 is out of bounds for dimension 0 with size 943

In [2]:
import os.path as osp
import os

import torch
from tqdm import tqdm

from torch_geometric.datasets import MovieLens100K
from torch_geometric.nn import LightGCN, GraphSAGE, GAT
from torch_geometric.utils import degree

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

path = osp.join(os.getcwd(), '..', 'data', 'PyG-ML100K')
path = osp.abspath(path)

dataset = MovieLens100K(path)
data = dataset[0]

# Separate num_users and num_movies before converting to homogeneous graph
num_users = data['user'].num_nodes
num_movies = data['movie'].num_nodes

# Convert to homogeneous graph AFTER extracting individual node counts
data = data.to_homogeneous()
data = data.to(device)

# Use all message passing edges as training labels:
batch_size = 8096
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]
train_loader = torch.utils.data.DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=batch_size,
)

# Graph Recommendation Model: LightGCN, GraphSAGE, GAT.
model = LightGCN(
    num_nodes=data.num_nodes,
    embedding_dim=64,
    num_layers=2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def train():
    total_loss = total_examples = 0

    for index in tqdm(train_loader):
        # Sample positive and negative labels.
        pos_edge_label_index = train_edge_label_index[:, index]
        neg_edge_label_index = torch.stack([
            pos_edge_label_index[0],
            # Ensure negative samples are correctly within the range of all nodes
            torch.randint(0, data.num_nodes, (index.numel(), ), device=device)
        ], dim=0)
        edge_label_index = torch.cat([
            pos_edge_label_index,
            neg_edge_label_index,
        ], dim=1)

        optimizer.zero_grad()
        pos_rank, neg_rank = model(data.edge_index, edge_label_index).chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    return total_loss / total_examples


@torch.no_grad()
def test(k: int):
    emb = model.get_embedding(data.edge_index)
    user_emb, movie_emb = emb[:num_users], emb[num_users:]

    precision = recall = total_examples = 0
    for start in range(0, num_users, batch_size):
        end = min(start + batch_size, num_users) # Ensure 'end' does not exceed num_users
        current_batch_size = end - start # Actual size of the current user batch
        logits = user_emb[start:end] @ movie_emb.t()

        # Exclude training edges:
        mask_train = ((train_edge_label_index[0] >= start) &
                      (train_edge_label_index[0] < end))
        
        # Ensure indices for 'logits' are within bounds of the current batch
        # train_edge_label_index[0, mask_train] are the user IDs in the homogeneous graph
        # train_edge_label_index[1, mask_train] are the item IDs in the homogeneous graph
        # We need to adjust these to be relative to the current 'logits' tensor
        relative_user_indices_train = train_edge_label_index[0, mask_train] - start
        relative_movie_indices_train = train_edge_label_index[1, mask_train] - num_users

        # Check for valid indices before assignment to prevent IndexError
        valid_mask_train_rows = (relative_user_indices_train >= 0) & (relative_user_indices_train < current_batch_size)
        valid_mask_train_cols = (relative_movie_indices_train >= 0) & (relative_movie_indices_train < num_movies)
        valid_mask_train = valid_mask_train_rows & valid_mask_train_cols

        # Apply the mask to only the valid indices
        logits[relative_user_indices_train[valid_mask_train],
               relative_movie_indices_train[valid_mask_train]] = float('-inf')

        # Computing precision and recall:
        ground_truth = torch.zeros_like(logits, dtype=torch.bool)
        mask_gt = ((data.edge_label_index[0] >= start) &
                   (data.edge_label_index[0] < end))
        
        # Similar index adjustment and validation for ground_truth
        relative_user_indices_gt = data.edge_label_index[0, mask_gt] - start
        relative_movie_indices_gt = data.edge_label_index[1, mask_gt] - num_users

        valid_mask_gt_rows = (relative_user_indices_gt >= 0) & (relative_user_indices_gt < current_batch_size)
        valid_mask_gt_cols = (relative_movie_indices_gt >= 0) & (relative_movie_indices_gt < num_movies)
        valid_mask_gt = valid_mask_gt_rows & valid_mask_gt_cols

        ground_truth[relative_user_indices_gt[valid_mask_gt],
                     relative_movie_indices_gt[valid_mask_gt]] = True
        
        # node_count calculation also needs to use the relative user indices
        node_count = degree(relative_user_indices_gt[valid_mask_gt],
                            num_nodes=logits.size(0))

        topk_index = logits.topk(k, dim=-1).indices
        isin_mat = ground_truth.gather(1, topk_index)

        precision += float((isin_mat.sum(dim=-1) / k).sum())
        recall += float((isin_mat.sum(dim=-1) / node_count.clamp(1e-6)).sum())
        total_examples += int((node_count > 0).sum())

    return precision / total_examples, recall / total_examples


for epoch in range(1, 101):
    loss = train()
    precision, recall = test(k=10)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Precision@10: '
          f'{precision:.4f}, Recall@10: {recall:.4f}')

Using device: cuda


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10/10 [00:00<00:00, 43.57it/s]


ZeroDivisionError: float division by zero