In [None]:
%pip install dhg

In [9]:
import time
from copy import deepcopy

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dhg import BiGraph
from dhg.data import MovieLens1M, BaseData
from dhg.models import NGCF, DHCF
from dhg.nn import BPRLoss, EmbeddingRegularization
from dhg.metrics import UserItemRecommenderEvaluator as Evaluator
from dhg.random import set_seed
from dhg.utils import UserItemDataset, adj_list_to_edge_list

In [22]:
class BPR_Reg(nn.Module):
    def __init__(self, weight_decay):
        super().__init__()
        self.reg = EmbeddingRegularization(p=2, weight_decay=weight_decay)
        self.bpr = BPRLoss(activation="softplus")

    def forward(self, emb_users, emb_items, users, pos_items, neg_items, raw_emb_users, raw_emb_items):
        cur_u = emb_users[users]
        cur_pos_i, cur_neg_i = emb_items[pos_items], emb_items[neg_items]
        pos_scores, neg_scores = (cur_u * cur_pos_i).sum(dim=1), (cur_u * cur_neg_i).sum(dim=1)
        loss_bpr = self.bpr(pos_scores, neg_scores)
        raw_u = raw_emb_users[users]
        raw_pos_i, raw_neg_i = raw_emb_items[pos_items], raw_emb_items[neg_items]
        loss_l2 = self.reg(raw_u, raw_pos_i, raw_neg_i)
        loss = loss_bpr + loss_l2

        return loss


def train(net, data_loader, optimizer, criterion, epoch):
    net.train()

    loss_mean, st = 0, time.time()
    for users, pos_items, neg_items in data_loader:
        users, pos_items, neg_items = users.to(device), pos_items.to(device), neg_items.to(device)
        optimizer.zero_grad()
        emb_users, emb_items = net(ui_bigraph)
        loss = criterion(
            emb_users, emb_items, users, pos_items, neg_items, net.u_embedding.weight, net.i_embedding.weight,
        )
        loss.backward()
        optimizer.step()
        loss_mean += loss.item() * users.shape[0]
    loss_mean /= len(data_loader.dataset)
    print(f"Epoch: {epoch}, Time: {time.time()-st:.5f}s, Loss: {loss_mean:.5f}")


@torch.no_grad()
def validate(net, data_loader):
    net.eval()

    for users, train_mask, true_rating in data_loader:
        users, train_mask, true_rating = (
            users.to(device),
            train_mask.to(device),
            true_rating.to(device),
        )

        emb_users, emb_items = net(ui_bigraph)
        pred_rating = emb_users[users] @ emb_items.t()
        pred_rating += train_mask
        evaluator.validate_add_batch(true_rating, pred_rating)
    return evaluator.validate_epoch_res()


@torch.no_grad()
def test(net, data_loader):
    net.eval()

    for users, train_mask, true_rating in data_loader:
        users, train_mask, true_rating = (
            users.to(device),
            train_mask.to(device),
            true_rating.to(device),
        )
        emb_users, emb_items = net(ui_bigraph)
        pred_rating = emb_users[users] @ emb_items.t()
        pred_rating += train_mask
        evaluator.test_add_batch(true_rating, pred_rating)
    return evaluator.test_epoch_res()

In [3]:
%pip install pandas

Collecting pandas
  Using cached pandas-2.3.1-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.1-cp310-cp310-win_amd64.whl (11.3 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas

   ---------------------------------------- 0/3 [pytz]
   ------------- -------------------------- 1/3 [tzdata]
   ------------- -------------------------- 1/3 [tzdata]
   -------------------------- ------------- 2/3 [pandas]
   -------------------------- ------------- 2/3 [pandas]
   -------------------------- ------------- 2/3 [pandas]
   -------------------------- ------------- 2/3 [pandas]
   -------------------------- ------------- 2/3 [pandas]
   ----------------

In [29]:
# Construct dataset 100k-ml
import pandas as pd

ratings = pd.read_csv('u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])
ratings = ratings.drop(columns='timestamp')

hypergraph_edge_list = []
for index, row in ratings.iterrows():
    # Each hyperedge is a list containing the nodes it connects.
    # In this case, a user node and an item node.
    hypergraph_edge_list.append([row['user'], row['item']])

In [None]:
from recbole.quick_start import run_recbole

run_recbole(model='NGCF', dataset='ml-100k')

22 Jul 18:27    INFO  ['c:\\Users\\Sharjeel Mustafa\\miniconda3\\envs\\recbole\\lib\\site-packages\\ipykernel_launcher.py', '--f="c:\\Users\\Sharjeel Mustafa\\AppData\\Roaming\\jupyter\\runtime\\kernel-v330dc9fb5782ed561f16cdee0e7a5dde0a6f5d218.json"']
22 Jul 18:27    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = C:\Users\Sharjeel Mustafa\miniconda3\envs\recbole\Lib\site-packages\recbole\config\../dataset_example/ml-100k
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Pa

In [31]:
hypergraph_edge_list = ratings.groupby('user')['item'].apply(list).apply(lambda x: [x.name] + x).tolist()
print(hypergraph_edge_list)

AttributeError: 'list' object has no attribute 'name'

In [5]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

ratings['user'] = user_encoder.fit_transform(ratings['user'])
ratings['item'] = item_encoder.fit_transform(ratings['item'])

num_users = ratings['user'].nunique()
num_items = ratings['item'].nunique()

In [6]:
ratings = ratings.sort_values(by=['user'])  # if timestamp available
test_data = ratings.groupby('user').tail(1)
train_data = ratings.drop(test_data.index)

In [10]:
import scipy.sparse as sp
import numpy as np

def build_incidence_matrix(train_data, num_users, num_items):
    rows = []
    cols = []
    for uid in range(num_users):
        items = train_data[train_data['user'] == uid]['item'].values
        rows.extend(items)
        cols.extend([uid] * len(items))  # one hyperedge per user
    values = np.ones(len(rows))
    H_iu = sp.coo_matrix((values, (rows, cols)), shape=(num_items, num_users))  # items Ã hyperedges
    H_ui = sp.coo_matrix((values, (cols, rows)), shape=(num_users, num_items))
    print(num_items)
    print(num_users)
    return H_iu, H_ui

H, T = build_incidence_matrix(train_data, num_users, num_items)

1682
943


In [28]:
def make_edge_list(train_data):
    """
    Convert train_data with columns 'user' and 'item' into an edge list
    where each element is (user, item).
    """
    users = train_data['user'].values
    items = train_data['item'].values
    edge_list = list(zip(users, items))
    return edge_list

E = make_edge_list(train_data)
print(E)

[(np.int64(0), np.int64(27)), (np.int64(0), np.int64(202)), (np.int64(0), np.int64(74)), (np.int64(0), np.int64(116)), (np.int64(0), np.int64(195)), (np.int64(0), np.int64(78)), (np.int64(0), np.int64(51)), (np.int64(0), np.int64(110)), (np.int64(0), np.int64(242)), (np.int64(0), np.int64(90)), (np.int64(0), np.int64(146)), (np.int64(0), np.int64(66)), (np.int64(0), np.int64(212)), (np.int64(0), np.int64(154)), (np.int64(0), np.int64(114)), (np.int64(0), np.int64(91)), (np.int64(0), np.int64(50)), (np.int64(0), np.int64(232)), (np.int64(0), np.int64(268)), (np.int64(0), np.int64(29)), (np.int64(0), np.int64(169)), (np.int64(0), np.int64(43)), (np.int64(0), np.int64(175)), (np.int64(0), np.int64(145)), (np.int64(0), np.int64(104)), (np.int64(0), np.int64(8)), (np.int64(0), np.int64(25)), (np.int64(0), np.int64(222)), (np.int64(0), np.int64(259)), (np.int64(0), np.int64(245)), (np.int64(0), np.int64(15)), (np.int64(0), np.int64(52)), (np.int64(0), np.int64(125)), (np.int64(0), np.int64(8

In [24]:
from typing import Optional
class ML100K(BaseData):
    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("ml_100k", data_root)
        self._content = {
            "num_vertices": 1682,
            "num_edges": 943,
            "edge_list": H
        }

In [25]:
dataset = ML100K()
print(dataset['edge_list'])

<COOrdinate sparse matrix of dtype 'float64'
	with 99057 stored elements and shape (1682, 943)>
  Coords	Values
  (27, 0)	1.0
  (202, 0)	1.0
  (74, 0)	1.0
  (116, 0)	1.0
  (195, 0)	1.0
  (78, 0)	1.0
  (51, 0)	1.0
  (110, 0)	1.0
  (242, 0)	1.0
  (90, 0)	1.0
  (146, 0)	1.0
  (66, 0)	1.0
  (212, 0)	1.0
  (154, 0)	1.0
  (114, 0)	1.0
  (91, 0)	1.0
  (50, 0)	1.0
  (232, 0)	1.0
  (268, 0)	1.0
  (29, 0)	1.0
  (169, 0)	1.0
  (43, 0)	1.0
  (175, 0)	1.0
  (145, 0)	1.0
  (104, 0)	1.0
  :	:
  (430, 942)	1.0
  (201, 942)	1.0
  (283, 942)	1.0
  (183, 942)	1.0
  (126, 942)	1.0
  (470, 942)	1.0
  (228, 942)	1.0
  (172, 942)	1.0
  (613, 942)	1.0
  (545, 942)	1.0
  (567, 942)	1.0
  (75, 942)	1.0
  (138, 942)	1.0
  (400, 942)	1.0
  (1027, 942)	1.0
  (52, 942)	1.0
  (684, 942)	1.0
  (569, 942)	1.0
  (230, 942)	1.0
  (671, 942)	1.0
  (654, 942)	1.0
  (95, 942)	1.0
  (232, 942)	1.0
  (422, 942)	1.0
  (390, 942)	1.0


In [None]:

dim_emb = 64
lr = 0.001
num_workers = 0
batch_sz = 16384 
val_freq = 20
epoch_max = 100
weight_decay = 1e-4
set_seed(2022)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
evaluator = Evaluator([{"ndcg": {"k": 10}}, {"recall": {"k": 10}}, {"precision":{"k": 10}}])



num_users = data1._content["num_vertices"]
num_items = data1._content["num_edges"]
print(num_users)
# iu = DataLoader(data1, batch_size=batch_sz, shuffle=False, num_workers=num_workers)
# ui = DataLoader(data2, batch_size=batch_sz, shuffle=False, num_workers=num_workers)

# data = MovieLens1M()
# num_u, num_i = data["num_users"], data["num_items"]
# train_adj_list = data["train_adj_list"]
# test_adj_list = data["test_adj_list"]
# ui_bigraph = BiGraph.from_adj_list(num_u, num_i, train_adj_list)
# ui_bigraph = ui_bigraph.to(device)
# train_edge_list = adj_list_to_edge_list(train_adj_list)
# test_edge_list = adj_list_to_edge_list(test_adj_list)
# train_dataset = UserItemDataset(num_u, num_i, train_edge_list)
# test_dataset = UserItemDataset(num_u, num_i, test_edge_list, train_user_item_list=train_edge_list, phase="test")
# train_loader = DataLoader(train_dataset, batch_size=batch_sz, shuffle=True, num_workers=num_workers)
# test_loader = DataLoader(test_dataset, batch_size=batch_sz, shuffle=False, num_workers=num_workers)
net = DHCF(num_users, num_items, dim_emb)
net = net.to(device)
criterion = BPR_Reg(weight_decay)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

best_state, best_val, best_epoch = None, 0, -1
for epoch in range(epoch_max):
    train(net, train_loader, optimizer, criterion, epoch)
    if epoch % val_freq == 0:
        val_res = validate(net, test_loader)
        print(f"Validation: NDCG@10 -> {val_res}")
        if val_res > best_val:
            best_epoch = epoch
            best_val = val_res
            best_state = deepcopy(net.state_dict())
            
# Final Outputs
print("train finished")
print(f"best val: {best_val}")
print(f"best epoch: {best_epoch}")
print("testing...")
net.load_state_dict(best_state)
test_res = test(net, test_loader)
print(f"test res: {test_res}")

cuda
1682


NameError: name 'train_loader' is not defined