In [None]:
!pip install cornac

In [None]:
from cornac.data import Dataset
from cornac.models import VBPR
from cornac.eval_methods import BaseMethod
from cornac.metrics import Recall, NDCG, AUC, MAP
import numpy as np
import pandas as pd
import json

In [None]:
!wget https://www.dropbox.com/s/57tel5zqopkssrh/books.csv?dl=0 -O books.csv
!wget https://www.dropbox.com/s/dqeqpsr0vdvmcy0/goodreads_past_interactions.json?dl=0 -O goodreads_past_interactions.json
!wget https://www.dropbox.com/s/rjtzhmb2zbpp30q/goodreads_test_interactions.json?dl=0 -O goodreads_test_interactions.json

In [None]:
# Book catalogue. The position of a row in this frame is used as the book's
# integer index throughout the notebook.
df_books = pd.read_csv('books.csv', sep=',')

# Two-way mapping between row position and external book_id.
idx2bookid = dict(enumerate(df_books.book_id))
bookid2idx = {id_: i for i, id_ in idx2bookid.items()}

# Training interactions: {external_user_id: [external_book_id, ...]}.
with open("goodreads_past_interactions.json", "r") as f:
    user_interactions = json.load(f)

# Users are indexed in the order their keys appear in the training file.
idx2userid = dict(enumerate(user_interactions.keys()))
userid2idx = {id_: i for i, id_ in idx2userid.items()}

# Held-out interactions, same layout as the training file.
with open("goodreads_test_interactions.json", "r") as f:
    user_interactions_test = json.load(f)

In [None]:
# Step 1: Training triples (user_idx, item_idx, rating). Every interaction is
# implicit feedback, so the rating is the constant 1. Lookups are strict: a book
# missing from bookid2idx raises KeyError.
train_data = [
    (userid2idx[user_id], bookid2idx[book_id], 1)
    for user_id, book_ids in user_interactions.items()
    for book_id in book_ids
]

# Step 2: Test triples. Users or books that have no index are silently skipped.
test_data = []
for user_id, book_ids in user_interactions_test.items():
    if user_id not in userid2idx:
        continue
    uidx = userid2idx[user_id]
    test_data.extend(
        (uidx, bookid2idx[book_id], 1)
        for book_id in book_ids
        if book_id in bookid2idx
    )


In [None]:
# Alternative input, kept for reference: concatenate the image and BERT features
# instead of loading the pre-computed combined array below.
#img_embeddings = np.load("imgs_features_sorted.npy")
#bert_embeddings = np.load("goodreads_bert_embeddings.npy")
#book_embeddings = np.concatenate([img_embeddings, bert_embeddings], axis=1)
book_embeddings = np.load("pca_800_book_combined_sorted.npy")

# Scale every row to (approximately) unit L2 norm, in place. The small epsilon
# keeps all-zero rows from dividing by zero.
row_norms = np.linalg.norm(book_embeddings, axis=1, keepdims=True)
np.divide(book_embeddings, row_norms + 1e-8, out=book_embeddings)

In [None]:
from cornac.data import FeatureModality, ImageModality
from cornac.eval_methods import BaseMethod
import numpy as np

# Step 1: Create the item feature modality.
# The raw item IDs handed to Cornac in train_data / test_data are the integer
# positions from bookid2idx, so row r of book_embeddings is declared to belong to
# raw item ID r. Passing `ids` matters: without it Cornac leaves the rows in file
# order, while its internal item indices follow the order items first appear in
# the training data, which misaligns features and items.
# NOTE(review): assumes the rows of book_embeddings follow the row order of
# books.csv — confirm against how the .npy file was produced.
item_feature_modality = FeatureModality(
    features=book_embeddings.astype(np.float32),  # PCA, BERT, etc.
    ids=list(range(book_embeddings.shape[0])),
    normalized=True
)

# Step 1.5: create image modality from the features (VBPR reads its item
# features from the `item_image` modality). The IDs are carried over so the
# same row-to-item matching applies.
item_image_modality = ImageModality(
    features=item_feature_modality.features,
    ids=item_feature_modality.ids,
    normalized=True
)

# Step 2: Create eval method using this modality
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_image=item_image_modality,
    verbose=True,
    rating_threshold=1.0,
    exclude_unknowns=True
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 52821
Number of items = 4287
Number of ratings = 3354523
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 52821
Number of items = 4287
Number of ratings = 1000
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 52821
Total items = 4287


In [None]:
from cornac.models import VBPR

In [None]:
from cornac.models import VBPR
import torch

# Fixed seed so that re-running the notebook starts from the same model
# initialisation.
# NOTE(review): batch shuffling may be governed separately by the seed of the
# evaluation method / dataset, and GPU runs can still differ slightly — confirm.
SEED = 42

# The model is trained by the Experiment cell further down, not here.
model = VBPR(
    k=32,                   # MF latent dim
    k2=64,                  # Visual latent dim
    n_epochs=7,
    batch_size=2048,
    learning_rate=0.004,
    use_gpu=torch.cuda.is_available(),
    seed=SEED,
)


In [None]:
from cornac import Experiment
from cornac.metrics import Precision, Recall, NDCG, AUC, MAP
import os

# Trained models are written below ./results in the current working directory.
save_path = os.path.join(os.getcwd(), "results")

# Ranking metrics at three cutoffs, plus the cutoff-free AUC and MAP.
cutoffs = (5, 10, 20)
metrics = (
    [Precision(k=cutoff) for cutoff in cutoffs]
    + [NDCG(k=cutoff) for cutoff in cutoffs]
    + [AUC(), MAP()]
)

# Train and evaluate the model on the prepared split.
experiment = Experiment(
    eval_method=eval_method,
    models=[model],
    metrics=metrics,
    save_dir=save_path,
)
experiment.run()



[VBPR] Training started!


Epoch 1/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 2/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 3/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 4/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 5/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 6/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Epoch 7/7:   0%|          | 0/1638 [00:00<?, ?it/s]

Optimization finished!

[VBPR] Evaluation started!


Ranking:   0%|          | 0/100 [00:00<?, ?it/s]

VBPR model is saved to /content/results/VBPR/2025-07-01_17-24-08-294683.pkl

TEST:
...
     |    AUC |    MAP | NDCG@10 | NDCG@20 | NDCG@5 | Precision@10 | Precision@20 | Precision@5 | Train (s) | Test (s)
---- + ------ + ------ + ------- + ------- + ------ + ------------ + ------------ + ----------- + --------- + --------
VBPR | 0.9501 | 0.1181 |  0.1381 |  0.1797 | 0.1451 |       0.1280 |       0.1020 |      0.1320 |  688.8749 |   0.2581



## BERT

         |    AUC |    MAP | NDCG@10 | NDCG@20 | NDCG@5 | Precision@10 | Precision@20 | Precision@5 | Train (s) | Test (s)
    ---- + ------ + ------ + ------- + ------- + ------ + ------------ + ------------ + ----------- + --------- + --------
    VBPR | 0.9475 | 0.1119 |  0.1303 |  0.1726 | 0.1441 |       0.1210 |       0.0990 |      0.1380 |  432.6284 |   0.8530

## VGG16

         |    AUC |    MAP | NDCG@10 | NDCG@20 | NDCG@5 | Precision@10 | Precision@20 | Precision@5 | Train (s) | Test (s)
    ---- + ------ + ------ + ------- + ------- + ------ + ------------ + ------------ + ----------- + --------- + --------
    VBPR | 0.9513 | 0.1178 |  0.1346 |  0.1819 | 0.1490 |       0.1240 |       0.1050 |      0.1400 |  445.0591 |   0.3273

## PCA 800

         |    AUC |    MAP | NDCG@10 | NDCG@20 | NDCG@5 | Precision@10 | Precision@20 | Precision@5 | Train (s) | Test (s)
    ---- + ------ + ------ + ------- + ------- + ------ + ------------ + ------------ + ----------- + --------- + --------
    VBPR | 0.9515 | 0.1186 |  0.1369 |  0.1837 | 0.1557 |       0.1270 |       0.1055 |      0.1500 |  442.5003 |   0.2028


In [None]:
from scipy.sparse import coo_matrix
def _interaction_coords(interactions, user_index, item_index, skip_unknown=False):
    """Flatten {user_id: [book_id, ...]} into parallel row / column index lists.

    Parameters:
    -----------
    interactions : dict {external_user_id: list of external book_ids}
    user_index : dict {external_user_id: user_idx}
    item_index : dict {external_book_id: item_idx}
    skip_unknown : bool
        When True, users / books without an index are skipped. When False, a
        missing key raises KeyError.

    Returns:
    --------
    (list of int, list of int) : row (user) and column (item) indices, one entry
    per interaction.
    """
    row_idx, col_idx = [], []
    for user_id, book_ids in interactions.items():
        if skip_unknown and user_id not in user_index:
            continue
        u_id = int(user_index[user_id])  # looked up once per user
        for book_id in book_ids:
            if skip_unknown and book_id not in item_index:
                continue
            row_idx.append(u_id)
            col_idx.append(int(item_index[book_id]))
    return row_idx, col_idx


# Training interactions: strict lookups, every key is expected to be indexed.
rows, cols = _interaction_coords(user_interactions, userid2idx, bookid2idx)
data = [1] * len(rows)

# Test interactions: unknown users / books are skipped, as is done when
# test_data is built.
rows_test, cols_test = _interaction_coords(
    user_interactions_test, userid2idx, bookid2idx, skip_unknown=True
)
data_test = [1] * len(rows_test)

# Largest index seen in either split, so both matrices share one shape.
max_user_id = max(max(rows), max(rows_test, default=0))
max_item_id = max(max(cols), max(cols_test, default=0))
matrix_shape = (max_user_id + 1, max_item_id + 1)

# Build the sparse train matrix
user_item_matrix_train = coo_matrix((data, (rows, cols)), shape=matrix_shape)
print(user_item_matrix_train.shape)

test_users = list(set(rows_test))

# Build the sparse test matrix
user_item_matrix_test = coo_matrix((data_test, (rows_test, cols_test)), shape=matrix_shape)
print(user_item_matrix_test.shape)

(52821, 4287)
(52821, 4287)


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_ild_at_k(model, user_item_matrix_train, user_interactions_test,
                     userid2idx, book_embeddings, K=10, exclude_seen=False):
    """
    Compute the average intra-list diversity (ILD@K) of a model's recommendations.

    For each test user that has an entry in `userid2idx`, a top-K list is obtained
    from `model.recommend`, the pairwise cosine similarities between the
    embeddings of the recommended items are computed, and the user's ILD is the
    mean of (1 - similarity) over all distinct pairs. Users whose list has fewer
    than two items are skipped.

    Parameters:
    -----------
    model : trained recommender exposing `recommend(user_id, k=...)`
    user_item_matrix_train : scipy sparse matrix (users x items) or None
        Only read when `exclude_seen` is True. Its row / column indices must be
        the same IDs that `model.recommend` accepts and returns.
    user_interactions_test : dict {external_user_id: list of book_ids}
        Only the keys are used, to decide which users are evaluated.
    userid2idx : dict {external_user_id: user_idx passed to model.recommend}
    book_embeddings : np.ndarray [num_books x embedding_dim]
        Indexed directly with the item IDs returned by `model.recommend`.
    K : int
        Length of the recommendation list.
    exclude_seen : bool
        When False (default) the list is exactly `model.recommend(user_idx, k=K)`,
        which may contain items the user already interacted with in training.
        When True, the full ranking is requested (k=-1), items present in the
        user's row of `user_item_matrix_train` are dropped, and the first K
        remaining items are used.

    Returns:
    --------
    float : Average ILD@K across evaluated users (0.0 if none were evaluated)

    Raises:
    -------
    ValueError : if `exclude_seen` is True but no training matrix is given.
    """
    train_csr = None
    if exclude_seen:
        if user_item_matrix_train is None:
            raise ValueError("user_item_matrix_train is required when exclude_seen=True")
        # CSR gives direct access to the column indices of a single user row.
        train_csr = user_item_matrix_train.tocsr()

    ild_scores = []

    for user_id in user_interactions_test:
        if user_id not in userid2idx:
            continue

        user_idx = userid2idx[user_id]

        if train_csr is None:
            recommended = model.recommend(user_idx, k=K)
        else:
            seen = set()
            if user_idx < train_csr.shape[0]:
                start, stop = train_csr.indptr[user_idx], train_csr.indptr[user_idx + 1]
                seen = set(train_csr.indices[start:stop].tolist())
            ranked = model.recommend(user_idx, k=-1)
            recommended = [item for item in ranked if item not in seen][:K]

        # Diversity needs at least one pair of items.
        if len(recommended) < 2:
            continue

        emb = book_embeddings[recommended]
        sim_matrix = cosine_similarity(emb)
        # Strict upper triangle: each unordered pair once, no self-similarity.
        upper = np.triu_indices_from(sim_matrix, k=1)
        pairwise_sims = sim_matrix[upper]
        ild = np.mean(1 - pairwise_sims)
        ild_scores.append(ild)

    return np.mean(ild_scores) if ild_scores else 0.0


In [None]:
# Cutoffs at which the beyond-accuracy metrics are reported.
ks = [5, 10, 20]

for k in ks:
    ild_at_k = compute_ild_at_k(
        model, user_item_matrix_train, user_interactions_test,
        userid2idx, book_embeddings, K=k,
    )
    print(f"ILD@{k}: {ild_at_k}")



ILD@5: 0.9407193445612455
ILD@10: 0.9536970436213172
ILD@20: 0.9615498687261854


ILD@5: 0.9537838793579136
ILD@10: 0.9562709182431357
ILD@20: 0.9709029511553376

In [None]:
import numpy as np

def novelty_at_k(topk_items, pop_counter, max_pop, k):
    """
    Popularity-based novelty of the first `k` items of a recommendation list.

    Each item scores 1 - popularity / max_pop, so the most popular item scores 0
    and an item absent from `pop_counter` scores 1; the result is the mean score.

    Parameters:
    -----------
    topk_items : sequence of item keys, best first
    pop_counter : mapping {item key: interaction count}; it must use the same
        keys as `topk_items`, missing keys count as 0
    max_pop : number > 0 used to scale the counts
    k : int, number of leading items to consider

    Returns:
    --------
    np.float32 : mean novelty, or 0.0 when there are no items to score

    Raises:
    -------
    ValueError : if `max_pop` is not positive (the ratio would be undefined).
    """
    if max_pop <= 0:
        raise ValueError("max_pop must be a positive count")

    head = topk_items[:k]
    if len(head) == 0:
        # Avoid the NaN that the mean of an empty array would produce.
        return np.float32(0.0)

    pops = np.array([pop_counter.get(i, 0) for i in head], dtype=np.float32)
    return np.mean(1.0 - pops / max_pop)


def average_novelty(model, user_interactions_test, userid2idx,
                    pop_counter, max_pop, k=10):
    """
    Mean Novelty@K over the test users that have an entry in `userid2idx`.

    Parameters:
    -----------
    model : trained recommender exposing `recommend(user_id, k=...)`
    user_interactions_test : dict {external_user_id: list of book_ids}
        Only the keys are used, to decide which users are evaluated.
    userid2idx : dict {external_user_id: user_idx passed to model.recommend}
    pop_counter : mapping {item key: count}, looked up with the items returned
        by `model.recommend`
    max_pop : int, count used to scale popularity
    k : int, length of the recommendation list

    Returns:
    --------
    float : Average Novelty@K (0.0 when no test user could be evaluated)
    """
    # Lazily resolve the users that can be evaluated, in test-set order.
    evaluable_users = (
        userid2idx[external_id]
        for external_id in user_interactions_test
        if external_id in userid2idx
    )

    # One novelty score per evaluated user.
    per_user_novelty = [
        novelty_at_k(model.recommend(user_idx, k=k), pop_counter, max_pop, k)
        for user_idx in evaluable_users
    ]

    if not per_user_novelty:
        return 0.0
    return np.mean(per_user_novelty)


In [None]:
from collections import Counter

# Count how often each book occurs in the training data.
# The interaction lists hold external book_ids, but the novelty functions look
# popularity up by the item keys returned from model.recommend. Those are the
# integer item indices the model was trained on (see train_data). The counter
# is therefore keyed by bookid2idx[book_id]; keying it by external book_id
# would make the lookups miss or hit unrelated books.
all_train_pairs = [(u, i) for u, i_list in user_interactions.items() for i in i_list]
pop_counter = Counter(bookid2idx[i] for _, i in all_train_pairs)
max_pop = max(pop_counter.values())


In [None]:
# Report the average novelty at each cutoff in `ks`.
for k in ks:
    novelty_k = average_novelty(
        model, user_interactions_test, userid2idx,
        pop_counter, max_pop, k=k,
    )
    print(f"Novelty@{k}: {novelty_k}")

Novelty@5: 0.5914559960365295
Novelty@10: 0.6145246624946594
Novelty@20: 0.6508187651634216


Novelty@5: 0.4632469415664673
Novelty@10: 0.4868226647377014
Novelty@20: 0.5151946544647217