In [14]:
import pandas as pd
import os
import zipfile
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
from tqdm import tqdm
import numpy as np

In [15]:
!wget https://www.dropbox.com/s/57tel5zqopkssrh/books.csv?dl=0 -O books.csv
!wget https://www.dropbox.com/s/zpnnoy1i8ljf9fg/goodreads_bert_embeddings.npy?dl=0 -O goodreads_bert_embeddings.npy
!wget https://www.dropbox.com/s/a8hcc9w30y7r3jl/goodreads_bert_large_embeddings.npy?dl=0 -O goodreads_bert_large_embeddings.npy
!wget https://www.dropbox.com/s/dqeqpsr0vdvmcy0/goodreads_past_interactions.json?dl=0 -O goodreads_past_interactions.json
!wget https://www.dropbox.com/s/rjtzhmb2zbpp30q/goodreads_test_interactions.json?dl=0 -O goodreads_test_interactions.json

--2025-06-06 23:23:55--  https://www.dropbox.com/s/57tel5zqopkssrh/books.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/5s6xrfnu17yi34sfhmskb/books.csv?rlkey=ymzokbyqw3qq2bq5okfao9w1z&dl=0 [following]
--2025-06-06 23:23:55--  https://www.dropbox.com/scl/fi/5s6xrfnu17yi34sfhmskb/books.csv?rlkey=ymzokbyqw3qq2bq5okfao9w1z&dl=0
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucdd0eb389c7c5e670a2d163d3c6.dl.dropboxusercontent.com/cd/0/inline/CrIgHvJHDbFF2zs6bJS7C9Ec-cRKpN4b37jCS9bBoTsr0JTVSo8CYf1sjwqpHsgJcI4UuRNaIAgfHHr8JKR7VFQOf8xZmi0bsqxTnU63FHrBl2RdS-tkrgPOHETuX_ufdhVwryhbbNGKvunnMkGi08aP/file# [following]
--2025-06-06 23:23:55--  https://ucdd0eb389c7c5e670a2d163d3c6.dl.dropboxusercontent.com/cd/0/in

In [16]:
# Extract every file of the cover archive into the book_covers/ directory.
# NOTE(review): no visible cell downloads book_covers.zip — presumably it is uploaded manually; confirm.
with zipfile.ZipFile("book_covers.zip", mode='r') as covers_archive:
    covers_archive.extractall(path="book_covers")

In [17]:
# Load the book metadata table from books.csv into a DataFrame.
books_df = pd.read_csv("books.csv")

In [18]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load an ImageNet-pretrained ResNet50. `pretrained=True` is deprecated since
# torchvision 0.13; IMAGENET1K_V1 selects the same weights that flag used to load.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Replace the classification layer with an identity so the forward pass
# returns the feature vector that used to feed the classifier.
resnet.fc = torch.nn.Identity()
resnet = resnet.to(device)
# Evaluation mode; as the last expression of the cell this also displays the model.
resnet.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [19]:
# Per-channel mean / standard deviation handed to Normalize.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every cover: resize to 64x64, convert to a tensor,
# then normalise each channel with the statistics above.
_preprocessing_steps = [
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

In [20]:
image_folder = "book_covers/book_covers"
# Length of the vector ResNet50 produces once its fc layer is an identity.
EMBEDDING_DIM = 2048
embeddings = []

# Covers are looked up by row position: row i of books_df -> book_{i}.jpg
for idx in tqdm(range(len(books_df))):
    image_path = os.path.join(image_folder, f"book_{idx}.jpg")

    try:
        # The context manager releases the file handle as soon as the RGB copy exists.
        with Image.open(image_path) as cover:
            image = cover.convert("RGB")
        # unsqueeze(0) adds the batch dimension the model expects.
        image_tensor = transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = resnet(image_tensor)
        embeddings.append(embedding.cpu().numpy().squeeze())
    except Exception as e:
        # Deliberate best effort: a missing or unreadable cover must not abort the run.
        # The fallback is float32 like the real embeddings; the default float64 zeros
        # would silently upcast the whole matrix. Note that an all-zero row has
        # cosine similarity 0 with every other book.
        print(f"Error with image {idx}: {e}")
        embeddings.append(np.zeros(EMBEDDING_DIM, dtype=np.float32))  # fallback

# Convert to a single (n_books, EMBEDDING_DIM) array and save it for future use
image_embeddings = np.array(embeddings, dtype=np.float32)
np.save("resnet_book_embeddings.npy", image_embeddings)

100%|██████████| 4287/4287 [02:35<00:00, 27.53it/s]


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all cover embeddings (one row and one column per book).
similarity_matrix = cosine_similarity(image_embeddings)

# Recommend similar books based on the cosine similarity of their cover embeddings
def recommend_books(book_index, top_k=10):
    """Return the ``top_k`` books most similar to the book at ``book_index``.

    Args:
        book_index: Row position of the query book in ``books_df`` and
            ``similarity_matrix``.
        top_k: Number of recommendations to return.

    Returns:
        The ``title``, ``authors`` and ``average_rating`` columns of the
        selected ``books_df`` rows, ordered from most to least similar.
    """
    # Work on a copy: indexing one row of the matrix yields a view, so writing
    # into it would permanently overwrite the diagonal of the shared
    # similarity_matrix on every call.
    sim_scores = similarity_matrix[book_index].copy()

    # Exclude the query book itself; -inf ranks below every real similarity
    # (a plain -1 is still a valid cosine value).
    sim_scores[book_index] = -np.inf

    # Indices of the top_k highest scores, best first
    top_indices = sim_scores.argsort()[::-1][:top_k]

    return books_df.iloc[top_indices][["title", "authors", "average_rating"]]

In [22]:
# Row position (used with iloc), not a value of the book_id column.
book_id = 10
print("Libro seleccionado:")
print(books_df.iloc[book_id][["title", "authors", "average_rating"]])

# "\L" is not a valid escape sequence and was printed literally; a newline was intended.
print("\nLibros recomendados:")
recommend_books(book_id, 10)

Libro seleccionado:
title             Divergent (Divergent, #1)
authors                       Veronica Roth
average_rating                         4.24
Name: 10, dtype: object
\Libros recomendados:


Unnamed: 0,title,authors,average_rating
1539,"Monsters of Men (Chaos Walking, #3)",Patrick Ness,4.25
730,"Four: A Divergent Story Collection (Divergent,...",Veronica Roth,4.1
1248,"The Young Elites (The Young Elites, #1)",Marie Lu,3.93
2025,"The Thousand-Dollar Tan Line (Veronica Mars, #1)","Rob Thomas, Jennifer Graham",3.99
153,"Dead Until Dark (Sookie Stackhouse, #1)",Charlaine Harris,3.96
781,The Amityville Horror,Jay Anson,3.82
976,Flight Behavior,Barbara Kingsolver,3.75
3074,His Last Bow: 8 Stories,Arthur Conan Doyle,4.27
305,Breakfast of Champions,Kurt Vonnegut Jr.,4.08
438,The Absolutely True Diary of a Part-Time Indian,"Sherman Alexie, Ellen Forney",4.11


In [23]:
# Sanity check: show the shape of the embedding matrix, then the first embedding vector.
print(image_embeddings.shape, image_embeddings[0], sep="\n")

(4287, 2048)
[0.00362244 0.         0.82107866 ... 0.         0.1243109  0.26179948]


In [24]:
import json

def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the resulting object."""
    with open(path) as handle:
        return json.load(handle)


# The "past" interactions serve as the training split, the "test" file as the held-out split.
train_interactions = _read_json("goodreads_past_interactions.json")
test_interactions = _read_json("goodreads_test_interactions.json")

In [25]:
# Two-way lookup between the values of books_df.book_id and their row positions.
idx2bookid = dict(enumerate(books_df.book_id))
bookid2idx = {book: position for position, book in idx2bookid.items()}

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Average the embeddings of the books a user interacted with and recommend from that profile
def recommend_for_user(user_books, top_k=10):
    """Recommend books whose cover embedding is closest to a user's mean embedding.

    Args:
        user_books: Iterable of book ids the user already interacted with.
            Ids that are not keys of ``bookid2idx`` are ignored.
        top_k: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_k`` book ids, ordered from most to least
        similar, that never contains a book from ``user_books``. The list is
        empty when none of ``user_books`` is known.
    """
    # Resolve the known ids once; the same indices feed both the profile and the mask.
    seen_indices = [bookid2idx[b] for b in user_books if b in bookid2idx]

    if not seen_indices:
        return []

    user_profile = image_embeddings[seen_indices].mean(axis=0).reshape(1, -1)
    similarities = cosine_similarity(user_profile, image_embeddings).flatten()

    # Flag the books the user already interacted with. Filtering with a mask,
    # rather than overwriting their score with a sentinel, guarantees they are
    # never returned even when top_k exceeds the number of unseen books.
    seen_mask = np.zeros(len(similarities), dtype=bool)
    seen_mask[seen_indices] = True

    ranked = similarities.argsort()[::-1]
    top_indices = ranked[~seen_mask[ranked]][:top_k]
    return [idx2bookid[i] for i in top_indices]

In [27]:
# Evaluation metrics
def precision_at_k(preds, truth, k):
    """Share of the first ``k`` predictions that are relevant.

    Args:
        preds: Ranked sequence of predicted items.
        truth: Collection of relevant items.
        k: Cut-off; the hit count is divided by ``k`` itself, even when fewer
            than ``k`` predictions are available.

    Returns:
        Number of distinct relevant items among ``preds[:k]`` divided by ``k``.
    """
    top_k = set(preds[:k])
    hits = top_k.intersection(truth)
    return len(hits) / k

def recall_at_k(preds, truth, k):
    """Share of the relevant items that appear in the first ``k`` predictions.

    Args:
        preds: Ranked sequence of predicted items.
        truth: Collection of relevant items; duplicates count once.
        k: Cut-off applied to ``preds``.

    Returns:
        Distinct relevant items found in ``preds[:k]`` divided by the number
        of distinct relevant items, or 0 when ``truth`` is empty.
    """
    if not truth:
        return 0
    # Deduplicate so the denominator matches the set-based hit count; using
    # len(truth) would cap recall below 1 whenever truth repeats an item.
    relevant = set(truth)
    return len(set(preds[:k]) & relevant) / len(relevant)

def ndcg_at_k(preds, truth, k):
    """Normalised discounted cumulative gain at ``k`` with binary relevance.

    Args:
        preds: Ranked sequence of predicted items.
        truth: Collection of relevant items; duplicates count once.
        k: Cut-off applied to ``preds``.

    Returns:
        DCG of ``preds[:k]`` divided by the DCG of an ideal ranking, a value
        in [0, 1]; 0 when there is no relevant item or ``k`` is 0.
    """
    relevant = set(truth)  # constant-time membership tests, duplicates collapsed
    credited = set()
    dcg = 0.0
    for rank, p in enumerate(preds[:k]):
        # Credit each relevant item only the first time it shows up; a repeated
        # prediction would otherwise push the score above 1.
        if p in relevant and p not in credited:
            credited.add(p)
            dcg += 1 / np.log2(rank + 2)
    # The ideal ranking places every distinct relevant item first.
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))
    return dcg / idcg if idcg > 0 else 0

def map_at_k(preds, truth, k):
    """Average precision at ``k`` for one ranked list.

    Args:
        preds: Ranked sequence of predicted items.
        truth: Collection of relevant items; duplicates count once.
        k: Cut-off applied to ``preds``.

    Returns:
        Sum of the precision values at each rank holding a relevant item,
        divided by ``min(number of distinct relevant items, k)``; 0 when
        ``truth`` is empty or ``k`` is 0.
    """
    if not truth:
        return 0
    relevant = set(truth)  # constant-time membership tests, duplicates collapsed
    credited = set()
    score = 0.0
    hits = 0
    for rank, p in enumerate(preds[:k], start=1):
        # Credit each relevant item once; a repeated prediction would otherwise
        # inflate the score beyond 1.
        if p in relevant and p not in credited:
            credited.add(p)
            hits += 1
            score += hits / rank
    denominator = min(len(relevant), k)
    # k == 0 leaves nothing to rank; return 0 instead of dividing by zero.
    return score / denominator if denominator > 0 else 0

In [30]:
K = 10

# Metric name -> scoring function; each is called as fn(recommendations, test_books, K).
metric_fns = {
    "precision": precision_at_k,
    "recall": recall_at_k,
    "ndcg": ndcg_at_k,
    "map": map_at_k,
}
# One list of per-user scores for every metric.
metrics = {metric_name: [] for metric_name in metric_fns}

users_evaluated = 0

for user, past_books in train_interactions.items():
    test_books = test_interactions.get(user, [])
    # Only users with at least one held-out interaction can be scored.
    if test_books:
        recommendations = recommend_for_user(past_books, top_k=K)

        for metric_name, metric_fn in metric_fns.items():
            metrics[metric_name].append(metric_fn(recommendations, test_books, K))

        users_evaluated += 1

# Print the average of every metric over the evaluated users
print(f"Se evaluaron {users_evaluated} usuarios en total.\n")
for name, scores in metrics.items():
    print(f"{name.upper()}@{K}: {np.mean(scores):.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       0.3754183], dtype=float32), array([0.        , 0.4099214 , 0.50665355, ..., 0.24102816, 0.12368713,
       0.55204254], dtype=float32), array([0.        , 0.12002135, 0.57231903, ..., 1.0070345 , 0.        ,
       1.1893115 ], dtype=float32), array([0.        , 0.32854724, 0.2001894 , ..., 0.26274717, 0.        ,
       0.08688491], dtype=float32), array([4.4973339e-03, 5.1574211e+00, 0.0000000e+00, ..., 0.0000000e+00,
       0.0000000e+00, 5.7459455e+00], dtype=float32), array([0.        , 2.052932  , 0.00889368, ..., 0.        , 0.        ,
       0.        ], dtype=float32), array([0.       , 0.       , 0.       , ..., 0.3647148, 4.704932 ,
       0.       ], dtype=float32), array([0.16899459, 0.8027674 , 0.5016459 , ..., 0.000845  , 0.        ,
       0.19864911], dtype=float32), array([0.36229822, 2.7653048 , 0.        , ..., 0.        , 0.13091502,
       0.19025283], dtype=float32), array([0.09263193, 0.704