### Configuración Inicial

In [None]:
!pip uninstall -y numpy
!pip install numpy==1.26

Found existing installation: numpy 1.26.0
Uninstalling numpy-1.26.0:
  Successfully uninstalled numpy-1.26.0
Collecting numpy==1.26
  Using cached numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Using cached numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.9 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.0 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which 

In [None]:
!pip install scikit-surprise --no-build-isolation --no-deps
!pip install memory_profiler

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2708559 sha256=d7db1dc5aa9b3c7bd8859f156f388cedc6e64e5e2895f6d51834d4bbaeeb9a2b
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


### Importación de Librerías

In [None]:
import time
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from memory_profiler import memory_usage
import itertools
import scipy.sparse as sparse
import random
import gdown
from surprise import SVDpp, Dataset, Reader, accuracy
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader

### Importación de los Datos

In [None]:
# Download the training and validation rating tables from Google Drive by file id.
# The second call is the cell's last expression, so the notebook also displays its
# return value (the output file name).
gdown.download(id='1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx', output='training_ratings.csv', quiet=False)
gdown.download(id='1oHo9HLB6SzeqZs76FCkfQ1irSQepqp16', output='validation_ratings.csv', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx
From (redirected): https://drive.google.com/uc?id=1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx&confirm=t&uuid=d9255e43-8d7e-4abb-8fac-86b43f8b9f6d
To: /content/training_ratings.csv
100%|██████████| 205M/205M [00:01<00:00, 142MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oHo9HLB6SzeqZs76FCkfQ1irSQepqp16
To: /content/validation_ratings.csv
100%|██████████| 64.4M/64.4M [00:00<00:00, 140MB/s]


'validation_ratings.csv'

In [None]:
# Load the downloaded rating tables. Later cells read the 'user', 'item' and
# 'rating' columns from both frames.
df_train = pd.read_csv('training_ratings.csv')
df_val = pd.read_csv('validation_ratings.csv')

In [None]:
# Mechanics dataset: download the table from Google Drive and load it.
# The preprocessing cell re-reads this file and indexes it by 'BGGId'.
gdown.download(id='1cVGSLNVqxrAoKzeqxt_FfQ4Ggs9VvCDO', output='mechanics.csv', quiet=False)
df_mechanics = pd.read_csv('mechanics.csv')

Downloading...
From: https://drive.google.com/uc?id=1cVGSLNVqxrAoKzeqxt_FfQ4Ggs9VvCDO
To: /content/mechanics.csv
100%|██████████| 7.05M/7.05M [00:00<00:00, 232MB/s]


### Preprocesamiento de Datos

In [None]:
# Reload the mechanics table and key it by BGGId so per-item lookups go through
# the index. Re-reading the file keeps this cell safe to run more than once.
df_mechanics = pd.read_csv('mechanics.csv').set_index('BGGId')
print("Datos de mecánicas cargados y listos.")

# --- Item popularity ---
# Counted on the full training frame (df_train), as it is at this point in the
# notebook, to get a global popularity measure.
item_popularity = df_train['item'].value_counts().to_dict()
total_interactions = len(df_train)

# Turn raw counts into probabilities; the novelty metric takes -log2 of these.
item_popularity_prob = dict(
    (item_id, count / total_interactions)
    for item_id, count in item_popularity.items()
)
print(f"Popularidad calculada para {len(item_popularity)} ítems.")

Datos de mecánicas cargados y listos.
Popularidad calculada para 16748 ítems.


In [None]:
def novelty_at_k(group, k, popularity_prob):
    """Novelty@K of a single user/group.

    Ranks the rows of ``group`` by 'score' (descending), keeps the first ``k``
    and averages the self-information ``-log2(p)`` of their 'itemID' values.

    Args:
        group: DataFrame with at least the columns 'score' and 'itemID'.
        k: number of top-ranked rows to consider.
        popularity_prob: mapping item id -> popularity probability.

    Returns:
        The mean of ``-log2(p)`` over the top-k items, or 0.0 when there are none.
    """
    ranked = group.sort_values('score', ascending=False)

    # Items missing from the popularity mapping fall back to a very small probability.
    surprisals = [
        -np.log2(popularity_prob.get(item_id, 1e-6))
        for item_id in ranked.head(k)['itemID']
    ]

    if not surprisals:
        return 0.0
    return np.mean(surprisals)

def diversity_at_k(group, k, mechanics_df):
    """Diversity@K (intra-list diversity) of a single user/group.

    Ranks the rows of ``group`` by 'score' (descending), keeps the first ``k``
    items, drops those that have no row in ``mechanics_df`` and returns the mean
    cosine dissimilarity (1 - cosine similarity) over all unordered pairs of the
    remaining item vectors.

    Args:
        group: DataFrame with at least the columns 'score' and 'itemID'.
        k: number of top-ranked rows to consider.
        mechanics_df: DataFrame indexed by item id whose rows are the item vectors.

    Returns:
        Mean pairwise dissimilarity as a float, or 0.0 when fewer than two of
        the top-k items have mechanics data.
    """
    ranked = group.sort_values('score', ascending=False)

    # Only items with a row in the mechanics table can be compared.
    known_items = [
        item for item in ranked.head(k)['itemID'].tolist()
        if item in mechanics_df.index
    ]
    if len(known_items) < 2:
        return 0.0

    item_vectors = mechanics_df.loc[known_items].values

    # One pairwise call yields the full similarity matrix; the strict upper
    # triangle holds each unordered pair exactly once. This replaces one
    # cosine_similarity call per pair inside a Python double loop.
    similarity = cosine_similarity(item_vectors)
    pair_rows, pair_cols = np.triu_indices(len(item_vectors), k=1)
    return float(np.mean(1.0 - similarity[pair_rows, pair_cols]))

def train_mf_model(df_train, n_factors=50, n_epochs=20, lr_all=0.005,
                   reg_all=0.02, random_state=42):
    """Train a Surprise SVD++ (``SVDpp``) matrix-factorization model.

    The rating scale handed to Surprise is taken from the minimum and maximum of
    the 'rating' column, and the model is fitted on the full trainset built from
    the frame (no internal hold-out).

    Args:
        df_train: DataFrame with the columns 'user', 'item' and 'rating'.
        n_factors: number of latent factors passed to ``SVDpp``.
        n_epochs: number of training epochs passed to ``SVDpp``.
        lr_all: learning rate passed to ``SVDpp`` as ``lr_all``.
        reg_all: regularization term passed to ``SVDpp`` as ``reg_all``.
        random_state: seed passed to ``SVDpp``.

    Returns:
        The fitted ``SVDpp`` instance.
    """
    min_rating = df_train['rating'].min()
    max_rating = df_train['rating'].max()
    reader = Reader(rating_scale=(min_rating, max_rating))

    # Surprise expects the columns in (user, item, rating) order.
    data = Dataset.load_from_df(df_train[['user', 'item', 'rating']], reader)
    trainset = data.build_full_trainset()

    algo = SVDpp(
        n_factors=n_factors,
        n_epochs=n_epochs,
        lr_all=lr_all,
        reg_all=reg_all,
        random_state=random_state
    )
    algo.fit(trainset)
    return algo


In [None]:
# Keep only the first row of every (user, item) pair in both splits.
df_train = df_train.drop_duplicates(subset=['user', 'item'])
df_val = df_val.drop_duplicates(subset=['user', 'item'])

In [None]:
print(f"Tamaño original del training set: {len(df_train)}")

# Both splits are sub-sampled because the full data takes too long to process.
# The requested sizes are capped at the frame length: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
TRAIN_SAMPLE_SIZE = 10_000_000
VAL_SAMPLE_SIZE = 500_000

df_train_sample = df_train.sample(n=min(TRAIN_SAMPLE_SIZE, len(df_train)), random_state=42)
print(f"Tamaño del nuevo training set (muestra): {len(df_train_sample)}")

df_val_sample = df_val.sample(n=min(VAL_SAMPLE_SIZE, len(df_val)), random_state=42)
print(f"Tamaño del nuevo validation set (muestra): {len(df_val_sample)}")

Tamaño original del training set: 10200445
Tamaño del nuevo training set (muestra): 10000000
Tamaño del nuevo validation set (muestra): 500000


In [None]:
# Fit the model built by train_mf_model on the sampled training ratings.
mf_algo = train_mf_model(df_train_sample)

In [None]:
def build_eval_df_mf(df_val_sample, algo, relevance_threshold=7):
    """Build the evaluation frame scored by the MF model.

    The input columns 'user' and 'item' are renamed to 'userID' and 'itemID', a
    binary 'label' column marks ratings at or above ``relevance_threshold``, and
    'score' holds ``algo.predict(userID, itemID).est`` for every row.

    Args:
        df_val_sample: DataFrame with the columns 'user', 'item' and 'rating'.
            It is not modified.
        algo: object exposing ``predict(user_id, item_id)`` whose result has an
            ``est`` attribute (e.g. a fitted Surprise algorithm).
        relevance_threshold: minimum rating for a row to get label 1.

    Returns:
        A new DataFrame with the renamed id columns plus 'label' and 'score',
        keeping the index of ``df_val_sample``.
    """
    df_eval_mf = df_val_sample.rename(columns={'user': 'userID', 'item': 'itemID'})

    # Relevance label: 1 when the rating reaches the threshold, else 0.
    df_eval_mf['label'] = (df_eval_mf['rating'] >= relevance_threshold).astype(int)

    # Iterate the two id columns directly. DataFrame.apply(axis=1) materialises
    # a Series for every row, which dominates the runtime on large frames, and
    # it does not give back a plain column for an empty frame.
    scores = [
        algo.predict(user_id, item_id).est
        for user_id, item_id in zip(df_eval_mf['userID'], df_eval_mf['itemID'])
    ]
    df_eval_mf['score'] = pd.Series(scores, index=df_eval_mf.index, dtype=float)

    return df_eval_mf


# Measure the wall-clock time (in seconds) of scoring the validation sample
# with the trained model; the resulting frame is kept in df_eval_mf.
start_time = time.time()
df_eval_mf = build_eval_df_mf(df_val_sample, mf_algo)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time:.2f} segundos")

Tiempo de ejecución: 146.34 segundos


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

def rmse_mae_mf(df_eval_mf):
    """Return ``(rmse, mae)`` between the 'rating' and 'score' columns.

    Args:
        df_eval_mf: DataFrame with the true values in 'rating' and the model
            predictions in 'score'.

    Returns:
        Tuple of the root of the mean squared error and the mean absolute error.
    """
    actual = df_eval_mf['rating'].values
    predicted = df_eval_mf['score'].values
    return (
        math.sqrt(mean_squared_error(actual, predicted)),
        mean_absolute_error(actual, predicted),
    )

# Rating-prediction error of the MF scores against the true ratings in df_eval_mf.
rmse_mf, mae_mf = rmse_mae_mf(df_eval_mf)
print("MF - RMSE:", rmse_mf)
print("MF - MAE:", mae_mf)


MF - RMSE: 1.2515531550251984
MF - MAE: 0.9479160424083248


In [None]:
# --- Funciones de métrica de ranking (puedes moverlas si ya las tienes en otra celda) ---
def precision_recall_at_k(group, k):
    """Precision@K and Recall@K of a single user/group.

    Rows are ranked by 'score' (descending); hits are the sum of 'label' over
    the first ``k`` rows.

    Args:
        group: DataFrame with the columns 'score' and 'label'.
        k: cut-off of the ranked list.

    Returns:
        Tuple ``(precision, recall)``: hits divided by ``k`` (0 when ``k`` is
        not positive) and hits divided by the total 'label' sum of the group
        (0 when that sum is not positive).
    """
    ranked = group.sort_values('score', ascending=False)
    hits = ranked.head(k)['label'].sum()
    total_relevant = ranked['label'].sum()

    if k > 0:
        precision = hits / k
    else:
        precision = 0

    if total_relevant > 0:
        recall = hits / total_relevant
    else:
        recall = 0

    return precision, recall

def ndcg_at_k(group, k):
    """nDCG@K of a single user/group.

    The whole candidate list is handed to ``ndcg_score`` together with ``k=k``,
    so the ideal DCG is computed from every relevant item of the group. Cutting
    the list to the top ``k`` rows before the call normalises only against the
    items that were retrieved, which inflates the score, and makes the result
    always 0 for ``k == 1``.

    Args:
        group: DataFrame with the columns 'label' (relevance) and 'score'.
        k: cut-off of the ranked list.

    Returns:
        nDCG@K as returned by ``ndcg_score``; 0.0 when the group has no relevant
        item or fewer than two candidates.
    """
    if group['label'].sum() == 0:
        return 0.0
    # A single candidate cannot be ranked; keep the 0.0 convention for that case
    # rather than calling ndcg_score with one document.
    if len(group) < 2:
        return 0.0
    true_relevance = np.asarray([group['label'].values])
    predicted_scores = np.asarray([group['score'].values])
    return ndcg_score(true_relevance, predicted_scores, k=k)

## Individuales

In [None]:
from sklearn.metrics import ndcg_score

K_values = [10]
individual_results_mf = []
print("Calculando métricas de ranking individuales para MF...")

# df_eval_mf already carries everything the metrics read: userID, itemID, label, score.
grouped_users = df_eval_mf.groupby('userID')

for k in K_values:
    # One explicit pass over the users computes all five metrics per user,
    # instead of four separate groupby.apply passes over the same groups.
    per_user_metrics = [
        (
            *precision_recall_at_k(user_rows, k),
            ndcg_at_k(user_rows, k),
            novelty_at_k(user_rows, k, item_popularity_prob),
            diversity_at_k(user_rows, k, df_mechanics),
        )
        for _, user_rows in grouped_users
    ]

    # Rows are users, columns are (precision, recall, ndcg, novelty, diversity).
    # The reshape keeps the 5-column layout even when there are no users.
    metric_means = np.nanmean(
        np.asarray(per_user_metrics, dtype=float).reshape(-1, 5), axis=0
    )
    avg_precision, avg_recall, avg_ndcg, avg_novelty, avg_diversity = metric_means

    individual_results_mf.append({
        'K': k,
        'Precision@K': avg_precision,
        'Recall@K': avg_recall,
        'nDCG@K': avg_ndcg,
        'Novelty@K': avg_novelty,
        'Diversity@K': avg_diversity
    })

individual_results_mf_df = pd.DataFrame(individual_results_mf)
print("\n--- Resultados de Evaluación Individual (MF) ---")
print(individual_results_mf_df)

Calculando métricas de ranking individuales para MF...


  metrics = grouped_users.apply(lambda x: precision_recall_at_k(x, k))
  avg_ndcg = grouped_users.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_users.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()



--- Resultados de Evaluación Individual (MF) ---
    K  Precision@K  Recall@K    nDCG@K  Novelty@K  Diversity@K
0  10     0.205777  0.867373  0.525121    10.9902     0.494416


  avg_diversity = grouped_users.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()


## Grupales

In [None]:
from sklearn.metrics import ndcg_score

# Requires df_eval_mf from the cells above, with the columns userID, itemID,
# rating, label and the MF-predicted score.

print("\nCreando grupos sintéticos (MF)...")
# Only users with at least 10 rows in the evaluation frame are eligible.
user_counts = df_eval_mf['userID'].value_counts()
valid_users = user_counts[user_counts >= 10].index.tolist()

# Fixed seed so the same groups are drawn on every run.
np.random.seed(42)
num_groups = 1000
group_size = 4
if len(valid_users) < group_size * num_groups:
    print(f"Advertencia: No hay suficientes usuarios únicos ({len(valid_users)}) para crear {num_groups} grupos sin reemplazo. Se crearán menos grupos.")
    num_groups = len(valid_users) // group_size

# Members are distinct within a group (replace=False), but every group is an
# independent draw, so the same user may appear in more than one group.
groups = [np.random.choice(valid_users, group_size, replace=False) for _ in range(num_groups)]
print(f"Se crearon {len(groups)} grupos sintéticos de tamaño {group_size}.")


print("\nAgregando predicciones MF para cada grupo...")
all_group_recs = []
for group_id, user_ids in enumerate(groups):
    group_predictions = df_eval_mf.loc[
        df_eval_mf['userID'].isin(user_ids), ['itemID', 'score', 'label']
    ]
    # Boolean helper column so the "every member labelled it 1" rule can use the
    # built-in 'all' aggregation instead of a Python lambda per item.
    group_predictions = group_predictions.assign(
        is_relevant=group_predictions['label'].eq(1)
    )
    item_scores_per_group = group_predictions.groupby('itemID').agg(
        avg_score=('score', 'mean'),
        min_score=('score', 'min'),
        max_score=('score', 'max'),
        group_label=('is_relevant', 'all')
    ).reset_index()
    # Back to 1/0, the encoding the ranking metrics sum over.
    item_scores_per_group['group_label'] = item_scores_per_group['group_label'].astype(int)
    item_scores_per_group['group_id'] = group_id
    all_group_recs.append(item_scores_per_group)

if all_group_recs:
    df_group_eval_mf = pd.concat(all_group_recs, ignore_index=True)
else:
    # pd.concat raises on an empty list; with no groups, keep an empty frame
    # that still has the expected columns.
    df_group_eval_mf = pd.DataFrame(
        columns=['itemID', 'avg_score', 'min_score', 'max_score', 'group_label', 'group_id']
    )
print("Agregación completada.")


# --- Evaluate every aggregation strategy with all metrics ---
# Maps the strategy name to the aggregated score column it ranks by.
strategies = {
    'Average': 'avg_score',
    'Least Misery': 'min_score',
    'Most Pleasure': 'max_score'
}

group_results_mf = []
K_values = [10]

for strategy_name, score_column in strategies.items():
    print(f"\nEvaluando estrategia (MF): {strategy_name}...")
    # Expose the strategy's column as 'score' and the group label as 'label',
    # the names the metric functions read.
    df_strategy_eval = df_group_eval_mf[
        ['group_id', 'itemID', 'group_label', score_column]
    ].rename(columns={'group_label': 'label', score_column: 'score'})

    grouped_strategy = df_strategy_eval.groupby('group_id')

    for k in K_values:
        # One explicit pass over the groups computes all five metrics per group,
        # instead of four separate groupby.apply passes over the same groups.
        per_group_metrics = [
            (
                *precision_recall_at_k(group_rows, k),
                ndcg_at_k(group_rows, k),
                novelty_at_k(group_rows, k, item_popularity_prob),
                diversity_at_k(group_rows, k, df_mechanics),
            )
            for _, group_rows in grouped_strategy
        ]

        # Rows are groups, columns are (precision, recall, ndcg, novelty, diversity).
        # The reshape keeps the 5-column layout even when there are no groups.
        metric_means = np.nanmean(
            np.asarray(per_group_metrics, dtype=float).reshape(-1, 5), axis=0
        )
        avg_precision, avg_recall, avg_ndcg, avg_novelty, avg_diversity = metric_means

        group_results_mf.append({
            'Model': 'MF',
            'Strategy': strategy_name,
            'K': k,
            'Precision@K': avg_precision,
            'Recall@K': avg_recall,
            'nDCG@K': avg_ndcg,
            'Novelty@K': avg_novelty,
            'Diversity@K': avg_diversity
        })

group_results_mf_df = pd.DataFrame(group_results_mf)
print("\n--- Resultados de Evaluación Grupal para MF ---")
print(group_results_mf_df)



Creando grupos sintéticos (MF)...
Se crearon 1000 grupos sintéticos de tamaño 4.

Agregando predicciones MF para cada grupo...
Agregación completada.

Evaluando estrategia (MF): Average...


  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
  avg_ndcg = grouped_strategy.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strategy.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()
  avg_diversity = grouped_strategy.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()



Evaluando estrategia (MF): Least Misery...


  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
  avg_ndcg = grouped_strategy.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strategy.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()
  avg_diversity = grouped_strategy.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()



Evaluando estrategia (MF): Most Pleasure...


  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
  avg_ndcg = grouped_strategy.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strategy.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()



--- Resultados de Evaluación Grupal para MF ---
  Model       Strategy   K  Precision@K  Recall@K    nDCG@K  Novelty@K  \
0    MF        Average  10       0.8958  0.257892  0.972665  10.631384   
1    MF   Least Misery  10       0.8965  0.258081  0.972729  10.642940   
2    MF  Most Pleasure  10       0.8907  0.256371  0.970726  10.619368   

   Diversity@K  
0     0.839455  
1     0.839712  
2     0.839604  


  avg_diversity = grouped_strategy.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()
