### Configuración Inicial

In [1]:
!pip uninstall -y numpy
!pip install numpy==1.26

Found existing installation: numpy 1.26.0
Uninstalling numpy-1.26.0:
  Successfully uninstalled numpy-1.26.0
Collecting numpy==1.26
  Using cached numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Using cached numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.9 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.0 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which 

In [2]:
!pip install scikit-surprise --no-build-isolation --no-deps
!pip install memory_profiler

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2708544 sha256=7a63c02253e4d6d4929678087eb7c0361523a88961c4b23451ac061f4643c5a7
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


### Instalación de Librerías

In [3]:
import time
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from memory_profiler import memory_usage
import itertools
import scipy.sparse as sparse
import random
import gdown
from surprise import SVDpp, Dataset, Reader, accuracy
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader

### Importación de los Datos

In [4]:
gdown.download(id='1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx', output='training_ratings.csv', quiet=False)
gdown.download(id='1oHo9HLB6SzeqZs76FCkfQ1irSQepqp16', output='validation_ratings.csv', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx
From (redirected): https://drive.google.com/uc?id=1eGDDR1wlvR99eoCZG2owChy2dhkPp4yx&confirm=t&uuid=c48e1823-e1b7-40f9-8b85-9e46fe35df66
To: /content/training_ratings.csv
100%|██████████| 205M/205M [00:02<00:00, 68.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oHo9HLB6SzeqZs76FCkfQ1irSQepqp16
To: /content/validation_ratings.csv
100%|██████████| 64.4M/64.4M [00:01<00:00, 63.4MB/s]


'validation_ratings.csv'

In [5]:
df_train = pd.read_csv('training_ratings.csv')
df_val = pd.read_csv('validation_ratings.csv')

In [6]:
# dataset mechanics
gdown.download(id='1cVGSLNVqxrAoKzeqxt_FfQ4Ggs9VvCDO', output='mechanics.csv', quiet=False)
df_mechanics = pd.read_csv('mechanics.csv')

Downloading...
From: https://drive.google.com/uc?id=1cVGSLNVqxrAoKzeqxt_FfQ4Ggs9VvCDO
To: /content/mechanics.csv
100%|██████████| 7.05M/7.05M [00:00<00:00, 55.6MB/s]


### Preprocesamiento de Datos

In [7]:
df_mechanics = pd.read_csv('mechanics.csv')
# Usamos BGGId como índice para que la búsqueda sea rápida
df_mechanics.set_index('BGGId', inplace=True)
print("Datos de mecánicas cargados y listos.")

# --- Calcular la popularidad de los ítems ---
# Usamos el dataframe de entrenamiento COMPLETO (df_train) para obtener una
# medida de popularidad global y precisa.
item_popularity = df_train['item'].value_counts().to_dict()
total_interactions = len(df_train)

# Convertimos las cuentas en probabilidades para el cálculo de novedad
item_popularity_prob = {item_id: count / total_interactions for item_id, count in item_popularity.items()}
print(f"Popularidad calculada para {len(item_popularity)} ítems.")

Datos de mecánicas cargados y listos.
Popularidad calculada para 16748 ítems.


In [8]:
print(f"Tamaño original del training set: {len(df_train)}")

# se obtiene un sample debido a que hay muchos datos y se demora mucho
df_train_sample = df_train.sample(n=10000000, random_state=42)
print(f"Tamaño del nuevo training set (muestra): {len(df_train_sample)}")

# se obtiene un sample debido a que hay muchos datos y se demora mucho
df_val_sample = df_val.sample(n=500000, random_state=42)
print(f"Tamaño del nuevo validation set (muestra): {len(df_val_sample)}")

Tamaño original del training set: 10211218
Tamaño del nuevo training set (muestra): 10000000
Tamaño del nuevo validation set (muestra): 500000


In [9]:
from sklearn.metrics import ndcg_score

def novelty_at_k(group, k, popularity_prob):
    """Calcula la Novedad@K para un solo usuario/grupo."""
    group = group.sort_values('score', ascending=False)
    topk_items = group.head(k)['itemID']

    novelty_scores = []
    for item_id in topk_items:
        # Si un ítem no está en el diccionario de popularidad, se le asigna una probabilidad muy baja
        prob = popularity_prob.get(item_id, 1e-6)
        novelty_scores.append(-np.log2(prob))

    return np.mean(novelty_scores) if novelty_scores else 0.0

def diversity_at_k(group, k, mechanics_df):
    """Calcula la Diversidad@K (Intra-List Diversity) para un solo usuario/grupo."""
    group = group.sort_values('score', ascending=False)
    topk_items = group.head(k)['itemID'].tolist()

    # Nos aseguramos de que los ítems recomendados tengan datos de mecánicas
    topk_items = [item for item in topk_items if item in mechanics_df.index]

    if len(topk_items) < 2:
        return 0.0

    item_vectors = mechanics_df.loc[topk_items].values

    # Calculamos la disimilitud del coseno (1 - similitud) para todos los pares de ítems
    dissimilarity_sum = 0
    num_pairs = 0
    for i in range(len(item_vectors)):
        for j in range(i + 1, len(item_vectors)):
            sim = cosine_similarity([item_vectors[i]], [item_vectors[j]])[0][0]
            dissimilarity_sum += (1 - sim)
            num_pairs += 1

    return dissimilarity_sum / num_pairs if num_pairs > 0 else 0.0
def precision_recall_at_k(group, k):
    group = group.sort_values('score', ascending=False)
    topk = group.head(k)
    hits = topk['label'].sum()
    total_relevant = group['label'].sum()
    precision = hits / k if k > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0
    return precision, recall

def ndcg_at_k(group, k):
    if group['label'].sum() == 0: return 0.0
    ranked_group = group.sort_values('score', ascending=False).head(k)
    if len(ranked_group) < 2: return 0.0
    true_relevance = np.asarray([ranked_group['label'].values])
    predicted_scores = np.asarray([ranked_group['score'].values])
    return ndcg_score(true_relevance, predicted_scores)

In [10]:
df_train.drop_duplicates(inplace=True, subset=['user', 'item'])
df_val.drop_duplicates(inplace=True, subset=['user', 'item'])

In [11]:


# ==============================================================================
# 1. GENERACIÓN DE GRUPOS (Paso movido al inicio)
# ==============================================================================
# Necesitamos saber quiénes son los grupos ANTES de entrenar para crear sus perfiles.
# Usamos df_val para definir usuarios válidos (igual que en tu lógica original)

print("Generando grupos para el entrenamiento...")
user_counts = df_val['user'].value_counts() # Nota: en tu csv de val la col es 'user'
valid_users = user_counts[user_counts >= 5].index.tolist() # Filtro un poco más laxo para tener más opciones

np.random.seed(42)
num_groups = 1000
group_size = 4

# Crear grupos sintéticos
# (Aseguramos que los usuarios existan también en train para poder crear perfil)
users_in_train = set(df_train['user'].unique())
valid_users = [u for u in valid_users if u in users_in_train]

groups_list = [np.random.choice(valid_users, group_size, replace=False) for _ in range(num_groups)]
print(f"Se crearon {len(groups_list)} grupos sintéticos de tamaño {group_size}.")


Generando grupos para el entrenamiento...
Se crearon 1000 grupos sintéticos de tamaño 4.


In [12]:

# 2. CREACIÓN DE PERFILES AGREGADOS (LAS 3 ESTRATEGIAS A LA VEZ)
# ==============================================================================

def create_all_strategy_profiles(df_train, groups_list):
    all_profiles = []

    # Pre-agrupamos para velocidad
    df_train_indexed = df_train.set_index('user')

    print("Generando perfiles para Average, Least Misery y Most Pleasure...")

    for i, members in enumerate(groups_list):
        # Filtramos datos de los miembros
        relevant_data = df_train[df_train['user'].isin(members)]
        if relevant_data.empty: continue

        # --- ESTRATEGIA 1: AVERAGE (Promedio) ---
        df_avg = relevant_data.groupby('item')['rating'].mean().reset_index()
        df_avg['user'] = f'G_{i}_avg' # ID único para esta estrategia
        all_profiles.append(df_avg)

        # --- ESTRATEGIA 2: LEAST MISERY (Mínimo) ---
        df_min = relevant_data.groupby('item')['rating'].min().reset_index()
        df_min['user'] = f'G_{i}_min'
        all_profiles.append(df_min)

        # --- ESTRATEGIA 3: MOST PLEASURE (Máximo) ---
        df_max = relevant_data.groupby('item')['rating'].max().reset_index()
        df_max['user'] = f'G_{i}_max'
        all_profiles.append(df_max)

    return pd.concat(all_profiles, ignore_index=True)

# Generamos los perfiles de las 3 estrategias
df_groups_train = create_all_strategy_profiles(df_train, groups_list)

# Unimos TODO al dataset de entrenamiento
df_train_augmented = pd.concat([df_train[['user', 'item', 'rating']],
                                df_groups_train[['user', 'item', 'rating']]],
                               ignore_index=True)

print(f"Ratings originales: {len(df_train)}")
print(f"Ratings de grupos (3 estrategias): {len(df_groups_train)}")
print(f"Total para entrenar: {len(df_train_augmented)}")


Generando perfiles para Average, Least Misery y Most Pleasure...
Ratings originales: 10200445
Ratings de grupos (3 estrategias): 666825
Total para entrenar: 10867270


In [13]:

# ==============================================================================
# 3. ENTRENAMIENTO DEL MODELO (Con datos aumentados)
# ==============================================================================

def train_mf_model_augmented(df_augmented):
    print("\nIniciando entrenamiento SVD con perfiles de grupo...")
    start_time = time.time()

    min_rating = df_augmented['rating'].min()
    max_rating = df_augmented['rating'].max()
    reader = Reader(rating_scale=(min_rating, max_rating))

    data = Dataset.load_from_df(df_augmented[['user', 'item', 'rating']], reader)
    trainset = data.build_full_trainset()

    algo = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42, verbose=True)
    algo.fit(trainset)

    print(f"Modelo entrenado en {time.time() - start_time:.2f} segundos.")
    return algo

# Entrenamos
mf_algo_group = train_mf_model_augmented(df_train_augmented)



Iniciando entrenamiento SVD con perfiles de grupo...
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Modelo entrenado en 126.53 segundos.


In [14]:
# EVALUACIÓN Y RESULTADOS COMPARATIVOS
results_aggregated_models = []
K_values = [10]

# Mapeo para iterar limpiamente
strategies_map = {
    'Average': '_avg',
    'Least Misery': '_min',
    'Most Pleasure': '_max'
}

print("\nIniciando evaluación comparativa...")

for strategy_name, suffix in strategies_map.items():
    print(f"Evaluando estrategia: {strategy_name}...")
    group_eval_rows = []

    for i, members in enumerate(groups_list):
        group_id_model = f'G_{i}{suffix}' # Ej: G_0_avg

        # Obtenemos la "verdad" (Ground Truth) del grupo en validación
        # NOTA: El ground truth es siempre el mismo (basado en ratings reales),
        # lo que cambia es la predicción del modelo según la estrategia aprendida.
        member_val_data = df_val[df_val['user'].isin(members)]
        if member_val_data.empty: continue

        # Ground Truth: Asumimos Average de validación como la verdad "real" del disfrute grupal
        # (Esto es estándar para evaluar: ¿Le gustó al grupo realmente?)
        group_truth = member_val_data.groupby('item')['rating'].mean().reset_index()
        group_truth['label'] = (group_truth['rating'] >= 7).astype(int)

        # PREDICCIÓN: Aquí usamos el usuario específico de la estrategia (Ej: G_0_min)
        try:
            # Predecimos usando el perfil aprendido para esta estrategia
            group_truth['score'] = group_truth['item'].apply(
                lambda x: mf_algo_group.predict(group_id_model, x).est
            )
        except:
            # Si por alguna razón el grupo no tuvo items en train para esa estrategia
            continue

        group_truth['group_id'] = i # ID numérico simple para agrupar
        group_truth['itemID'] = group_truth['item']

        group_eval_rows.append(group_truth[['group_id', 'itemID', 'score', 'label']])

    # Unimos resultados de esta estrategia
    if not group_eval_rows:
        print(f"Advertencia: No se generaron evaluaciones para {strategy_name}")
        continue

    df_results_strat = pd.concat(group_eval_rows, ignore_index=True)
    grouped_strat = df_results_strat.groupby('group_id')

    # Calcular métricas para esta estrategia
    for k in K_values:
        avg_precision = np.mean([m[0] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
        avg_recall = np.mean([m[1] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
        avg_ndcg = grouped_strat.apply(lambda x: ndcg_at_k(x, k)).mean()
        avg_novelty = grouped_strat.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()
        avg_diversity = grouped_strat.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()

        results_aggregated_models.append({
            'Model': 'MF (Aggregated Models)',
            'Strategy': strategy_name, # Average, Least Misery, etc.
            'K': k,
            'Precision@K': avg_precision,
            'Recall@K': avg_recall,
            'nDCG@K': avg_ndcg,
            'Novelty@K': avg_novelty,
            'Diversity@K': avg_diversity
        })

# Mostrar tabla final
final_results_df = pd.DataFrame(results_aggregated_models)
print("\n--- Resultados Finales: Comparación de Estrategias (MF for Groups) ---")
print(final_results_df)


Iniciando evaluación comparativa...
Evaluando estrategia: Average...


  avg_precision = np.mean([m[0] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_recall = np.mean([m[1] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_ndcg = grouped_strat.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strat.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()
  avg_diversity = grouped_strat.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()


Evaluando estrategia: Least Misery...


  avg_precision = np.mean([m[0] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_recall = np.mean([m[1] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_ndcg = grouped_strat.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strat.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()
  avg_diversity = grouped_strat.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()


Evaluando estrategia: Most Pleasure...


  avg_precision = np.mean([m[0] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_recall = np.mean([m[1] for m in grouped_strat.apply(lambda x: precision_recall_at_k(x, k))])
  avg_ndcg = grouped_strat.apply(lambda x: ndcg_at_k(x, k)).mean()
  avg_novelty = grouped_strat.apply(lambda x: novelty_at_k(x, k, item_popularity_prob)).mean()



--- Resultados Finales: Comparación de Estrategias (MF for Groups) ---
                    Model       Strategy   K  Precision@K  Recall@K    nDCG@K  \
0  MF (Aggregated Models)        Average  10       0.8848  0.215542  0.961039   
1  MF (Aggregated Models)   Least Misery  10       0.8850  0.215641  0.958562   
2  MF (Aggregated Models)  Most Pleasure  10       0.8832  0.215285  0.958945   

   Novelty@K  Diversity@K  
0  10.386788     0.830329  
1  10.476683     0.831822  
2  10.313648     0.830394  


  avg_diversity = grouped_strat.apply(lambda x: diversity_at_k(x, k, df_mechanics)).mean()
