## Random

### Configuración Inicial

In [2]:
!pip uninstall -y numpy
!pip install numpy==1.26

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26
  Downloading numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.9/17.9 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.0 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is inco

In [1]:
!pip install scikit-surprise --no-build-isolation --no-deps
!pip install memory_profiler

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2708551 sha256=aa74f90da1aad64c4bf36db86dadcda0be4209339a67cee214692c1fdb0dfdfb
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-

### Instalación de Librerías

In [2]:
import time
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from memory_profiler import memory_usage
import itertools
import scipy.sparse as sparse
import random
import gdown
from surprise import SVDpp, Dataset, Reader, accuracy
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Importación de los Datos

In [3]:
gdown.download(id='1H_24ycns6zbOVfHFJRI9vGjVffVA5z6v', output='training_ratings.csv', quiet=False)
gdown.download(id='1pKmf07ehHOmlvIyT8nv__vPuWE2Z3ygZ', output='validation_ratings.csv', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1H_24ycns6zbOVfHFJRI9vGjVffVA5z6v
From (redirected): https://drive.google.com/uc?id=1H_24ycns6zbOVfHFJRI9vGjVffVA5z6v&confirm=t&uuid=eb3ce19e-27a3-412d-8717-2ea7bc09d203
To: /content/training_ratings.csv
100%|██████████| 249M/249M [00:03<00:00, 70.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pKmf07ehHOmlvIyT8nv__vPuWE2Z3ygZ
To: /content/validation_ratings.csv
100%|██████████| 58.3M/58.3M [00:01<00:00, 51.0MB/s]


'validation_ratings.csv'

In [4]:
df_train = pd.read_csv('training_ratings.csv')
df_val = pd.read_csv('validation_ratings.csv')

### Preprocesamiento de Datos

In [5]:
df_train.drop_duplicates(inplace=True, subset=['user', 'item'])
df_val.drop_duplicates(inplace=True, subset=['user', 'item'])

In [6]:
print(f"Tamaño original del training set: {len(df_train)}")

# se obtiene un sample debido a que hay muchos datos y se demora mucho
df_train_sample = df_train.sample(n=1000000, random_state=42)
print(f"Tamaño del nuevo training set (muestra): {len(df_train_sample)}")

# se obtiene un sample debido a que hay muchos datos y se demora mucho
df_val_sample = df_val.sample(n=50000, random_state=42)
print(f"Tamaño del nuevo validation set (muestra): {len(df_val_sample)}")

Tamaño original del training set: 12390406
Tamaño del nuevo training set (muestra): 1000000
Tamaño del nuevo validation set (muestra): 50000


In [7]:
def evaluar_random_topn(df_train, df_val, n=10, sample_per_user=50):
    """
    Genera recomendaciones aleatorias para cada usuario, tomando un sample
    limitado de items no vistos para evitar usar toda la matriz.
    """
    # Diccionario {usuario: items que ha visto}
    user2seen = df_train.groupby('user')['item'].apply(set).to_dict()

    # Lista de todos los items
    all_items = df_train['item'].unique().tolist()

    top_n = {}
    for uid in df_val['user'].unique():
        seen = user2seen.get(uid, set())
        # Items posibles para recomendar
        candidates = list(set(all_items) - seen)
        # Tomar un sample limitado
        sample_candidates = random.sample(candidates, min(sample_per_user, len(candidates)))
        # Tomar n recomendaciones aleatorias
        recs = random.sample(sample_candidates, min(n, len(sample_candidates)))
        top_n[uid] = [(iid, random.randint(1,5)) for iid in recs]

    return top_n


# Medir tiempo de ejecución
start_time = time.time()
top_n = evaluar_random_topn(df_train_sample, df_val_sample, n=10)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time:.2f} segundos")

# Medir la memoria utilizada
memoria = memory_usage(
    (evaluar_random_topn, (df_val_sample, df_val_sample), {'n':10})
)
print("Memoria usada (MB):", max(memoria) - min(memoria))

Tiempo de ejecución: 147.73 segundos
Memoria usada (MB): 38.1640625


In [8]:
def rmse_mae_from_topn(top_n, df_val_sample):
    real, predicho = [], []
    total = sum(len(recs) for recs in top_n.values())
    i = 0
    for uid, recs in top_n.items():
      for iid, pred in recs:
        real_vals = df_val_sample.loc[(df_val_sample['user'] == uid) & (df_val_sample['item'] == iid), 'rating']
        if not real_vals.empty:
            real.append(real_vals.values[0])
            predicho.append(pred)
        i += 1
        if i % 10000 == 0 or i == total:  # muestra cada 100 pasos o al final
            progreso = (i / total) * 100
            print(f"Progreso: {i}/{total} ({progreso:.2f}%)")

    return math.sqrt(mean_squared_error(real, predicho)), mean_absolute_error(real, predicho)

rmse, mae = rmse_mae_from_topn(top_n, df_val_sample)
print("RMSE para las top n recomendaciones", rmse)
print("MAE para las top n recomendaciones", mae)


Progreso: 10000/374620 (2.67%)
Progreso: 20000/374620 (5.34%)
Progreso: 30000/374620 (8.01%)
Progreso: 40000/374620 (10.68%)
Progreso: 50000/374620 (13.35%)
Progreso: 60000/374620 (16.02%)
Progreso: 70000/374620 (18.69%)
Progreso: 80000/374620 (21.35%)
Progreso: 90000/374620 (24.02%)
Progreso: 100000/374620 (26.69%)
Progreso: 110000/374620 (29.36%)
Progreso: 120000/374620 (32.03%)
Progreso: 130000/374620 (34.70%)
Progreso: 140000/374620 (37.37%)
Progreso: 150000/374620 (40.04%)
Progreso: 160000/374620 (42.71%)
Progreso: 170000/374620 (45.38%)
Progreso: 180000/374620 (48.05%)
Progreso: 190000/374620 (50.72%)
Progreso: 200000/374620 (53.39%)
Progreso: 210000/374620 (56.06%)
Progreso: 220000/374620 (58.73%)
Progreso: 230000/374620 (61.40%)
Progreso: 240000/374620 (64.06%)
Progreso: 250000/374620 (66.73%)
Progreso: 260000/374620 (69.40%)
Progreso: 270000/374620 (72.07%)
Progreso: 280000/374620 (74.74%)
Progreso: 290000/374620 (77.41%)
Progreso: 300000/374620 (80.08%)
Progreso: 310000/37462

Ahora haremos lo mismo pero para un grupo de 4 personas que han calificado ese item (haremos 4 personas por mas que haya juegos que es de máximo 2 o 10 etc por simplicidad). Lo que hacemos es calcular el promedio real de las calificaciones de esas 4 personas y compararlo contra una predicción grupal generada de manera completamente aleatoria en el rango de ratings posibles.

In [9]:
def evaluar_random_topn_grupos(df_train, df_val, n=10, sample_per_user=50):
    """
    Genera recomendaciones aleatorias para cada usuario, tomando un sample
    limitado de items no vistos para evitar usar toda la matriz.
    """
    # Diccionario {usuario: items que ha visto}
    user2seen = df_train.groupby('user')['item'].apply(set).to_dict()

    # Lista de todos los items
    all_items = df_train['item'].unique().tolist()

    top_n = {}
    # armar grupos
    usuarios = df_val['user'].unique()
    grupos = [usuarios[i:i+4] for i in range(0, len(usuarios) - len(usuarios)%4, 4)]

    seen_group = set()
    for grupo in grupos:
      for u in grupo:
          seen_group |= user2seen.get(u, set())
          # Items posibles para recomendar
          candidates = list(set(all_items) - seen_group)
          # Tomar un sample limitado
          sample_candidates = random.sample(candidates, min(sample_per_user, len(candidates)))
          # Tomar n recomendaciones aleatorias
          recs = random.sample(sample_candidates, min(n, len(sample_candidates)))
          top_n[tuple(grupo)] = [(iid, random.randint(1,5)) for iid in recs]

    return top_n


# Medir tiempo de ejecución
start_time = time.time()
top_n_grupo = evaluar_random_topn_grupos(df_train_sample, df_val_sample, n=10)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tiempo de ejecución: {elapsed_time:.2f} segundos")

# Medir la memoria utilizada
memoria = memory_usage(
    (evaluar_random_topn_grupos, (df_train_sample, df_val_sample), {'n':10})
)
print("Memoria usada (MB):", max(memoria) - min(memoria))

Tiempo de ejecución: 198.39 segundos
Memoria usada (MB): 106.5390625


In [10]:
def rmse_mae_from_topn_grupo(top_n_grupo, df_val_sample):
    real, predicho = [], []
    total = sum(len(recs) for recs in top_n.values())
    i = 0
    for grupo, recs in top_n_grupo.items():
      for iid, pred in recs:
        real_vals = df_val_sample.loc[(df_val_sample['user'].isin(grupo)) & (df_val_sample['item'] == iid), 'rating']
        if not real_vals.empty:
            real.append(real_vals.values[0])
            predicho.append(pred)
        i += 1
        if i % 10000 == 0 or i == total:  # muestra cada 100 pasos o al final
            progreso = (i / total) * 100
            print(f"Progreso: {i}/{total} ({progreso:.2f}%)")

    return math.sqrt(mean_squared_error(real, predicho)), mean_absolute_error(real, predicho)

rmse_grupo, mae_grupo = rmse_mae_from_topn_grupo(top_n_grupo, df_val_sample)
print("RMSE para las top n recomendaciones", rmse_grupo)
print("MAE para las top n recomendaciones", mae_grupo)


Progreso: 10000/374620 (2.67%)
Progreso: 20000/374620 (5.34%)
Progreso: 30000/374620 (8.01%)
Progreso: 40000/374620 (10.68%)
Progreso: 50000/374620 (13.35%)
Progreso: 60000/374620 (16.02%)
Progreso: 70000/374620 (18.69%)
Progreso: 80000/374620 (21.35%)
Progreso: 90000/374620 (24.02%)
RMSE para las top n recomendaciones 4.415880433163924
MAE para las top n recomendaciones 4.0


Los códigos para random se adaptaron de un codigo inicial creado, la adaptación de este código se encuentra aquí: https://chatgpt.com/share/68e00c31-dbf8-8006-bacd-84f0296d467c


In [11]:
from sklearn.metrics import ndcg_score

print("Creando DataFrame de evaluación aleatorio...")

df_eval = df_val_sample.copy()

df_eval = df_eval.rename(columns={'user': 'userID', 'item': 'itemID'})

df_eval['label'] = (df_eval['rating'] >= 7).astype(int)
np.random.seed(42)
df_eval['score'] = np.random.rand(len(df_eval))

print("DataFrame de evaluación (Random) creado con éxito.")
print(df_eval.head())

print("\nCreando grupos sintéticos...")
user_counts = df_eval['userID'].value_counts()
valid_users = user_counts[user_counts >= 10].index.tolist()

np.random.seed(42)
num_groups = 1000
group_size = 4
groups = [np.random.choice(valid_users, group_size, replace=False) for _ in range(num_groups)]

print(f"Se crearon {len(groups)} grupos sintéticos de tamaño {group_size}.")
print("Ejemplo de un grupo:", groups[0])

print("\nAgregando predicciones para cada grupo...")
all_group_recs = []

for group_id, user_ids in enumerate(groups):
    group_predictions = df_eval[df_eval['userID'].isin(user_ids)]

    item_scores_per_group = group_predictions.groupby('itemID').agg(
        avg_score=('score', 'mean'),
        min_score=('score', 'min'),
        max_score=('score', 'max'),

        group_label=('label', lambda x: 1 if all(x == 1) else 0)
    ).reset_index()

    item_scores_per_group['group_id'] = group_id
    all_group_recs.append(item_scores_per_group)

df_group_eval = pd.concat(all_group_recs, ignore_index=True)
print("Agregación completada.")
print(df_group_eval.head())


def precision_recall_at_k(group, k):
    """Calcula Precision@K y Recall@K para un solo grupo."""
    group = group.sort_values('score', ascending=False)
    topk = group.head(k)
    hits = topk['label'].sum()
    total_relevant = group['label'].sum()
    precision = hits / k if k > 0 else 0
    recall = hits / total_relevant if total_relevant > 0 else 0
    return precision, recall

def ndcg_at_k(group, k):
    """Calcula nDCG@K para un solo grupo."""
    if group['label'].sum() == 0:
        return 0.0
    ranked_group = group.sort_values('score', ascending=False).head(k)
    if len(ranked_group) < 2:
        return 0.0
    true_relevance = np.asarray([ranked_group['label'].values])
    predicted_scores = np.asarray([ranked_group['score'].values])
    return ndcg_score(true_relevance, predicted_scores)

strategies = {
    'Average': 'avg_score',
    'Least Misery': 'min_score',
    'Most Pleasure': 'max_score'
}

group_results = []
K_values = [10]

for strategy_name, score_column in strategies.items():
    print(f"\nEvaluando estrategia (Random): {strategy_name}...")
    df_strategy_eval = df_group_eval[['group_id', 'itemID', 'group_label']].copy()
    df_strategy_eval.rename(columns={'group_label': 'label'}, inplace=True)
    df_strategy_eval['score'] = df_group_eval[score_column]

    grouped_strategy = df_strategy_eval.groupby('group_id')

    for k in K_values:
        metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
        avg_precision = np.mean([m[0] for m in metrics])
        avg_recall = np.mean([m[1] for m in metrics])
        ndcg_scores = grouped_strategy.apply(lambda x: ndcg_at_k(x, k))
        avg_ndcg = np.mean(ndcg_scores)
        group_results.append({
            'Model': 'Random',
            'Strategy': strategy_name,
            'K': k,
            'Precision@K': avg_precision,
            'Recall@K': avg_recall,
            'nDCG@K': avg_ndcg
        })

group_results_df = pd.DataFrame(group_results)

print("\n--- Resultados de Evaluación Grupal para Random ---")
print(group_results_df)

Creando DataFrame de evaluación aleatorio...
DataFrame de evaluación (Random) creado con éxito.
         itemID  rating       userID  label     score
1549328  162886     7.0    bigbarazi      1  0.374540
1832787  155987    10.0        espoo      1  0.950714
1118582  156858     7.0   Scooperman      1  0.731994
788525     1899    10.0   ManiacRafa      1  0.598658
513706   156714     7.5  Hamez Davez      1  0.156019

Creando grupos sintéticos...
Se crearon 1000 grupos sintéticos de tamaño 4.
Ejemplo de un grupo: ['punkin312' 'Posco' 'leffe dubbel' 'MindSwap']

Agregando predicciones para cada grupo...
Agregación completada.
   itemID  avg_score  min_score  max_score  group_label  group_id
0      13   0.080063   0.080063   0.080063            1         0
1     655   0.872806   0.872806   0.872806            0         0
2    1144   0.273319   0.273319   0.273319            0         0
3    1829   0.404969   0.404969   0.404969            0         0
4    2569   0.005570   0.005570   0.00

  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
  ndcg_scores = grouped_strategy.apply(lambda x: ndcg_at_k(x, k))



Evaluando estrategia (Random): Least Misery...


  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))
  ndcg_scores = grouped_strategy.apply(lambda x: ndcg_at_k(x, k))



Evaluando estrategia (Random): Most Pleasure...


  metrics = grouped_strategy.apply(lambda x: precision_recall_at_k(x, k))



--- Resultados de Evaluación Grupal para Random ---
    Model       Strategy   K  Precision@K  Recall@K    nDCG@K
0  Random        Average  10       0.4644  0.187787  0.734357
1  Random   Least Misery  10       0.4641  0.187695  0.734323
2  Random  Most Pleasure  10       0.4641  0.187562  0.730359


  ndcg_scores = grouped_strategy.apply(lambda x: ndcg_at_k(x, k))
