# Proyecto

## 1. Introducción

## 2. Pasos preliminares

### 2.1. Instalación de dependencias


In [None]:
%pip install seaborn
%pip install prettytable


Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import os


### 2.2. Cargar datos

In [98]:
ratings_df = pd.read_csv('videogame_ratings.csv')   # userId, itemId, rating, timestamp
videogames_df = pd.read_csv('videogames.csv')       # itemId, name

ratings_df.head()

Unnamed: 0,itemId,userId,rating,timestamp
0,439381673,A21ROB4YDOZA5P,1.0,1402272000
1,439381673,A3TNZ2Q5E7HTHD,3.0,1399680000
2,439381673,A1OKRM3QFEATQO,4.0,1391731200
3,439381673,A2XO1JFCNEYV3T,1.0,1391731200
4,439381673,A19WLPIRHD15TH,4.0,1389830400


In [99]:
# shape ratings df and videogames df, unique userId

print(f"ratings_df shape: {ratings_df.shape}")
print(f"videogames_df shape: {videogames_df.shape}")
print(f"N° Usuarios: {ratings_df['userId'].nunique()}")
print(f"N° Videogames: {ratings_df['itemId'].nunique()}")

ratings_df shape: (2565349, 4)
videogames_df shape: (84819, 2)
N° Usuarios: 1540618
N° Videogames: 71982


### 2.3. Preprocesar datos

In [100]:
print(videogames_df['name'].duplicated().sum())  

16154


Existen videojuegos que poseen más de 1 ID en Amazon, esto es debido a versiones/metadata distinta. Por lo tanto, realizamos un preprocesamiento donde dejamos un solo ID por videojuego. (El primer ID encontrado) y lo reemplazamos en el dataset de reviews. Esto se hace para evitar duplicados y que el modelo no aprenda de más de un ID por videojuego. 

In [107]:
import pandas as pd

# 1. Mapeo de nombres a IDs reales y de Ids a reales
name_to_real_id = videogames_df.groupby('name')['itemId'].first()
videogames_df['realId'] = videogames_df['name'].map(name_to_real_id)
id_to_real = videogames_df.set_index('itemId')['realId'].to_dict()

# 3. Reemplazamos itemId en ratings_df
ratings_df['itemId'] = ratings_df['itemId'].map(id_to_real)
videogames_df['itemId'] = videogames_df['realId']

# 4. Limpiamos el DataFrame de videojuegos
clean_videogames_df = (
    videogames_df
    .drop(columns=['realId'])
    .drop_duplicates(subset='itemId')
    .reset_index(drop=True)
)

videogames_df = clean_videogames_df


In [106]:
print(f"N° Videogames: {ratings_df['itemId'].nunique()}")


N° Videogames: 68662


### 2.4. Particionar datos

In [None]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2 
DATASET_SIZE = 0.05
RANDOM_STATE = 42

df = ratings_df.sample(frac=DATASET_SIZE, random_state=RANDOM_STATE)

# 1. Split inicial utilizando TEST_SIZE
train_df, val_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# 2. Detectar cold-start en validación (no queremos usuarios que no estén en train)
train_users = set(train_df['userId'])
cold_mask = ~val_df['userId'].isin(train_users)

# 3. Separar interacciones frías
cold_df = val_df[cold_mask]
val_df = val_df[~cold_mask]

# 4. Reintegrar las frías al entrenamiento
train_df = pd.concat([train_df, cold_df], ignore_index=True)

# 5. Ajustar test al valor TEST_SIZE exacto: (Los pasos anteriores reducen el tamaño de val_df)
desired_n_val = int(TEST_SIZE * len(df))
current_n_val = len(val_df)
n_needed = desired_n_val - current_n_val

if n_needed > 0:
    # Tomamos n_needed ejemplos adicionales desde train_df
    extra_val = train_df.sample(n=n_needed, random_state=RANDOM_STATE)
    
    # Los quitamos de train y los añadimos a val
    train_df = train_df.drop(extra_val.index).reset_index(drop=True)
    val_df = pd.concat([val_df, extra_val], ignore_index=True)

# 6. Comprobación de tamaños
print(f"Train: {train_df.shape[0]} filas")
print(f"Validation: {val_df.shape[0]} filas")
print(f"Real test size: {val_df.shape[0] / (train_df.shape[0] + val_df.shape[0]):.2%}")

Train: 102614 filas
Validation: 25653 filas
Real test size: 20.00%


### 2.5. Estadísticas de los datos

In [None]:
from prettytable import PrettyTable

def df_stats(df):
    n_users = df['userId'].nunique()
    n_items = df['itemId'].nunique()
    total_ratings = len(df)
    avg_rpu = round(total_ratings / n_users, 1)
    avg_rpi = round(total_ratings / n_items, 1)
    avg_rating = round(df['rating'].mean(), 1)
    std_rating = round(df['rating'].std(), 1)
    max_by_user = df.groupby('userId').size().max()
    max_by_item = df.groupby('itemId').size().max()
    return {
        'Number of Users': n_users,
        'Number of Items': n_items,
        'Total Ratings': total_ratings,
        'Average Number of Ratings per User': avg_rpu,
        'Average Number of Ratings per Item': avg_rpi,
        'Average Rating': avg_rating,
        'Rating Standard Deviation': std_rating,
        'Highest Number of Ratings by a User': max_by_user,
        'Highest Number of Ratings for an Item': max_by_item,
    }


In [None]:
stats_train = df_stats(train_df)
stats_val   = df_stats(val_df)

table = PrettyTable()
table.title = "Tabla estatísticas de entrenamiento y validación"
table.field_names = ["Métrica", "Train", "Validation"]
for metric in stats_train.keys():
    table.add_row([metric, stats_train[metric], stats_val[metric]])
print(table)


+-------------------------------------------------------------+
|       Tabla estatísticas de entrenamiento y validación      |
+---------------------------------------+--------+------------+
|                 Metric                | Train  | Validation |
+---------------------------------------+--------+------------+
|            Number of Users            | 97928  |   24500    |
|            Number of Items            | 23174  |   10973    |
|             Total Ratings             | 102614 |   25653    |
|   Average Number of Ratings per User  |  1.0   |    1.0     |
|   Average Number of Ratings per Item  |  4.4   |    2.3     |
|             Average Rating            |  4.0   |    4.0     |
|       Rating Standard Deviation       |  1.4   |    1.4     |
|  Highest Number of Ratings by a User  |   29   |     14     |
| Highest Number of Ratings for an Item |  348   |     74     |
|             Densidad ( %)             | 0.0 %  |   0.0 %    |
+---------------------------------------