## Matriz de RATINGS.csv

In [4]:
import pandas as pd
from scipy.sparse import coo_matrix

In [9]:


# Cargar el archivo de ratings
df_matrix = pd.read_csv("data/ratings.csv")

# Mapear userId y movieId a índices consecutivos (opcional pero recomendable)
user_mapper = {user_id: idx for idx, user_id in enumerate(df_matrix['userId'].unique())}
movie_mapper = {movie_id: idx for idx, movie_id in enumerate(df_matrix['movieId'].unique())}

user_index = df_matrix['userId'].map(user_mapper)
movie_index = df_matrix['movieId'].map(movie_mapper)

# Construir la matriz esparsa en formato COO
ratings_sparse = coo_matrix((df_matrix['rating'], (user_index, movie_index)))

reverse_movie_mapper = {v: k for k, v in movie_mapper.items()}
ratings_csr = ratings_sparse.tocsr()

# Mostrar información de la matriz
print(f"Shape: {ratings_sparse.shape}")
print(f"Número de ratings: {ratings_sparse.nnz}")

Shape: (200948, 84432)
Número de ratings: 32000204


In [10]:
# Calcular densidad
num_users = ratings_sparse.shape[0]
num_movies = ratings_sparse.shape[1]
num_ratings = ratings_sparse.nnz

density = num_ratings / (num_users * num_movies)

print(f"Densidad: {density:.6f} ({density * 100:.4f}%)")

Densidad: 0.001886 (0.1886%)


In [11]:
import numpy as np

# Peso en bytes
data_bytes = ratings_sparse.data.nbytes
row_bytes = ratings_sparse.row.nbytes
col_bytes = ratings_sparse.col.nbytes

total_bytes = data_bytes + row_bytes + col_bytes
total_mb = total_bytes / (1024 ** 2)

print(f"Tamaño en memoria: {total_mb:.2f} MB")

Tamaño en memoria: 488.28 MB


In [1]:
import pickle
from scipy.sparse import save_npz

path = "./data/processed"

# Guardar matriz CSR
save_npz(f"{path}/ratings_csr.npz", ratings_csr)

# Guardar mapeos
with open(f"{path}/mappers.pkl", "wb") as f:
    pickle.dump({
        "user_mapper": user_mapper,
        "movie_mapper": movie_mapper,
        "reverse_movie_mapper": reverse_movie_mapper
    }, f)


NameError: name 'ratings_csr' is not defined

In [2]:
import os
disk_size_mb = os.path.getsize(path+"/ratings_csr.npz") / (1024 ** 2)
print(f"Tamaño en disco (.npz): {disk_size_mb:.2f} MB")

Tamaño en disco (.npz): 68.57 MB


## Matriz de MOVIES.csv

In [5]:
# Sobrescribir df con el contenido de movies.csv para liberar RAM
df_movies = pd.read_csv("./data/movies.csv")

In [6]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [12]:
from collections import Counter

movie_counts = Counter(ratings_sparse.col)
df_movies["num_ratings"] = df_movies["movieId"].map(lambda mid: movie_mapper.get(mid, -1))
df_movies["num_ratings"] = df_movies["num_ratings"].map(lambda idx: movie_counts.get(idx, 0))

In [13]:
movie_avg = df_matrix.groupby("movieId")["rating"].mean()

# Paso 2: merge al df de películas
df_movies["avg_rating"] = df_matrix["movieId"].map(movie_avg)

In [14]:
df_movies["is_good"] = (df_movies["avg_rating"] >= 4.0) & (df_movies["num_ratings"] >= 50)

In [15]:
C = df_movies["avg_rating"].mean()
m = 50

df_movies["bayesian_rating"] = (df_movies["num_ratings"] / (df_movies["num_ratings"] + m)) * df_movies["avg_rating"] + (m / (df_movies["num_ratings"] + m)) * C

In [16]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movieId          87585 non-null  int64  
 1   title            87585 non-null  object 
 2   genres           87585 non-null  object 
 3   num_ratings      87585 non-null  int64  
 4   avg_rating       87585 non-null  float64
 5   is_good          87585 non-null  bool   
 6   bayesian_rating  87585 non-null  float64
dtypes: bool(1), float64(2), int64(2), object(2)
memory usage: 4.1+ MB


In [17]:
df_movies.tail()

Unnamed: 0,movieId,title,genres,num_ratings,avg_rating,is_good,bayesian_rating
87580,292731,The Monroy Affaire (2022),Drama,1,4.028991,False,3.548644
87581,292737,Shelter in Solitude (2023),Comedy|Drama,1,3.626571,False,3.540753
87582,292753,Orca (2023),Drama,1,3.917928,False,3.546466
87583,292755,The Angry Breed (1968),Drama,1,3.745054,False,3.543076
87584,292757,Race to the Summit (2023),Action|Adventure|Documentary,1,3.979281,False,3.547669


In [18]:
df_movies.to_csv(f"{path}/movies_bayesian.csv", index=False)

## Algoritmos

### Cargar data

In [19]:
import pickle
from scipy.sparse import load_npz
import pandas as pd
# Cambian por su ruta
path = "./data/processed"

# Cargar matriz
ratings_csr = load_npz(f"{path}/ratings_csr.npz")

# Cargar mapeos
with open(f"{path}/mappers.pkl", "rb") as f:
    data = pickle.load(f)
    user_mapper = data["user_mapper"]
    movie_mapper = data["movie_mapper"]
    reverse_movie_mapper = data["reverse_movie_mapper"]

# Cargar DataFrame con información de películas (con columnas bayesian, etc.)
df_movies = pd.read_csv(f"{path}/movies_bayesian.csv")

### KNN

In [20]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Creamos el modelo KNN
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
model_knn.fit(ratings_csr)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,-1


In [23]:
user_id = 123  # ID original del dataset
user_idx = user_mapper[user_id]  # índice en la matriz

# Buscar vecinos
distances, indices = model_knn.kneighbors(ratings_csr[user_idx], n_neighbors=20)

# Mostrar resultados
print(f"Usuarios similares a {user_id}:")
for i, (idx, dist) in enumerate(zip(indices[0], distances[0])):
    real_user_id = list(user_mapper.keys())[list(user_mapper.values()).index(idx)]
    print(f"{i+1}. Usuario {real_user_id} - distancia: {dist:.4f}")


Usuarios similares a 123:
1. Usuario 123 - distancia: 0.0000
2. Usuario 151437 - distancia: 0.2773
3. Usuario 113241 - distancia: 0.2811
4. Usuario 102082 - distancia: 0.2870
5. Usuario 179305 - distancia: 0.2894
6. Usuario 93451 - distancia: 0.2934
7. Usuario 127646 - distancia: 0.2948
8. Usuario 188201 - distancia: 0.2949
9. Usuario 121230 - distancia: 0.3033
10. Usuario 86951 - distancia: 0.3047
11. Usuario 10865 - distancia: 0.3066
12. Usuario 76007 - distancia: 0.3095
13. Usuario 138042 - distancia: 0.3138
14. Usuario 155640 - distancia: 0.3146
15. Usuario 812 - distancia: 0.3157
16. Usuario 125082 - distancia: 0.3204
17. Usuario 59636 - distancia: 0.3229
18. Usuario 89320 - distancia: 0.3232
19. Usuario 155102 - distancia: 0.3240
20. Usuario 148456 - distancia: 0.3255


In [24]:
# Inicializar modelo KNN
def entrenar_knn(ratings_csr, n_neighbors=5):
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    knn.fit(ratings_csr)
    return knn

# Buscar vecinos
def obtener_vecinos(model_knn, ratings_csr, user_id, k=5):
    distancias, indices = model_knn.kneighbors(ratings_csr[user_id], n_neighbors=k)
    return distancias[0], indices[0]


### Agregar usuario y ratings

In [25]:
import numpy as np
from scipy.sparse import vstack, csr_matrix
from sklearn.neighbors import NearestNeighbors

# Añadir o actualizar un usuario
def agregar_o_actualizar_usuario(ratings_csr, movie_mapper, nuevas_valoraciones, user_id=None):
    """
    nuevas_valoraciones: dict {movieId: rating}
    user_id: índice interno (fila en la matriz) o None si es nuevo
    """
    num_peliculas = ratings_csr.shape[1]
    nueva_fila = np.zeros(num_peliculas)

    for mid, rating in nuevas_valoraciones.items():
        if mid in movie_mapper:
            col_idx = movie_mapper[mid]
            nueva_fila[col_idx] = rating
        else:
            print(f"⚠️ movieId {mid} no encontrado en movie_mapper")

    nueva_fila_sparse = csr_matrix(nueva_fila)

    if user_id is None:
        # Crear nuevo usuario
        updated_ratings_csr = vstack([ratings_csr, nueva_fila_sparse])
        user_id = updated_ratings_csr.shape[0] - 1
    else:
        # Actualizar usuario existente
        updated_ratings_csr = ratings_csr.copy()
        updated_ratings_csr[user_id] = nueva_fila_sparse

    return updated_ratings_csr, user_id

In [26]:
# Valoraciones nuevas {movieid: rating}
nuevas_valoraciones = {1: 4.0, 32: 5.0, 589: 3.0}

# Nuevo usuario
ratings_csr, nuevo_user_id = agregar_o_actualizar_usuario(ratings_csr, movie_mapper, nuevas_valoraciones)

# O bien, para actualizar uno ya existente:
# ratings_csr, _ = agregar_o_actualizar_usuario(ratings_csr, movie_mapper, nuevas_valoraciones, user_id=1234)

# Reentrenar KNN
model_knn = entrenar_knn(ratings_csr)

# Obtener vecinos
distancias, vecinos = obtener_vecinos(model_knn, ratings_csr, nuevo_user_id, 10)

print(f"Vecinos del usuario {nuevo_user_id}:")
for i, (vec, dist) in enumerate(zip(vecinos, distancias)):
    print(f"{i+1}. Usuario {vec} — distancia: {dist:.4f}")


Vecinos del usuario 200948:
1. Usuario 200948 — distancia: 0.0000
2. Usuario 200926 — distancia: 0.5438
3. Usuario 102668 — distancia: 0.5604
4. Usuario 161871 — distancia: 0.5647
5. Usuario 22215 — distancia: 0.5659
6. Usuario 98603 — distancia: 0.5674
7. Usuario 129521 — distancia: 0.5698
8. Usuario 20194 — distancia: 0.5794
9. Usuario 38859 — distancia: 0.5817
10. Usuario 49464 — distancia: 0.5826


In [27]:
user_id = 129521
distancias, vecinos = obtener_vecinos(model_knn, ratings_csr, user_id, 10)

print(f"Vecinos del usuario {user_id}:")
for i, (vec, dist) in enumerate(zip(vecinos, distancias)):
    print(f"{i+1}. Usuario {vec} — distancia: {dist:.4f}")

Vecinos del usuario 129521:
1. Usuario 129521 — distancia: 0.0000
2. Usuario 151695 — distancia: 0.2718
3. Usuario 16045 — distancia: 0.2736
4. Usuario 12361 — distancia: 0.2865
5. Usuario 108075 — distancia: 0.2948
6. Usuario 39370 — distancia: 0.2955
7. Usuario 5660 — distancia: 0.3011
8. Usuario 19752 — distancia: 0.3016
9. Usuario 11369 — distancia: 0.3048
10. Usuario 127678 — distancia: 0.3067


### Recomendar peliculas no vistas

In [28]:
def recomendar_peliculas(
    user_id,
    ratings_csr,
    df_movies,
    movie_mapper,
    reverse_movie_mapper,
    model_knn,
    top_k=5,
    top_n=10
):
    # Paso 1: obtener vecinos
    distancias, vecinos = obtener_vecinos(model_knn, ratings_csr, user_id, k=top_k)
    vecinos = [v for v in vecinos if v != user_id]

    # Paso 2: obtener películas vistas por el usuario
    user_ratings = ratings_csr[user_id].toarray().flatten()
    peliculas_vistas = set(np.where(user_ratings > 0)[0])

    # Paso 3: sumar ratings de vecinos por película
    scores = {}
    for vecino_id in vecinos:
        vecino_ratings = ratings_csr[vecino_id].toarray().flatten()
        for idx, rating in enumerate(vecino_ratings):
            if rating > 0 and idx not in peliculas_vistas:
                scores[idx] = scores.get(idx, []) + [rating]

    # Paso 4: obtener promedio de cada película recomendada
    scores_avg = [(idx, np.mean(ratings)) for idx, ratings in scores.items()]

    # Paso 5: ordenar según bayesian_rating (si disponible), sino por promedio
    def get_bayesian(idx):
        movie_id = reverse_movie_mapper[idx]
        row = df_movies[df_movies["movieId"] == movie_id]
        return row["bayesian_rating"].values[0] if "bayesian_rating" in row else 0

    scores_avg.sort(key=lambda x: get_bayesian(x[0]), reverse=True)

    # Paso 6: traducir a títulos
    recomendaciones = []
    for idx, _ in scores_avg[:top_n]:
        movie_id = reverse_movie_mapper[idx]
        row = df_movies[df_movies["movieId"] == movie_id].iloc[0]
        recomendaciones.append((movie_id, row["title"], row.get("bayesian_rating", np.nan)))

    return recomendaciones

In [29]:
user_id = 129521

recs = recomendar_peliculas(
    user_id=nuevo_user_id,
    ratings_csr=ratings_csr,
    df_movies=df_movies,
    movie_mapper=movie_mapper,
    reverse_movie_mapper=reverse_movie_mapper,
    model_knn=model_knn,
    top_k=5,
    top_n=10
)

print("\n🔮 Recomendaciones:")
for movie_id, title, bayes in recs:
    print(f"{title} (movieId: {movie_id}) — bayesian_rating: {bayes:.2f}")


🔮 Recomendaciones:
Interview with the Vampire: The Vampire Chronicles (1994) (movieId: 253) — bayesian_rating: 4.32
Babe (1995) (movieId: 34) — bayesian_rating: 4.19
Executive Decision (1996) (movieId: 494) — bayesian_rating: 4.18
Jurassic Park (1993) (movieId: 480) — bayesian_rating: 4.15
Star Wars: Episode IV - A New Hope (1977) (movieId: 260) — bayesian_rating: 4.11
Silence of the Lambs, The (1991) (movieId: 593) — bayesian_rating: 4.10
Dragonheart (1996) (movieId: 653) — bayesian_rating: 4.09
GoldenEye (1995) (movieId: 10) — bayesian_rating: 4.08
Firm, The (1993) (movieId: 454) — bayesian_rating: 4.05
Sabrina (1995) (movieId: 7) — bayesian_rating: 3.92


### Recomendar en base a genero

In [30]:
def recomendar_por_genero(
    user_id,
    genero_objetivo,
    ratings_csr,
    df_movies,
    movie_mapper,
    reverse_movie_mapper,
    model_knn,
    top_k=5,
    top_n=10
):
    # Paso 1: obtener vecinos
    distancias, vecinos = obtener_vecinos(model_knn, ratings_csr, user_id, k=top_k)
    vecinos = [v for v in vecinos if v != user_id]

    # Paso 2: películas vistas por el usuario
    user_ratings = ratings_csr[user_id].toarray().flatten()
    peliculas_vistas = set(np.where(user_ratings > 0)[0])

    # Paso 3: obtener películas candidatas de vecinos que no haya visto
    scores = {}
    for vecino_id in vecinos:
        vecino_ratings = ratings_csr[vecino_id].toarray().flatten()
        for idx, rating in enumerate(vecino_ratings):
            if rating > 0 and idx not in peliculas_vistas:
                movie_id = reverse_movie_mapper[idx]
                row = df_movies[df_movies["movieId"] == movie_id]
                if not row.empty and genero_objetivo in row["genres"].values[0].split('|'):
                    scores[idx] = scores.get(idx, []) + [rating]

    if not scores:
        return []

    # Paso 4: promedio por película
    scores_avg = [(idx, np.mean(ratings)) for idx, ratings in scores.items()]

    # Paso 5: ordenar por bayesian_rating
    def get_bayesian(idx):
        movie_id = reverse_movie_mapper[idx]
        row = df_movies[df_movies["movieId"] == movie_id]
        return row["bayesian_rating"].values[0] if not row.empty else 0

    scores_avg.sort(key=lambda x: get_bayesian(x[0]), reverse=True)

    # Paso 6: retornar recomendaciones
    recomendaciones = []
    for idx, _ in scores_avg[:top_n]:
        movie_id = reverse_movie_mapper[idx]
        row = df_movies[df_movies["movieId"] == movie_id].iloc[0]
        recomendaciones.append((movie_id, row["title"], row["genres"], row.get("bayesian_rating", np.nan)))

    return recomendaciones

In [31]:
user_id = 129521

recs = recomendar_por_genero(
    user_id=user_id,
    genero_objetivo="Comedy",
    ratings_csr=ratings_csr,
    df_movies=df_movies,
    movie_mapper=movie_mapper,
    reverse_movie_mapper=reverse_movie_mapper,
    model_knn=model_knn,
    top_k=10, #vecinos
    top_n=20 #peliculas
)

print("\n🎭 Recomendaciones de género:")
for movie_id, title, genres, bayes in recs:
    print(f"{title} [{genres}] — bayesian_rating: {bayes:.2f}")


🎭 Recomendaciones de género:
Wallace & Gromit: A Close Shave (1995) [Animation|Children|Comedy] — bayesian_rating: 4.13
Much Ado About Nothing (1993) [Comedy|Romance] — bayesian_rating: 4.09
Ace Ventura: When Nature Calls (1995) [Comedy] — bayesian_rating: 3.98
Clueless (1995) [Comedy|Romance] — bayesian_rating: 3.93
Hudsucker Proxy, The (1994) [Comedy] — bayesian_rating: 3.71
Money Train (1995) [Action|Comedy|Crime|Drama|Thriller] — bayesian_rating: 3.67
Get Shorty (1995) [Comedy|Crime|Thriller] — bayesian_rating: 3.65
While You Were Sleeping (1995) [Comedy|Romance] — bayesian_rating: 3.59
Addams Family Values (1993) [Children|Comedy|Fantasy] — bayesian_rating: 3.57
Sleepless in Seattle (1993) [Comedy|Drama|Romance] — bayesian_rating: 3.52
Mask, The (1994) [Action|Comedy|Crime|Fantasy] — bayesian_rating: 3.50
Dumb & Dumber (Dumb and Dumber) (1994) [Adventure|Comedy] — bayesian_rating: 3.50
Beverly Hills Cop III (1994) [Action|Comedy|Crime|Thriller] — bayesian_rating: 3.20
Forrest Gum

# Directamente desde el modelo

In [1]:
import pickle
import time
import os
import sys
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, vstack, csr_matrix
from sklearn.neighbors import NearestNeighbors


In [2]:
class MovieRecommendationSystem:
    """
    Sistema de recomendación de películas basado en filtrado colaborativo
    usando diferentes métricas de distancia con K-Nearest Neighbors.
    """
    
    def __init__(self, path="./data/processed"):
        """
        Inicializa el sistema de recomendación cargando datos y entrenando modelos.
        
        Args:
            path (str): Ruta a los archivos de datos procesados
        """
        self.path = path
        self.metrics = ['cosine', 'euclidean', 'manhattan']
        self.models = {}
        self.training_times = {}
        
        # Cargar datos
        self._load_data()
        
        # Entrenar modelos con diferentes métricas
        self._train_models()
    
    def _load_data(self):
        """Carga la matriz de ratings, mapeos y DataFrame de películas."""
        # Cargar matriz CSR
        self.ratings_csr = load_npz(f"{self.path}/ratings_csr.npz")
        
        # Cargar mapeos
        with open(f"{self.path}/mappers.pkl", "rb") as f:
            data = pickle.load(f)
            self.user_mapper = data["user_mapper"]
            self.movie_mapper = data["movie_mapper"]
            self.reverse_movie_mapper = data["reverse_movie_mapper"]
        
        # Cargar DataFrame de películas
        self.df_movies = pd.read_csv(f"{self.path}/movies_bayesian.csv")
    
    def _train_models(self):
        """Entrena modelos KNN con diferentes métricas."""
        for metric in self.metrics:
            start_time = time.time()
            
            model = NearestNeighbors(
                metric=metric,
                algorithm='brute',
                n_neighbors=5,
                n_jobs=-1
            )
            model.fit(self.ratings_csr)
            
            end_time = time.time()
            training_time = end_time - start_time
            
            self.models[metric] = model
            self.training_times[metric] = training_time
    
    def _get_object_size_mb(self, obj):
        """Calcula el tamaño en MB de un objeto en memoria."""
        return sys.getsizeof(obj) / (1024 * 1024)
    
    def _get_file_size_mb(self, filepath):
        """Calcula el tamaño en MB de un archivo en disco."""
        try:
            return os.path.getsize(filepath) / (1024 * 1024)
        except FileNotFoundError:
            return 0
    
    def _get_deep_size_mb(self, obj):
        """Calcula el tamaño aproximado en MB incluyendo objetos anidados."""
        size = sys.getsizeof(obj)
        
        if isinstance(obj, dict):
            size += sum(sys.getsizeof(k) + sys.getsizeof(v) for k, v in obj.items())
        elif isinstance(obj, (list, tuple)):
            size += sum(sys.getsizeof(item) for item in obj)
        elif hasattr(obj, '__dict__'):
            size += sum(sys.getsizeof(k) + sys.getsizeof(v) for k, v in obj.__dict__.items())
        
        return size / (1024 * 1024)
    
    def info(self):
        """
        Retorna información completa sobre el sistema: métricas, tiempos de entrenamiento,
        tamaños de datos en memoria y en disco.
        
        Returns:
            dict: Información detallada del sistema
        """
        # Información básica
        info_dict = {
            'metrics': self.metrics,
            'training_times': self.training_times.copy(),
            'dataset_info': {
                'num_users': self.ratings_csr.shape[0],
                'num_movies': self.ratings_csr.shape[1],
                'num_ratings': self.ratings_csr.nnz,
                'sparsity': 1 - (self.ratings_csr.nnz / (self.ratings_csr.shape[0] * self.ratings_csr.shape[1]))
            },
            'models_info': {
                metric: {
                    'n_neighbors': model.n_neighbors,
                    'algorithm': model.algorithm,
                    'metric': model.metric
                } for metric, model in self.models.items()
            }
        }
        
        # Información de memoria RAM (en MB)
        memory_info = {
            'ratings_csr_mb': self._get_object_size_mb(self.ratings_csr),
            'user_mapper_mb': self._get_deep_size_mb(self.user_mapper),
            'movie_mapper_mb': self._get_deep_size_mb(self.movie_mapper),
            'reverse_movie_mapper_mb': self._get_deep_size_mb(self.reverse_movie_mapper),
            'df_movies_mb': self.df_movies.memory_usage(deep=True).sum() / (1024 * 1024),
            'models_mb': {
                metric: self._get_deep_size_mb(model) for metric, model in self.models.items()
            }
        }
        
        # Calcular total de RAM de modelos
        total_models_mb = sum(memory_info['models_mb'].values())
        memory_info['total_models_mb'] = total_models_mb
        
        # Calcular total de RAM del sistema
        total_system_mb = (
            memory_info['ratings_csr_mb'] + 
            memory_info['user_mapper_mb'] + 
            memory_info['movie_mapper_mb'] + 
            memory_info['reverse_movie_mapper_mb'] + 
            memory_info['df_movies_mb'] + 
            total_models_mb
        )
        memory_info['total_system_mb'] = total_system_mb
        
        # Información de archivos en disco (en MB)
        disk_info = {
            'ratings_csr_npz_mb': self._get_file_size_mb(f"{self.path}/ratings_csr.npz"),
            'mappers_pkl_mb': self._get_file_size_mb(f"{self.path}/mappers.pkl"),
            'movies_bayesian_csv_mb': self._get_file_size_mb(f"{self.path}/movies_bayesian.csv")
        }
        
        # Calcular total de archivos en disco
        total_disk_mb = sum(disk_info.values())
        disk_info['total_disk_mb'] = total_disk_mb
        
        # Estadísticas de eficiencia
        efficiency_info = {
            'memory_to_disk_ratio': total_system_mb / total_disk_mb if total_disk_mb > 0 else 0,
            'ratings_compression_ratio': memory_info['ratings_csr_mb'] / disk_info['ratings_csr_npz_mb'] if disk_info['ratings_csr_npz_mb'] > 0 else 0,
            'models_overhead_ratio': total_models_mb / memory_info['ratings_csr_mb'] if memory_info['ratings_csr_mb'] > 0 else 0
        }
        
        # Agregar toda la información al diccionario principal
        info_dict.update({
            'memory_usage_mb': memory_info,
            'disk_usage_mb': disk_info,
            'efficiency_metrics': efficiency_info
        })
        
        return info_dict
    
    def obtener_vecinos(self, user_id, metric='cosine', k=5):
        """
        Obtiene los k vecinos más similares a un usuario usando la métrica especificada.
        
        Args:
            user_id (int): ID del usuario (índice en la matriz)
            metric (str): Métrica a usar ('cosine', 'euclidean', 'manhattan')
            k (int): Número de vecinos a retornar
            
        Returns:
            tuple: (distancias, indices) de los vecinos más similares
        """
        if metric not in self.models:
            raise ValueError(f"Métrica '{metric}' no disponible. Opciones: {self.metrics}")
        
        model = self.models[metric]
        # Temporalmente cambiar n_neighbors si es necesario
        if k != model.n_neighbors:
            temp_model = NearestNeighbors(
                metric=metric,
                algorithm='brute',
                n_neighbors=k,
                n_jobs=-1
            )
            temp_model.fit(self.ratings_csr)
            distancias, indices = temp_model.kneighbors(self.ratings_csr[user_id])
        else:
            distancias, indices = model.kneighbors(self.ratings_csr[user_id])
        
        return distancias[0], indices[0]
    
    def agregar_o_actualizar_usuario(self, nuevas_valoraciones, user_id=None):
        """
        Agrega un nuevo usuario o actualiza uno existente y reentrena todos los modelos.
        
        Args:
            nuevas_valoraciones (dict): {movieId: rating}
            user_id (int, optional): ID del usuario a actualizar. None para nuevo usuario.
            
        Returns:
            tuple: (updated_ratings_csr, user_id, training_times_dict)
        """
        num_peliculas = self.ratings_csr.shape[1]
        nueva_fila = np.zeros(num_peliculas)
        
        # Crear vector de ratings
        for mid, rating in nuevas_valoraciones.items():
            if mid in self.movie_mapper:
                col_idx = self.movie_mapper[mid]
                nueva_fila[col_idx] = rating
        
        nueva_fila_sparse = csr_matrix(nueva_fila)
        
        # Actualizar matriz
        if user_id is None:
            # Crear nuevo usuario
            self.ratings_csr = vstack([self.ratings_csr, nueva_fila_sparse])
            user_id = self.ratings_csr.shape[0] - 1
        else:
            # Actualizar usuario existente
            self.ratings_csr[user_id] = nueva_fila_sparse
        
        # Reentrenar todos los modelos y medir tiempos
        new_training_times = {}
        for metric in self.metrics:
            start_time = time.time()
            
            model = NearestNeighbors(
                metric=metric,
                algorithm='brute',
                n_neighbors=5,
                n_jobs=-1
            )
            model.fit(self.ratings_csr)
            
            end_time = time.time()
            training_time = end_time - start_time
            
            self.models[metric] = model
            new_training_times[metric] = training_time
        
        self.training_times = new_training_times
        
        return self.ratings_csr, user_id, new_training_times
    
    def recomendar_peliculas(self, user_id, metric='cosine', top_k=5, top_n=10):
        """
        Recomienda películas basándose en usuarios similares usando la métrica especificada.
        
        Args:
            user_id (int): ID del usuario
            metric (str): Métrica a usar para encontrar vecinos
            top_k (int): Número de vecinos a considerar
            top_n (int): Número de recomendaciones a retornar
            
        Returns:
            list: Lista de tuplas (movie_id, title, bayesian_rating)
        """
        # Obtener vecinos
        distancias, vecinos = self.obtener_vecinos(user_id, metric, top_k)
        vecinos = [v for v in vecinos if v != user_id]
        
        # Obtener películas vistas por el usuario
        user_ratings = self.ratings_csr[user_id].toarray().flatten()
        peliculas_vistas = set(np.where(user_ratings > 0)[0])
        
        # Sumar ratings de vecinos por película
        scores = {}
        for vecino_id in vecinos:
            vecino_ratings = self.ratings_csr[vecino_id].toarray().flatten()
            for idx, rating in enumerate(vecino_ratings):
                if rating > 0 and idx not in peliculas_vistas:
                    scores[idx] = scores.get(idx, []) + [rating]
        
        # Obtener promedio de cada película recomendada
        scores_avg = [(idx, np.mean(ratings)) for idx, ratings in scores.items()]
        
        # Ordenar según bayesian_rating
        def get_bayesian(idx):
            movie_id = self.reverse_movie_mapper[idx]
            row = self.df_movies[self.df_movies["movieId"] == movie_id]
            return row["bayesian_rating"].values[0] if not row.empty and "bayesian_rating" in row.columns else 0
        
        scores_avg.sort(key=lambda x: get_bayesian(x[0]), reverse=True)
        
        # Traducir a títulos
        recomendaciones = []
        for idx, _ in scores_avg[:top_n]:
            movie_id = self.reverse_movie_mapper[idx]
            row = self.df_movies[self.df_movies["movieId"] == movie_id]
            if not row.empty:
                row = row.iloc[0]
                bayesian_rating = row.get("bayesian_rating", np.nan)
                recomendaciones.append((movie_id, row["title"], bayesian_rating))
        
        return recomendaciones
    
    def recomendar_por_genero(self, user_id, genero_objetivo, metric='cosine', top_k=5, top_n=10):
        """
        Recomienda películas de un género específico basándose en usuarios similares.
        
        Args:
            user_id (int): ID del usuario
            genero_objetivo (str): Género objetivo (ej: 'Action', 'Comedy')
            metric (str): Métrica a usar para encontrar vecinos
            top_k (int): Número de vecinos a considerar
            top_n (int): Número de recomendaciones a retornar
            
        Returns:
            list: Lista de tuplas (movie_id, title, genres, bayesian_rating)
        """
        # Obtener vecinos
        distancias, vecinos = self.obtener_vecinos(user_id, metric, top_k)
        vecinos = [v for v in vecinos if v != user_id]
        
        # Películas vistas por el usuario
        user_ratings = self.ratings_csr[user_id].toarray().flatten()
        peliculas_vistas = set(np.where(user_ratings > 0)[0])
        
        # Obtener películas candidatas de vecinos que no haya visto
        scores = {}
        for vecino_id in vecinos:
            vecino_ratings = self.ratings_csr[vecino_id].toarray().flatten()
            for idx, rating in enumerate(vecino_ratings):
                if rating > 0 and idx not in peliculas_vistas:
                    movie_id = self.reverse_movie_mapper[idx]
                    row = self.df_movies[self.df_movies["movieId"] == movie_id]
                    if not row.empty and genero_objetivo in row["genres"].values[0].split('|'):
                        scores[idx] = scores.get(idx, []) + [rating]
        
        if not scores:
            return []
        
        # Promedio por película
        scores_avg = [(idx, np.mean(ratings)) for idx, ratings in scores.items()]
        
        # Ordenar por bayesian_rating
        def get_bayesian(idx):
            movie_id = self.reverse_movie_mapper[idx]
            row = self.df_movies[self.df_movies["movieId"] == movie_id]
            return row["bayesian_rating"].values[0] if not row.empty and "bayesian_rating" in row.columns else 0
        
        scores_avg.sort(key=lambda x: get_bayesian(x[0]), reverse=True)
        
        # Retornar recomendaciones
        recomendaciones = []
        for idx, _ in scores_avg[:top_n]:
            movie_id = self.reverse_movie_mapper[idx]
            row = self.df_movies[self.df_movies["movieId"] == movie_id]
            if not row.empty:
                row = row.iloc[0]
                bayesian_rating = row.get("bayesian_rating", np.nan)
                recomendaciones.append((movie_id, row["title"], row["genres"], bayesian_rating))
        
        return recomendaciones



In [5]:
sistema = MovieRecommendationSystem("./backend/data/processed")

In [6]:
info = sistema.info()
info

{'metrics': ['cosine', 'euclidean', 'manhattan'],
 'training_times': {'cosine': 0.3145918846130371,
  'euclidean': 0.2625236511230469,
  'manhattan': 0.32501935958862305},
 'dataset_info': {'num_users': 200948,
  'num_movies': 84432,
  'num_ratings': 32000204,
  'sparsity': 0.9981139118693746},
 'models_info': {'cosine': {'n_neighbors': 5,
   'algorithm': 'brute',
   'metric': 'cosine'},
  'euclidean': {'n_neighbors': 5, 'algorithm': 'brute', 'metric': 'euclidean'},
  'manhattan': {'n_neighbors': 5,
   'algorithm': 'brute',
   'metric': 'manhattan'}},
 'memory_usage_mb': {'ratings_csr_mb': 4.57763671875e-05,
  'user_mapper_mb': 21.498424530029297,
  'movie_mapper_mb': 7.331325531005859,
  'reverse_movie_mapper_mb': 7.331325531005859,
  'df_movies_mb': np.float64(15.583353042602539),
  'models_mb': {'cosine': 0.0014200210571289062,
   'euclidean': 0.0014257431030273438,
   'manhattan': 0.0014257431030273438},
  'total_models_mb': 0.004271507263183594,
  'total_system_mb': np.float64(51.

In [7]:
vecinos_cosine = sistema.obtener_vecinos(user_id=100, metric='cosine', k=10)
vecinos_cosine
# Notese que son 2 arrays: distancias e indices

(array([0.        , 0.55106267, 0.55190484, 0.55525065, 0.56612364,
        0.57059942, 0.57984665, 0.5823442 , 0.58408302, 0.58787544]),
 array([   100,  10883,  86902,  57850,   9643, 166345, 184179,  44756,
         42925,  78853]))

In [8]:
nuevas_valoraciones = {1: 4.0, 32: 5.0, 589: 3.0}
ratings_csr, nuevo_user_id, tiempos = sistema.agregar_o_actualizar_usuario(nuevas_valoraciones)
tiempos

{'cosine': 0.11742782592773438,
 'euclidean': 0.10264301300048828,
 'manhattan': 0.11008763313293457}

In [9]:
recomendaciones = sistema.recomendar_peliculas(nuevo_user_id, metric='cosine', top_n=5)
recomendaciones

[(np.int64(253),
  'Interview with the Vampire: The Vampire Chronicles (1994)',
  np.float64(4.315807373985117)),
 (np.int64(34), 'Babe (1995)', np.float64(4.193607022091446)),
 (np.int64(494), 'Executive Decision (1996)', np.float64(4.176147550561679)),
 (np.int64(480), 'Jurassic Park (1993)', np.float64(4.147962405359859)),
 (np.int64(260),
  'Star Wars: Episode IV - A New Hope (1977)',
  np.float64(4.111556510496423))]

In [12]:
recomendaciones_accion = sistema.recomendar_por_genero(nuevo_user_id, 'Action', metric='manhattan', top_n=5)
recomendaciones_accion

[(np.int64(1304),
  'Butch Cassidy and the Sundance Kid (1969)',
  'Action|Western',
  np.float64(4.4021599934240605)),
 (np.int64(2115),
  'Indiana Jones and the Temple of Doom (1984)',
  'Action|Adventure|Fantasy',
  np.float64(4.115548533285113)),
 (np.int64(2723),
  'Mystery Men (1999)',
  'Action|Comedy|Fantasy',
  np.float64(3.838182156641533)),
 (np.int64(2640),
  'Superman (1978)',
  'Action|Adventure|Sci-Fi',
  np.float64(3.6757772234316137)),
 (np.int64(370),
  'Naked Gun 33 1/3: The Final Insult (1994)',
  'Action|Comedy',
  np.float64(3.619066406017865))]