# 01 - Modelado Base

## Descripción
Este notebook tiene como objetivo realizar una carga inicial de los datos, estructurarlos (especialmente datos anidados en formato JSON), y realizar un análisis exploratorio rápido utilizando `pandas-profiling`. Además, se documentarán observaciones y hallazgos simples, y se proporcionará un resumen con los siguientes pasos planeados

# 1 - Dependencias:

### 1.1 Instalación de dependencias

In [1]:
# # NPL Clasic
# !pip install spacy
# # Transformer
# !pip install sentence_transformers
# !pip install faiss-cpu
# #python -m spacy download es_core_news_lg # model large
# !python -m spacy download es_core_news_sm # model small
# !pip install swifter
# !pip install rapidfuzz
!pip install ydata_profiling
!pip install surprise
!pip install optuna

Collecting ydata_profiling
  Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata_profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata_profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata_profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata_profiling)
  Downloading dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata_profiling)
  Downloading pywavelets-1.

### 1.2 Importación de dependencias

In [2]:
# Complementos
import warnings
from pathlib import Path
import gc
import requests

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-procesamiento
from ydata_profiling import ProfileReport
from typing import List
import re
import json
import ast
import pandas as pd
import numpy as np
from dateutil.parser import parse

# NPL Clasic
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud
from collections import Counter

### 1.3 Conexión a Datasets desde Google Drive

In [3]:
from google.colab import files
from google.colab import drive

### 1.4 Constantes

# 2 - Funciones:

### 2.1 Funciones de procesamiento inicial datasets

In [5]:
def clean_data_init(df: pd.DataFrame, path: str, file_name: str):
    """
    Limpia un df de pandas, eliminando estructuras anidadas,
    columnas irrelevantes y optimizando tipos de datos.

    Args:
        df (pd.DataFrame): df original a limpiar.
        path (str): Directorio donde se guardará el reporte de pandas-profiling.
        file_name (str): Nombre del archivo para el reporte HTML.

    Returns:
        Tuple[pd.DataFrame, ProfileReport]: df limpio y reporte de perfilado.
    """

    def flatten_json(value, prefix=''):
        # Convierte estructuras anidadas ej. 'JSON, listas' en columnas planas
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return {prefix: value}

        if isinstance(value, dict):
            return {f'{prefix}_{k}': v for k, v in value.items()} or {prefix: np.nan}

        if isinstance(value, list):
            if not value:
                return {prefix: np.nan}
            elif all(isinstance(i, dict) for i in value):
                return {f'{prefix}_{k}': v for item in value for k, v in item.items()}
            return {f'{prefix}_{i}': v for i, v in enumerate(value)}

        return {prefix: value}

    # Aplica transformacion
    df_expanded = pd.concat(
        [pd.json_normalize(df[col].apply(lambda x: flatten_json(x, col))) for col in df.columns],
        axis=1
    )

    # # Elimina columnas con 'id', 'url' o 'permalink'
    # df_expanded.columns = df_expanded.columns.astype(str)
    # df_clean = df_expanded.loc[:, ~df_expanded.columns.str.contains(r'\b(id|url|permalink)\b', case=False, regex=True)]

    # Convierte tipos de datos validando fechas
    df_clean = df_expanded.copy()
    for col in df_clean.select_dtypes(include=['object']):
        try:
            sample_values = df_clean[col].dropna().sample(n=min(10, len(df_clean[col].dropna())), random_state=42).tolist()
            if all(isinstance(parse(val, fuzzy=True), pd.Timestamp) for val in sample_values):
                df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce', infer_datetime_format=True)
        except Exception:
            continue

    # Genera reporte de pandas-profiling
    output_path = Path(path) / f'{file_name}.html'
    try:
        profile = ProfileReport(df_clean, explorative=True)
        profile.to_file(output_path)
    except Exception as e:
        print(f"No se pudo generar el reporte de perfilado: {e}")
        profile = None

    return df_clean.reset_index(drop=True), profile

# 3 - Carga de Datos:

In [25]:
# directorio base
drive.mount('/content/drive') # util cuando se tienen los datos en almacenamiento de drive
PATH_GLOBAL = Path('/content/drive/MyDrive/Pruebas/Prueba_Compensar_Data_Science/User-Centric-Recommender-Engine/') # ajustar
PATH_DATASET = PATH_GLOBAL / 'data'
PATH_REPORT = PATH_GLOBAL / 'reports'
PATH_IMG = PATH_GLOBAL / 'img'
PATH_MODELS = PATH_GLOBAL / 'models'

In [7]:
print("Loading dataset...")
users = pd.read_csv(PATH_DATASET / 'users.csv')
products = pd.read_csv(PATH_DATASET / 'products.csv', sep=';')
interactions = pd.read_csv(PATH_DATASET / 'interactions.csv')

Loading dataset...


In [8]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           50000 non-null  int64  
 1   product_id        50000 non-null  int64  
 2   tipo_interaccion  50000 non-null  object 
 3   rating            10324 non-null  float64
 4   comentario        10436 non-null  object 
 5   timestamp         50000 non-null  object 
 6   metodo_pago       50000 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 2.7+ MB


In [None]:
users.user_id.nunique(), products.product_id.nunique(), interactions.user_id.nunique(), interactions.product_id.nunique()

(5000, 2000, 4999, 1999)

In [98]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", SyntaxWarning)
    warnings.simplefilter("ignore", FutureWarning)
    products, report_prod = clean_data_init(products, PATH_REPORT, 'report_products')
    users, report_user = clean_data_init(users, PATH_REPORT, 'report_users')
    interactions, report_interact = clean_data_init(interactions, PATH_REPORT, 'report_interactions')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 31.9 s, sys: 12.3 s, total: 44.1 s
Wall time: 43.8 s


In [None]:
# ----------------------
# PERFILAMIENTO DE DATOS
# ----------------------
def profile_data(df, name):
    print(f"\n===== Perfilamiento de {name} =====")
    print(df.info())
    print("\nValores Nulos:\n", df.isnull().sum())
    print("\nValores Únicos:\n", df.nunique())
    print("\nEjemplo de Datos:\n", df.head())

profile_data(users, "Usuarios")
profile_data(products, "Productos")
profile_data(interactions, "Interacciones")


===== Perfilamiento de Usuarios =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            5000 non-null   int64 
 1   edad               5000 non-null   int64 
 2   genero             5000 non-null   object
 3   nivel_ingresos     5000 non-null   object
 4   nivel_educativo    5000 non-null   object
 5   intereses          5000 non-null   object
 6   tipo_suscripcion   5000 non-null   object
 7   categoria_cliente  5000 non-null   object
 8   ubicacion          5000 non-null   object
 9   dispositivo        5000 non-null   object
 10  frecuencia_login   5000 non-null   object
dtypes: int64(2), object(9)
memory usage: 429.8+ KB
None

Valores Nulos:
 user_id              0
edad                 0
genero               0
nivel_ingresos       0
nivel_educativo      0
intereses            0
tipo_suscripcion     0
categori

# 4 - Desarrollo de Modelos

## 4.1 - Modelo Filtrado Basado en Contenido:

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import nltk
import json
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words_spanish = stopwords.words('spanish')


# ---- 1️⃣ Cargar y procesar datos ----
def load_and_clean_data():
    # Cargar datos
    print("Loading dataset...")
    users = pd.read_csv(PATH_DATASET / 'users.csv')
    products = pd.read_csv(PATH_DATASET / 'products.csv', sep=';')
    interactions = pd.read_csv(PATH_DATASET / 'interactions.csv')

    # Convertir timestamp a datetime
    interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])

    # Seleccionar solo productos con stock disponible
    #products = products[products['stock_actual'] > 0]

    return users, products, interactions

print("Cleaning dataset...")
users, products, interactions = load_and_clean_data()

# ---- 2️⃣ Procesar características ----
def process_text_features(products):
    tfidf = TfidfVectorizer(max_features=1000, stop_words=stop_words_spanish)
    tfidf_matrix = tfidf.fit_transform(products['palabras_clave'].fillna(''))
    return tfidf_matrix

def process_numerical_features(products):
    scaler = MinMaxScaler()
    numerical_features = products[['precio', 'descuento_aplicado', 'stock_actual', 'rating_promedio']]
    return scaler.fit_transform(numerical_features)

def process_category_onehot(products):
    encoder = OneHotEncoder(handle_unknown='ignore')
    return encoder.fit_transform(products[['category']].fillna(''))

# ---- 3️⃣ Función para recomendar productos ----
def recommend_products_for_user(user_id, top_n=5):
    user_interactions = interactions[interactions['user_id'] == user_id]
    if user_interactions.empty:
        return []  # Usuario sin interacciones previas

    # Tomar el último producto que vio
    last_product_id = user_interactions.sort_values(by='timestamp', ascending=False).iloc[0]['product_id']

    # Obtener índice del producto en la matriz
    idx = products.index[products['product_id'] == last_product_id].tolist()[0]
    similarity_scores = list(enumerate(similarity_matrix[idx]))

    # Ordenar por similitud
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    #products = products[products['stock_actual'] > 0]
    recommended_products = [products.iloc[i[0]] for i in similarity_scores[1:top_n+1]]

    return recommended_products

def recommend_products_for_user_v2(user_id, top_n=5):
    user_interactions = interactions[interactions['user_id'] == user_id]
    if user_interactions.empty:
        print(f"❌ Usuario {user_id} no tiene interacciones.")
        return []

    # Último producto visto
    last_product_id = user_interactions.sort_values(by='timestamp', ascending=False).iloc[0]['product_id']
    print(f"🟢 Último producto visto: {last_product_id}")

    # Buscar en la lista de productos
    product_idx = products.index[products['product_id'] == last_product_id].tolist()
    print(f"🔎 Índices encontrados: {product_idx}")
    if not product_idx:
        return []  # No hay coincidencia en productos

    idx = product_idx[0]

    similarity_scores = list(enumerate(similarity_matrix[idx]))

    # Ordenar por similitud y seleccionar los mejores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    recommended_products = [products.iloc[i[0]] for i in similarity_scores[1:top_n+1]]

    return recommended_products

  # Generar matriz de similitud
tfidf_matrix = process_text_features(products)
numerical_features = process_numerical_features(products)
category_features = process_category_onehot(products)
feature_matrix = hstack([tfidf_matrix, numerical_features, category_features])
similarity_matrix = cosine_similarity(feature_matrix)

user_id = 11
recommended_products = recommend_products_for_user_v2(user_id, top_n=5)

recommendations = [
    {
        "product_id": int(row["product_id"]),  # Convertir a int nativo
        "name": str(row["name"]),
        "category": str(row["category"])
    }
    for row in recommended_products  # Iterar directamente sobre la lista
]

# Estructurar el JSON
output = {
    "user_id": user_id,
    "recommendations": recommendations
}

# Imprimir en formato JSON
print(json.dumps(output, indent=4, ensure_ascii=False))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning dataset...
Loading dataset...
🟢 Último producto visto: 1626
🔎 Índices encontrados: [1625]
{
    "user_id": 11,
    "recommendations": [
        {
            "product_id": 1131,
            "name": "Spa y Masajes",
            "category": "Salud"
        },
        {
            "product_id": 1068,
            "name": "Spa y Masajes",
            "category": "Salud"
        },
        {
            "product_id": 513,
            "name": "Monitor de Ritmo Cardíaco",
            "category": "Salud"
        },
        {
            "product_id": 756,
            "name": "Spa y Masajes",
            "category": "Salud"
        },
        {
            "product_id": 96,
            "name": "Clases de Yoga Online",
            "category": "Salud"
        }
    ]
}


In [18]:
import numpy as np
from sklearn.model_selection import train_test_split


# ---- 1️⃣ Funciones de Métricas ----
def apk(actual, predicted, k=5):
    """Calcula Average Precision at K (AP@K)."""
    if not actual:
        return 0.0

    predicted = predicted[:k]
    score, num_hits = 0.0, 0.0

    for i, p in enumerate(predicted, 1):
        if p in actual:
            num_hits += 1.0
            score += num_hits / i  # Precisión acumulada en cada punto relevante

    return score / min(len(actual), k)


def mapk(actual_list, predicted_list, k=5):
    """Calcula Mean Average Precision at K (MAP@K)."""
    return np.mean([apk(actual, predicted, k) for actual, predicted in zip(actual_list, predicted_list)])


def dcg_at_k(relevance, k=5):
    """Calcula Discounted Cumulative Gain (DCG@K)."""
    return sum((rel / np.log2(idx + 2)) for idx, rel in enumerate(relevance[:k]))


def ndcg_at_k(actual, predicted, k=5):
    """Calcula Normalized Discounted Cumulative Gain (NDCG@K)."""
    ideal_relevance = sorted(actual, reverse=True)  # Orden ideal de relevancia
    return dcg_at_k(actual, k) / (dcg_at_k(ideal_relevance, k) + 1e-10)


def hit_rate_at_k(actual, predicted, k=5):
    """Calcula Hit Rate (HR@K), mide si al menos un ítem relevante aparece en el Top-K."""
    return 1 if any(p in actual for p in predicted[:k]) else 0


def mean_reciprocal_rank(actual_list, predicted_list):
    """Calcula Mean Reciprocal Rank (MRR)."""
    reciprocal_ranks = [
        1 / (predicted.index(a) + 1) if a in predicted else 0
        for actual, predicted in zip(actual_list, predicted_list) for a in actual
    ]
    return np.mean(reciprocal_ranks)


# ---- 2️⃣ Función de Evaluación ----
def evaluate_recommendation_model(recommend_function, interactions, products, test_size=0.2, k=5):
    """
    Evalúa el modelo de recomendación usando MAP@K, NDCG@K, HR@K y MRR.

    :param recommend_function: Función que genera recomendaciones para un usuario.
    :param interactions: DataFrame con interacciones de usuario-producto.
    :param products: DataFrame con productos.
    :param test_size: Proporción del dataset reservada para pruebas (default: 20%).
    :param k: Número de recomendaciones a evaluar.
    :return: Diccionario con las métricas de evaluación.
    """
    train_data, test_data = train_test_split(interactions, test_size=test_size, random_state=42)

    actual_items = []
    predicted_items = []

    for user_id in test_data["user_id"].unique():
        actual = test_data[test_data["user_id"] == user_id]["product_id"].tolist()
        predicted = recommend_products_for_user(user_id, top_n=k)
        predicted = [p["product_id"] for p in predicted]  # Extraer product_id

        if actual:
            actual_items.append(actual)
            predicted_items.append(predicted)

    # Cálculo de métricas
    metrics = {
        "MAP@K": mapk(actual_items, predicted_items, k),
        "NDCG@K": np.mean([ndcg_at_k(actual, predicted, k) for actual, predicted in zip(actual_items, predicted_items)]),
        "HR@K": np.mean([hit_rate_at_k(actual, predicted, k) for actual, predicted in zip(actual_items, predicted_items)]),
        "MRR": mean_reciprocal_rank(actual_items, predicted_items),
    }

    return metrics


# ---- 3️⃣ Uso de la Evaluación ----
# Aquí evaluamos la función `recommend_products_for_user` ya implementada.
metrics_results = evaluate_recommendation_model(recommend_products_for_user, interactions, products, k=5)
print(json.dumps(metrics_results, indent=4, ensure_ascii=False))

{
    "MAP@K": 0.0011399214726429045,
    "NDCG@K": 0.9287890037209244,
    "HR@K": 0.0052776502983019734,
    "MRR": 0.00126
}


## 4.2 - Modelo Filtrado Colaborativo

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import nltk
import json
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix

nltk.download('stopwords')
stop_words_spanish = stopwords.words('spanish')


# ---- 1️⃣ Cargar y procesar datos ----
def load_and_clean_data():
    # Cargar datos
    print("Loading dataset...")
    users = pd.read_csv(PATH_DATASET / 'users.csv')
    products = pd.read_csv(PATH_DATASET / 'products.csv', sep=';')
    interactions = pd.read_csv(PATH_DATASET / 'interactions.csv')

    # Convertir timestamp a datetime
    interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])

    # Seleccionar solo productos con stock disponible
    #products = products[products['stock_actual'] > 0]

    return users, products, interactions

users, products, interactions = load_and_clean_data()

def build_user_product_matrix(interactions, products):
    print("🔹 Creando matriz usuario-producto...")

    # Filtrar interacciones válidas
    interactions = interactions.dropna(subset=['user_id', 'product_id'])

    # Asegurar que los productos en interacciones están en products
    interactions = interactions[interactions['product_id'].isin(products['product_id'])]

    # Crear índices de usuarios y productos
    user_idx = {user: idx for idx, user in enumerate(interactions['user_id'].unique())}
    product_idx = {product: idx for idx, product in enumerate(products['product_id'].unique())}

    # Verificar dimensiones
    if len(user_idx) == 0 or len(product_idx) == 0:
        raise ValueError("Los índices de usuario o producto están vacíos.")

    # Obtener coordenadas y valores
    rows = interactions['user_id'].map(user_idx).dropna().astype(int).values
    cols = interactions['product_id'].map(product_idx).dropna().astype(int).values
    data = np.ones(len(rows))  # Presencia de interacción

    # Validación de tamaños
    if len(rows) != len(cols):
        raise ValueError(f"Tamaño inconsistente: filas={len(rows)}, columnas={len(cols)}")

    # Construir matriz dispersa
    user_product_matrix = coo_matrix((data, (rows, cols)), shape=(len(user_idx), len(product_idx)))

    return user_product_matrix, user_idx, product_idx, products[products['product_id'].isin(interactions['product_id'])]

# Llamar función
user_product_matrix, user_idx, product_idx, available_products = build_user_product_matrix(interactions, products)

# ---- 3️⃣ Modelo de Filtrado Colaborativo (kNN) ----
def train_knn_model(user_product_matrix):
    print("🔹 Entrenando modelo kNN...")
    model = NearestNeighbors(metric="cosine", algorithm="brute")
    model.fit(user_product_matrix)
    return model

knn_model = train_knn_model(user_product_matrix)

# ---- 4️⃣ Recomendación de Productos ----
def recommend_products_for_user_fc(user_id, top_n=5):
    if user_id not in user_idx:
        print(f"❌ Usuario {user_id} no encontrado en la matriz.")
        return []

    # Convertir matriz a CSR para permitir indexación
    user_product_matrix_csr = user_product_matrix.tocsr()

    # Obtener vector de interacciones del usuario
    user_vector = user_product_matrix_csr[user_idx[user_id]]

    # Encontrar usuarios similares con kNN
    distances, neighbors = knn_model.kneighbors(user_vector.toarray(), n_neighbors=top_n+1)

    # Obtener recomendaciones de productos
    recommended_products = []
    for neighbor in neighbors[0][1:]:  # Omitir el propio usuario
        similar_user_id = list(user_idx.keys())[list(user_idx.values()).index(neighbor)]
        user_interactions = interactions[interactions['user_id'] == similar_user_id]
        recommended_products.extend(user_interactions['product_id'].tolist())

    # Filtrar productos únicos y limitar cantidad
    recommended_products = list(set(recommended_products))[:top_n]

    return products[products['product_id'].isin(recommended_products)]



# ---- 5️⃣ Generar Salida JSON ----
user_id = 11
recommended_products = recommend_products_for_user_fc(user_id, top_n=5)

recommendations = [
    {
        "product_id": int(row["product_id"]),
        "name": str(row["name"]),
        "category": str(row["category"])
    }
    for _, row in recommended_products.iterrows()
]

output = {
    "user_id": user_id,
    "recommendations": recommendations
}

print(json.dumps(output, indent=4, ensure_ascii=False))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading dataset...
🔹 Creando matriz usuario-producto...
🔹 Entrenando modelo kNN...
{
    "user_id": 11,
    "recommendations": [
        {
            "product_id": 386,
            "name": "Rutina de Ejercicios Personalizada",
            "category": "Deportes"
        },
        {
            "product_id": 1152,
            "name": "Curso de Mindfulness",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 1415,
            "name": "Vitaminas y Suplementos",
            "category": "Nutrición"
        },
        {
            "product_id": 1669,
            "name": "Curso de Meditación",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 1671,
            "name": "Clases de Yoga Online",
            "category": "Salud"
        }
    ]
}


In [97]:
keyword_to_interest = {
    "mindfulness": "bienestar mental",
    "terapia": "bienestar mental",
    "relajación": "bienestar mental",
    "ejercicio": "deportes",
    "entrenamiento": "deportes",
    "rendimiento": "deportes",
    "deporte": "deportes",
    "desarrollo": "desarrollo personal",
    "crecimiento": "desarrollo personal",
    "familia": "familia",
    "juguetes": "familia",
    "bienestar": "familia",
    "mascotas": "mascotas",
    "cuidado": "salud",
    "salud": "salud",
    "dieta": "nutrición",
    "alimentación": "nutrición",
    "suplementos": "nutrición",
    "fitness": "nutrición",
}

def process_user_features(users, interactions):
    mapeo_frecuencia = {'Diaria': 1, 'Semanal': 1/7, 'Mensual': 1/30}
    users['frecuencia_login'] = users['frecuencia_login'].map(mapeo_frecuencia)

    mapeo_nivel_ingresos = {'Alto': 3, 'Medio': 2, 'Bajo': 1}
    users['nivel_ingresos'] = users['nivel_ingresos'].map(mapeo_nivel_ingresos)

    encoder = OneHotEncoder(sparse_output=False)
    user_features = users[['genero', 'tipo_suscripcion', 'categoria_cliente', 'ubicacion', 'dispositivo', 'nivel_educativo']]
    encoded_features = encoder.fit_transform(user_features)

    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(users[['edad', 'nivel_ingresos', 'frecuencia_login']])

    user_features_matrix = np.hstack([scaled_features, encoded_features])
    user_features_df = pd.DataFrame(user_features_matrix, index=users['user_id'])
    user_features_df = user_features_df.loc[interactions['user_id'].unique()]

    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
    user_interest_matrix = vectorizer.fit_transform(users['intereses'])
    user_interest_df = pd.DataFrame(user_interest_matrix.toarray(),
                                    index=users['user_id'],
                                    columns=vectorizer.get_feature_names_out())

    return user_features_df, user_interest_df

def map_product_keywords_to_interests(product_vector):
    interest_vector = {interest: 0 for interest in set(keyword_to_interest.values())}
    for keyword, value in product_vector.items():
        if value == 1 and keyword in keyword_to_interest:
            interest_vector[keyword_to_interest[keyword]] += 1
    return pd.Series(interest_vector)

def build_user_product_matrix(interactions, products, user_features, user_interest_df):
    user_idx = {user: idx for idx, user in enumerate(interactions['user_id'].unique())}
    product_idx = {product: idx for idx, product in enumerate(products['product_id'].unique())}

    interactions['rating'] = interactions.apply(lambda row: row['rating'] if row['tipo_interaccion'] == 'Valoracion' else 1, axis=1)

    rows = interactions['user_id'].map(user_idx).dropna().astype(int).values
    cols = interactions['product_id'].map(product_idx).dropna().astype(int).values
    data = interactions['rating'].values

    user_product_matrix = coo_matrix((data, (rows, cols)), shape=(len(user_idx), len(product_idx)))
    user_feature_matrix = csr_matrix(user_features.loc[user_idx.keys()].values)

    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
    product_matrix = vectorizer.fit_transform(products['palabras_clave'])
    product_df = pd.DataFrame(product_matrix.toarray(),
                              index=products['product_id'],
                              columns=vectorizer.get_feature_names_out())

    product_interest_df = product_df.apply(map_product_keywords_to_interests, axis=1).fillna(0)

    full_user_matrix = hstack([user_product_matrix.tocsr(), user_feature_matrix])

    return full_user_matrix, user_idx, product_idx, product_interest_df

def train_knn_model(user_product_matrix):
    # Convierte a array denso y reemplaza NaN con 0
    dense_matrix = user_product_matrix.toarray()
    dense_matrix = np.nan_to_num(dense_matrix)

    print(f"Cantidad de NaN en matriz antes de entrenar: {np.isnan(dense_matrix).sum()}")  # Depuración

    model = NearestNeighbors(metric="cosine", algorithm="brute")
    model.fit(dense_matrix)
    return model


def recommend_products_by_interest(user_id, user_interest_df, product_interest_df, products, top_n=5):
    if user_id not in user_interest_df.index:
        return []

    user_idx = user_interest_df.index.get_loc(user_id)
    similarity_matrix = cosine_similarity(user_interest_df, product_interest_df)
    user_similarities = similarity_matrix[user_idx]
    recommended_product_indices = np.argsort(user_similarities)[::-1][:top_n]

    return products.iloc[recommended_product_indices][['product_id', 'name', 'category']]

users, products, interactions = load_and_clean_data()
user_features, user_interest_df = process_user_features(users, interactions)
user_product_matrix, user_idx, product_idx, product_interest_df = build_user_product_matrix(interactions, products, user_features, user_interest_df)
knn_model = train_knn_model(user_product_matrix)

user_id = 11
interest_recommendations = recommend_products_by_interest(user_id, user_interest_df, product_interest_df, products, top_n=5)

final_recommendations = interest_recommendations.drop_duplicates().head(5)
recommendations = [{"product_id": int(row["product_id"]), "name": str(row["name"]), "category": str(row["category"])} for _, row in final_recommendations.iterrows()]
output = {"user_id": user_id, "recommendations": recommendations}

print(json.dumps(output, indent=4, ensure_ascii=False))


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Cantidad de NaN en matriz antes de entrenar: 0
{
    "user_id": 11,
    "recommendations": [
        {
            "product_id": 1000,
            "name": "Vitaminas y Suplementos",
            "category": "Nutrición"
        },
        {
            "product_id": 997,
            "name": "Curso de Meditación",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 987,
            "name": "Curso de Meditación",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 988,
            "name": "Curso de Meditación",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 990,
            "name": "Curso de Meditación",
            "category": "Bienestar Mental"
        }
    ]
}


### Intereses

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizar los intereses de los usuarios
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
user_interest_matrix = vectorizer.fit_transform(users['intereses'])

# Convertir a DataFrame con índices de user_id
user_interest_df = pd.DataFrame(user_interest_matrix.toarray(),
                                index=users['user_id'],
                                columns=vectorizer.get_feature_names_out())
user_interest_df



Unnamed: 0_level_0,bienestar mental,deportes,desarrollo personal,familia,mascotas,nutrición,salud
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
4996,0,1,0,0,0,0,0
4997,0,0,0,0,0,1,0
4998,1,0,1,0,0,0,0
4999,0,0,1,0,0,1,0


In [81]:
product_matrix = vectorizer.fit_transform(products['palabras_clave'])
product_df = pd.DataFrame(product_matrix.toarray(),
                                index=products['product_id'],
                                columns=vectorizer.get_feature_names_out())
product_df



Unnamed: 0_level_0,alimentación,bienestar,cuidado,deporte,dieta,ejercicio,fitness,juguetes,mascotas,mindfulness,relajación,rendimiento,salud,suplementos,terapia
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
4,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
5,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
1997,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
1998,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
1999,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0


In [80]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---- 1️⃣ Mapeo de palabras clave de productos a intereses de usuarios ----
keyword_to_interest = {
    "mindfulness": "bienestar mental",
    "terapia": "bienestar mental",
    "relajación": "bienestar mental",

    "ejercicio": "deportes",
    "entrenamiento": "deportes",
    "rendimiento": "deportes",
    "deporte": "deportes",

    "desarrollo": "desarrollo personal",
    "crecimiento": "desarrollo personal",

    "familia": "familia",
    "juguetes": "familia",
    "bienestar": "familia",

    "mascotas": "mascotas",

    "cuidado": "salud",
    "salud": "salud",

    "dieta": "nutrición",
    "alimentación": "nutrición",
    "suplementos": "nutrición",
    "fitness": "nutrición",



}

# ---- 2️⃣ Transformar matriz de productos ----
def map_product_keywords_to_interests(product_vector):
    """Convierte la matriz de palabras clave en una matriz de intereses de usuarios."""
    interest_vector = {interest: 0 for interest in set(keyword_to_interest.values())}

    for keyword, value in product_vector.items():
        if value == 1 and keyword in keyword_to_interest:
            interest_vector[keyword_to_interest[keyword]] += 1  # Sumar si hay match

    return pd.Series(interest_vector)

# Crear vectorizador y transformar los productos
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
product_matrix = vectorizer.fit_transform(products['palabras_clave'])

product_df = pd.DataFrame(product_matrix.toarray(),
                          index=products['product_id'],
                          columns=vectorizer.get_feature_names_out())

# Aplicar mapeo para obtener la matriz con intereses de usuarios
product_interest_df = product_df.apply(map_product_keywords_to_interests, axis=1).fillna(0)
product_interest_df



Unnamed: 0_level_0,deportes,salud,familia,desarrollo personal,nutrición,bienestar mental,mascotas
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,0,0,0,0,3,0
2,2,0,0,0,1,0,0
3,0,0,0,0,0,3,0
4,2,0,0,0,1,0,0
5,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...
1996,0,0,0,0,0,3,0
1997,2,0,0,0,1,0,0
1998,1,1,1,0,0,0,0
1999,1,1,1,0,0,0,0


In [91]:
products.columns

Index(['product_id', 'name', 'category', 'descripcion', 'palabras_clave',
       'precio', 'rating_promedio', 'descuento_aplicado', 'stock_actual'],
      dtype='object')

In [92]:
products[['product_id', 'rating_promedio']]

Unnamed: 0,product_id,rating_promedio
0,1,3.0
1,2,2.5
2,3,5.0
3,4,3.0
4,5,2.5
...,...,...
1995,1996,4.0
1996,1997,2.5
1997,1998,4.5
1998,1999,4.0


In [None]:
interactions

In [82]:

# ---- 3️⃣ Crear matriz de intereses de usuarios ----
user_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
user_interest_matrix = user_vectorizer.fit_transform(users['intereses'])

user_interest_df = pd.DataFrame(user_interest_matrix.toarray(),
                                index=users['user_id'],
                                columns=user_vectorizer.get_feature_names_out())

# ---- 4️⃣ Calcular similitudes y recomendar productos ----
keyword_to_interest = {
    "mindfulness": "bienestar mental",
    "terapia": "bienestar mental",
    "relajación": "bienestar mental",

    "ejercicio": "deportes",
    "entrenamiento": "deportes",
    "rendimiento": "deportes",
    "deporte": "deportes",

    "desarrollo": "desarrollo personal",
    "crecimiento": "desarrollo personal",

    "familia": "familia",
    "juguetes": "familia",
    "bienestar": "familia",

    "mascotas": "mascotas",

    "cuidado": "salud",
    "salud": "salud",

    "dieta": "nutrición",
    "alimentación": "nutrición",
    "suplementos": "nutrición",
    "fitness": "nutrición",



}

# ---- 2️⃣ Transformar matriz de productos ----
def map_product_keywords_to_interests(product_vector):
    """Convierte la matriz de palabras clave en una matriz de intereses de usuarios."""
    interest_vector = {interest: 0 for interest in set(keyword_to_interest.values())}

    for keyword, value in product_vector.items():
        if value == 1 and keyword in keyword_to_interest:
            interest_vector[keyword_to_interest[keyword]] += 1  # Sumar si hay match

    return pd.Series(interest_vector)

# Crear vectorizador y transformar los productos
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
product_matrix = vectorizer.fit_transform(products['palabras_clave'])

product_df = pd.DataFrame(product_matrix.toarray(),
                          index=products['product_id'],
                          columns=vectorizer.get_feature_names_out())

# Aplicar mapeo para obtener la matriz con intereses de usuarios
product_interest_df = product_df.apply(map_product_keywords_to_interests, axis=1).fillna(0)
similarity_matrix = cosine_similarity(user_interest_df, product_interest_df)

def recommend_products_by_interest(user_id, top_n=5):
    if user_id not in user_interest_df.index:
        return []

    user_idx = user_interest_df.index.get_loc(user_id)

    # Obtener similitudes del usuario con los productos
    user_similarities = similarity_matrix[user_idx]

    # Ordenar productos por similitud
    recommended_product_indices = np.argsort(user_similarities)[::-1][:top_n]
    recommended_products = products.iloc[recommended_product_indices]

    return recommended_products[['product_id', 'name', 'category']]

# ---- 5️⃣ Prueba de Recomendación ----
user_id = 11
recommended_products = recommend_products_by_interest(user_id, top_n=5)

# Formatear salida JSON
recommendations = [{"product_id": int(row["product_id"]), "name": str(row["name"]), "category": str(row["category"])}
                   for _, row in recommended_products.iterrows()]

output = {"user_id": user_id, "recommendations": recommendations}
print(output)




{'user_id': 11, 'recommendations': [{'product_id': 1000, 'name': 'Vitaminas y Suplementos', 'category': 'Nutrición'}, {'product_id': 997, 'name': 'Curso de Meditación', 'category': 'Bienestar Mental'}, {'product_id': 987, 'name': 'Curso de Meditación', 'category': 'Bienestar Mental'}, {'product_id': 988, 'name': 'Curso de Meditación', 'category': 'Bienestar Mental'}, {'product_id': 990, 'name': 'Curso de Meditación', 'category': 'Bienestar Mental'}]}


In [65]:
products['palabras_clave']

Unnamed: 0,palabras_clave
0,"Mindfulness, Relajación, Terapia"
1,"Rendimiento, Deporte, Fitness"
2,"Relajación, Terapia, Mindfulness"
3,"Deporte, Fitness, Rendimiento"
4,"Mascotas, Juguetes, Cuidado"
...,...
1995,"Mindfulness, Terapia, Relajación"
1996,"Deporte, Rendimiento, Fitness"
1997,"Ejercicio, Bienestar, Salud"
1998,"Bienestar, Ejercicio, Salud"


In [61]:
# ---- 1️⃣ Cargar datos ----
users, products, interactions = load_and_clean_data()

# ---- 2️⃣ Procesar variables de usuario ----
def process_user_features(users, interactions):
    mapeo_frecuencia = {'Diaria': 1, 'Semanal': 1/7, 'Mensual': 1/30}
    users['frecuencia_login'] = users['frecuencia_login'].map(mapeo_frecuencia)

    mapeo_nivel_ingresos = {'Alto': 3, 'Medio': 2, 'Bajo': 1}
    users['nivel_ingresos'] = users['nivel_ingresos'].map(mapeo_nivel_ingresos)

    encoder = OneHotEncoder(sparse_output=False)
    user_features = users[['genero', 'tipo_suscripcion', 'categoria_cliente', 'ubicacion', 'dispositivo', 'nivel_educativo']]
    encoded_features = encoder.fit_transform(user_features)
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(users[['edad', 'nivel_ingresos', 'frecuencia_login']])

    # Convertir a DataFrame y unir con el user_id
    user_features_matrix = np.hstack([scaled_features, encoded_features])
    user_features_df = pd.DataFrame(user_features_matrix, index=users['user_id'])

    # Filtrar solo los usuarios que están en interacciones
    user_features_df = user_features_df.loc[interactions['user_id'].unique()]

    return user_features_df

user_features = process_user_features(users, interactions)

# ---- 3️⃣ Construir matriz usuario-producto ----
def build_user_product_matrix(interactions, products, user_features):
    user_idx = {user: idx for idx, user in enumerate(interactions['user_id'].unique())}
    product_idx = {product: idx for idx, product in enumerate(products['product_id'].unique())}

    rows = interactions['user_id'].map(user_idx).dropna().astype(int).values
    cols = interactions['product_id'].map(product_idx).dropna().astype(int).values
    data = interactions['rating'].fillna(1).values  # Usar rating en la matriz

    user_product_matrix = coo_matrix((data, (rows, cols)), shape=(len(user_idx), len(product_idx)))

    # Asegurar que user_features tiene el mismo orden que user_idx
    user_feature_matrix = csr_matrix(user_features.loc[user_idx.keys()].values)

    # Concatenar matriz usuario-producto con características de usuario
    full_user_matrix = hstack([user_product_matrix.tocsr(), user_feature_matrix])

    return full_user_matrix, user_idx, product_idx

user_product_matrix, user_idx, product_idx = build_user_product_matrix(interactions, products, user_features)

# ---- 4️⃣ Entrenar modelo kNN ----
def train_knn_model(user_product_matrix):
    model = NearestNeighbors(metric="cosine", algorithm="brute")
    model.fit(user_product_matrix)
    return model

knn_model = train_knn_model(user_product_matrix)

# ---- 5️⃣ Recomendaciones ----
def recommend_products(user_id, top_n=5):
    if user_id not in user_idx:
        return []

    user_vector = user_product_matrix[user_idx[user_id]]
    distances, neighbors = knn_model.kneighbors(user_vector, n_neighbors=top_n+1)

    # Obtener productos recomendados de los usuarios vecinos
    neighbor_users = [list(user_idx.keys())[n] for n in neighbors[0][1:]]
    recommended_products = interactions[interactions['user_id'].isin(neighbor_users)]['product_id'].unique()

    return products[products['product_id'].isin(recommended_products)].head(top_n)

# ---- 6️⃣ Generar salida JSON ----
user_id = 11
recommended_products = recommend_products(user_id, top_n=5)
recommendations = [{"product_id": int(row["product_id"]), "name": str(row["name"]), "category": str(row["category"])} for _, row in recommended_products.iterrows()]
output = {"user_id": user_id, "recommendations": recommendations}
print(json.dumps(output, indent=4, ensure_ascii=False))

Loading dataset...
{
    "user_id": 11,
    "recommendations": [
        {
            "product_id": 1,
            "name": "Sesión de Terapia Online",
            "category": "Bienestar Mental"
        },
        {
            "product_id": 39,
            "name": "Reloj Inteligente para Fitness",
            "category": "Deportes"
        },
        {
            "product_id": 52,
            "name": "Juguetes para Mascotas",
            "category": "Mascotas"
        },
        {
            "product_id": 78,
            "name": "Vitaminas y Suplementos",
            "category": "Nutrición"
        },
        {
            "product_id": 145,
            "name": "Accesorios para Entrenamiento",
            "category": "Deportes"
        }
    ]
}


In [62]:
metrics_results = evaluate_recommendation_model(recommend_products, interactions, products, k=5)
print(json.dumps(metrics_results, indent=4, ensure_ascii=False))

{
    "MAP@K": 0.0011399214726429045,
    "NDCG@K": 0.9287890037209244,
    "HR@K": 0.0052776502983019734,
    "MRR": 0.00126
}


In [20]:
metrics_results = evaluate_recommendation_model(recommend_products_for_user_fc, interactions, products, k=5)
print(json.dumps(metrics_results, indent=4, ensure_ascii=False))

{
    "MAP@K": 0.0011399214726429045,
    "NDCG@K": 0.9287890037209244,
    "HR@K": 0.0052776502983019734,
    "MRR": 0.00126
}


## 4.3 - Modelo Hibrido

In [32]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# ---- 1️⃣ Cargar y procesar datos ----
def load_and_clean_data():
    # Cargar datos
    print("Loading dataset...")
    users = pd.read_csv(PATH_DATASET / 'users.csv')
    products = pd.read_csv(PATH_DATASET / 'products.csv', sep=';')
    interactions = pd.read_csv(PATH_DATASET / 'interactions.csv')

    # Convertir timestamp a datetime
    interactions['timestamp'] = pd.to_datetime(interactions['timestamp'])

    # Seleccionar solo productos con stock disponible
    #products = products[products['stock_actual'] > 0]

    return users, products, interactions

users, products, interactions = load_and_clean_data()

# ---- 2️⃣ Construcción de la Matriz Usuario-Producto ----
def build_user_product_matrix(interactions, products):
    print("🔹 Creando matriz usuario-producto...")
    interactions = interactions.dropna(subset=['user_id', 'product_id'])
    interactions = interactions[interactions['product_id'].isin(products['product_id'])]

    user_idx = {user: idx for idx, user in enumerate(interactions['user_id'].unique())}
    product_idx = {product: idx for idx, product in enumerate(products['product_id'].unique())}

    rows = interactions['user_id'].map(user_idx).dropna().astype(int).values
    cols = interactions['product_id'].map(product_idx).dropna().astype(int).values
    data = np.ones(len(rows))

    user_product_matrix = coo_matrix((data, (rows, cols)), shape=(len(user_idx), len(product_idx)))
    return user_product_matrix.tocsr(), user_idx, product_idx

user_product_matrix, user_idx, product_idx = build_user_product_matrix(interactions, products)

# ---- 3️⃣ Modelo de Filtrado Basado en Contenido ----
def build_content_matrix(products):
    print("🔹 Creando matriz de características de contenido...")
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(products['category'].fillna(''))

content_matrix = build_content_matrix(products).tocsr()

# ---- 4️⃣ Modelo de Filtrado Colaborativo (kNN) ----
def train_knn_model(user_product_matrix):
    print("🔹 Entrenando modelo kNN...")
    model = NearestNeighbors(metric="cosine", algorithm="brute")
    model.fit(user_product_matrix)
    return model

knn_model = train_knn_model(user_product_matrix)

# ---- 5️⃣ Recomendación de Productos (Modelo Híbrido) ----
def recommend_products_for_user_hybrid(user_id, top_n=5, alpha=0.5):
    if user_id not in user_idx:
        print(f"❌ Usuario {user_id} no encontrado en la matriz.")
        return pd.DataFrame()

    user_vector = user_product_matrix[user_idx[user_id]]
    distances, neighbors = knn_model.kneighbors(user_vector.toarray(), n_neighbors=top_n+1)

    recommended_products = set()
    for neighbor in neighbors[0][1:]:
        similar_user_id = list(user_idx.keys())[list(user_idx.values()).index(neighbor)]
        user_interactions = interactions[interactions['user_id'] == similar_user_id]
        recommended_products.update(user_interactions['product_id'].tolist())

    recommended_products = list(recommended_products)[:top_n]
    product_indices = products[products['product_id'].isin(recommended_products)].index

    if len(product_indices) == 0:
        print("❌ No hay suficientes productos en el modelo de contenido.")
        return pd.DataFrame()

    content_scores = content_matrix[product_indices].sum(axis=0)
    content_scores = np.asarray(content_scores).flatten()

    sorted_indices = np.argsort(-content_scores)[:top_n]
    sorted_products = [recommended_products[i] for i in sorted_indices if i < len(recommended_products)]

    return products[products['product_id'].isin(sorted_products)]

# ---- 6️⃣ Generar Salida JSON ----
user_id = 11
recommended_products = recommend_products_for_user_hybrid(user_id, top_n=5, alpha=0.7)

recommendations = [
    {
        "product_id": int(row["product_id"]),
        "name": str(row["name"]),
        "category": str(row["category"])
    }
    for _, row in recommended_products.iterrows()
]

output = {
    "user_id": user_id,
    "recommendations": recommendations
}

print(json.dumps(output, indent=5, ensure_ascii=False))


Loading dataset...
🔹 Creando matriz usuario-producto...
🔹 Creando matriz de características de contenido...
🔹 Entrenando modelo kNN...
{
     "user_id": 11,
     "recommendations": [
          {
               "product_id": 386,
               "name": "Rutina de Ejercicios Personalizada",
               "category": "Deportes"
          },
          {
               "product_id": 1152,
               "name": "Curso de Mindfulness",
               "category": "Bienestar Mental"
          },
          {
               "product_id": 1415,
               "name": "Vitaminas y Suplementos",
               "category": "Nutrición"
          },
          {
               "product_id": 1671,
               "name": "Clases de Yoga Online",
               "category": "Salud"
          }
     ]
}


In [33]:
metrics_results = evaluate_recommendation_model(recommend_products_for_user_hybrid, interactions, products, k=5)
print(json.dumps(metrics_results, indent=4, ensure_ascii=False))

{
    "MAP@K": 0.0011399214726429045,
    "NDCG@K": 0.9287890037209244,
    "HR@K": 0.0052776502983019734,
    "MRR": 0.00126
}


In [None]:
conversion_rates = df.groupby('category')['tipo_interaccion'].value_counts().unstack().fillna(0)
conversion_rates['conversion_rate'] = conversion_rates['Compra'] / (conversion_rates.sum(axis=1))
conversion_rates.sort_values('conversion_rate', ascending=False)

tipo_interaccion,Comentario,Compra,Consulta,Valoracion,conversion_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3214,3345,3312,3206,0.255793
4,2698,2724,2661,2646,0.253891
1,4164,4065,4097,4008,0.248867
2,850,853,884,893,0.245115
3,1677,1549,1535,1619,0.24279


In [None]:
def recommend_by_category(user_id, df_interactions, df_products, top_n=5):
    df_interactions = df_interactions.merge(products, on='product_id', how='left')
    user_products = df_interactions[df_interactions['user_id'] == user_id]['name']
    user_categories = df_products[df_products['name'].isin(user_products)]['category'].unique()

    recommendations = df_products[df_products['category'].isin(user_categories)]
    return recommendations[~recommendations['name'].isin(user_products)].sample(top_n)

recommend_by_category(123, interactions, products, 3)

Unnamed: 0,product_id,name,category,descripcion,palabras_clave,precio,rating_promedio,descuento_aplicado,stock_actual
1104,1105,Sesión de Terapia Online,Bienestar Mental,Consulta con expertos en bienestar mental desd...,"Relajación, Mindfulness, Terapia",48.68,3.5,25,22
1086,1087,Vitaminas y Suplementos,Nutrición,Complementa tu dieta con vitaminas esenciales ...,"Suplementos, Dieta, Alimentación",77.39,2.5,15,123
784,785,Sesión de Terapia Online,Bienestar Mental,Consulta con expertos en bienestar mental desd...,"Terapia, Relajación, Mindfulness",27.35,5.0,30,90


In [None]:
products['palabras_clave'].str.split(',', expand=True).stack().value_counts()

Unnamed: 0,count
Deporte,449
Fitness,440
Rendimiento,405
Mindfulness,366
Relajación,345
Terapia,339
Salud,303
Bienestar,283
Ejercicio,282
Rendimiento,242


In [None]:
users['intereses']

Unnamed: 0,intereses
0,Nutrición
1,Deportes
2,Mascotas
3,Deportes
4,Nutrición
...,...
4995,Deportes
4996,Nutrición
4997,"Bienestar Mental, Desarrollo Personal"
4998,"Desarrollo Personal, Nutrición"


In [None]:
users['intereses'].str.split(',', expand=True).stack().value_counts()

Unnamed: 0,count
Bienestar Mental,754
Nutrición,752
Familia,742
Mascotas,731
Desarrollo Personal,721
Desarrollo Personal,719
Salud,710
Bienestar Mental,708
Salud,706
Deportes,704


In [None]:
products

Unnamed: 0,product_id,name,category,descripcion,palabras_clave,precio,rating_promedio,descuento_aplicado,stock_actual
0,1,Sesión de Terapia Online,Bienestar Mental,Consulta con expertos en bienestar mental desd...,"Mindfulness, Relajación, Terapia",65.59,3.0,0,30
1,2,Accesorios para Entrenamiento,Deportes,"Cintas, mancuernas y más para potenciar tu ent...","Rendimiento, Deporte, Fitness",124.27,2.5,30,426
2,3,Entrenador Personal Virtual,Bienestar Mental,Accede a planes de entrenamiento personalizado...,"Relajación, Terapia, Mindfulness",98.49,5.0,20,385
3,4,Rutina de Ejercicios Personalizada,Deportes,Planes de ejercicios adaptados a tus objetivos...,"Deporte, Fitness, Rendimiento",99.61,3.0,30,329
4,5,Juguetes para Mascotas,Mascotas,Diversión asegurada para tu mascota con juguet...,"Mascotas, Juguetes, Cuidado",49.39,2.5,25,444
...,...,...,...,...,...,...,...,...,...
1995,1996,Sesión de Terapia Online,Bienestar Mental,Consulta con expertos en bienestar mental desd...,"Mindfulness, Terapia, Relajación",31.85,4.0,25,429
1996,1997,Ropa Deportiva,Deportes,Prendas cómodas y de alta calidad para mejorar...,"Deporte, Rendimiento, Fitness",109.26,2.5,15,178
1997,1998,Spa y Masajes,Salud,Tratamientos relajantes para reducir el estrés...,"Ejercicio, Bienestar, Salud",61.27,4.5,30,62
1998,1999,Spa y Masajes,Salud,Tratamientos relajantes para reducir el estrés...,"Bienestar, Ejercicio, Salud",47.84,4.0,0,418


In [None]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          2000 non-null   int64  
 1   name                2000 non-null   object 
 2   category            2000 non-null   object 
 3   descripcion         2000 non-null   object 
 4   palabras_clave      2000 non-null   object 
 5   precio              2000 non-null   float64
 6   rating_promedio     2000 non-null   float64
 7   descuento_aplicado  2000 non-null   int64  
 8   stock_actual        2000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 140.8+ KB


In [None]:
products[['name', 'descripcion']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
name,descripcion,Unnamed: 2_level_1
Clases de Yoga Online,Sesiones guiadas de yoga para mejorar la flexibilidad y reducir el estrés.,152
Monitor de Ritmo Cardíaco,Dispositivo preciso para medir tu frecuencia cardíaca en tiempo real.,149
Ropa Deportiva,Prendas cómodas y de alta calidad para mejorar tu rendimiento deportivo.,145
Entrenamiento en Casa,Rutinas de ejercicios para mantenerte en forma sin necesidad de equipo especial.,140
Plan de Alimentación Saludable,Menús equilibrados diseñados por nutricionistas para mejorar tu salud.,138
Juguetes para Mascotas,Diversión asegurada para tu mascota con juguetes interactivos y seguros.,137
Sesión de Terapia Online,Consulta con expertos en bienestar mental desde la comodidad de tu hogar.,137
Curso de Meditación,Aprende técnicas de mindfulness y relajación para reducir la ansiedad.,134
Spa y Masajes,Tratamientos relajantes para reducir el estrés y mejorar la circulación.,133
Curso de Mindfulness,Desarrolla hábitos de atención plena para mejorar tu bienestar mental.,132


In [None]:
df.groupby(['name', 'tipo_interaccion'])['user_id'].count().unstack().fillna(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
tipo_interaccion,comentario,Unnamed: 2_level_1
Comentario,Excelente,2133
Comentario,Podría mejorar,2124
Comentario,Aceptable,2105
Comentario,Malo,2065
Comentario,Recomendado,2009


In [None]:
# ----------------------
# LIMPIEZA Y TRANSFORMACIÓN
# ----------------------

## USUARIOS
# Convertir ID a string
users['user_id'] = users['user_id'].astype(str)
# Normalizar ingresos (si aplica)

## PRODUCTOS
# Convertir ID a string
products['product_id'] = products['product_id'].astype(str)

## INTERACCIONES
# Convertir ID a string
interactions['user_id'] = interactions['user_id'].astype(str)
interactions['product_id'] = interactions['product_id'].astype(str)
# Convertir timestamp a formato datetime


### 4.1 Crear una matriz usuario-producto a partir de interacciones.

In [None]:
%%time
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Preprocesamiento: Filtrar datos relevantes
ratings = interactions[['user_id', 'product_id', 'rating']].dropna()
print(ratings.shape)

# Definir el formato para Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'product_id', 'rating']], reader)

# Entrenar modelo SVD
svd = SVD()
cross_validate(svd, data, cv=5, verbose=True)

# Entrenar modelo KNN
sim_options = {'name': 'cosine', 'user_based': True}  # KNN basado en usuarios
knn = KNNBasic(sim_options=sim_options)
cross_validate(knn, data, cv=5, verbose=True)

# Evaluación con métricas adicionales
def evaluate_model(model, trainset, testset):
    model.fit(trainset)
    predictions = model.test(testset)
    true_ratings = np.array([pred.r_ui for pred in predictions])
    pred_ratings = np.array([pred.est for pred in predictions])
    rmse = mean_squared_error(true_ratings, pred_ratings) ** 0.5
    mae = mean_absolute_error(true_ratings, pred_ratings)
    return rmse, mae

# Crear conjunto de entrenamiento y prueba
trainset = data.build_full_trainset()
testset = trainset.build_testset()

# Evaluar modelos
rmse_svd, mae_svd = evaluate_model(svd, trainset, testset)
rmse_knn, mae_knn = evaluate_model(knn, trainset, testset)

print(f"SVD - RMSE: {rmse_svd:.4f}, MAE: {mae_svd:.4f}")
print(f"KNN - RMSE: {rmse_knn:.4f}, MAE: {mae_knn:.4f}")

(10324, 3)
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4411  1.4566  1.4516  1.4271  1.4464  1.4446  0.0102  
MAE (testset)     1.2388  1.2587  1.2539  1.2297  1.2455  1.2453  0.0104  
Fit time          0.10    0.11    0.11    0.12    0.10    0.11    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4238  1.4191  1.4104  1.41

In [None]:
from surprise import SVD, KNNBasic, NMF, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

trainset, testset = train_test_split(data, test_size=0.2)

# Entrenar modelos base
svd = SVD()
knn = KNNBasic()
nmf = NMF()

svd.fit(trainset)
knn.fit(trainset)
nmf.fit(trainset)

# Generar predicciones en testset
svd_preds = np.array([svd.predict(uid, iid).est for (uid, iid, _) in testset])
knn_preds = np.array([knn.predict(uid, iid).est for (uid, iid, _) in testset])
nmf_preds = np.array([nmf.predict(uid, iid).est for (uid, iid, _) in testset])
true_ratings = np.array([rating for (_, _, rating) in testset])

# Híbrido 1: Promedio Ponderado
weights = [0.5, 0.5]  # Ajustar pesos según desempeño
hybrid_preds_weighted = weights[0] * svd_preds + weights[1] * knn_preds

# Evaluar modelo híbrido ponderado
rmse_hybrid = np.sqrt(mean_squared_error(true_ratings, hybrid_preds_weighted))
mae_hybrid = mean_absolute_error(true_ratings, hybrid_preds_weighted)

# Híbrido 2: Stacking con Ridge Regression
X_train = np.vstack((svd_preds, knn_preds, nmf_preds)).T
y_train = true_ratings
stacking_model = Ridge()
stacking_model.fit(X_train, y_train)
stacking_preds = stacking_model.predict(X_train)

# Evaluar modelo Stacking
rmse_stacking = np.sqrt(mean_squared_error(y_train, stacking_preds))
mae_stacking = mean_absolute_error(y_train, stacking_preds)

# Resultados
print(f'RMSE Híbrido Ponderado: {rmse_hybrid:.4f}, MAE: {mae_hybrid:.4f}')
print(f'RMSE Stacking: {rmse_stacking:.4f}, MAE: {mae_stacking:.4f}')


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE Híbrido Ponderado: 1.4233, MAE: 1.2212
RMSE Stacking: 1.4159, MAE: 1.2055


### Optimización de hiperparametros

In [None]:
import optuna
from surprise import SVD, KNNBasic
from surprise.model_selection import cross_validate

# Función objetivo para optimizar SVD
def objective_svd(trial):
    """Optimiza SVD buscando los mejores hiperparámetros."""
    n_factors = trial.suggest_int('n_factors', 20, 200)
    reg_all = trial.suggest_float('reg_all', 0.001, 0.1, log=True)
    lr_all = trial.suggest_float('lr_all', 0.002, 0.2, log=True)

    model = SVD(n_factors=n_factors, reg_all=reg_all, lr_all=lr_all)
    scores = cross_validate(model, data, measures=['RMSE'], cv=3, verbose=False)
    return scores['test_rmse'].mean()

# Optimización de SVD
study_svd = optuna.create_study(direction='minimize')
study_svd.optimize(objective_svd, n_trials=30)
print("Mejores parámetros para SVD:", study_svd.best_params)

# Función objetivo para optimizar KNNBasic
def objective_knn(trial):
    """Optimiza KNNBasic buscando los mejores hiperparámetros."""
    k = trial.suggest_int('k', 10, 100)
    sim_options = {'name': 'cosine', 'user_based': False}
    model = KNNBasic(k=k, sim_options=sim_options)

    scores = cross_validate(model, data, measures=['RMSE'], cv=3, verbose=False)
    return scores['test_rmse'].mean()

# Optimización de KNN
study_knn = optuna.create_study(direction='minimize')
study_knn.optimize(objective_knn, n_trials=30)
print("Mejores parámetros para KNN:", study_knn.best_params)

# Comparación de métricas antes y después de la optimización
best_svd = SVD(**study_svd.best_params)
best_knn = KNNBasic(k=study_knn.best_params['k'], sim_options={'name': 'cosine', 'user_based': False})

results_svd = cross_validate(best_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
results_knn = cross_validate(best_knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print("SVD Optimizado - RMSE:", results_svd['test_rmse'].mean(), "MAE:", results_svd['test_mae'].mean())
print("KNN Optimizado - RMSE:", results_knn['test_rmse'].mean(), "MAE:", results_knn['test_mae'].mean())


[I 2025-02-26 21:25:26,453] A new study created in memory with name: no-name-ed0ec594-1681-4d1f-ba9c-89d6da34c586
[I 2025-02-26 21:25:26,735] Trial 0 finished with value: 1.4256531109975084 and parameters: {'n_factors': 42, 'reg_all': 0.02054593814133874, 'lr_all': 0.002567078921660392}. Best is trial 0 with value: 1.4256531109975084.
[I 2025-02-26 21:25:27,195] Trial 1 finished with value: 1.4453166536100042 and parameters: {'n_factors': 41, 'reg_all': 0.0025061059193313222, 'lr_all': 0.0060411549501408115}. Best is trial 0 with value: 1.4256531109975084.
[I 2025-02-26 21:25:27,400] Trial 2 finished with value: 1.5876542204502126 and parameters: {'n_factors': 29, 'reg_all': 0.07682225378320391, 'lr_all': 0.1548320273084648}. Best is trial 0 with value: 1.4256531109975084.
[I 2025-02-26 21:25:27,879] Trial 3 finished with value: 1.4267231637452025 and parameters: {'n_factors': 158, 'reg_all': 0.02196455129145359, 'lr_all': 0.0021059903381675693}. Best is trial 0 with value: 1.425653110

Mejores parámetros para SVD: {'n_factors': 43, 'reg_all': 0.025779006292121587, 'lr_all': 0.0020088913355571893}
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:38,904] Trial 0 finished with value: 1.4174877863238484 and parameters: {'k': 20}. Best is trial 0 with value: 1.4174877863238484.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:39,433] Trial 1 finished with value: 1.4165415672140051 and parameters: {'k': 77}. Best is trial 1 with value: 1.4165415672140051.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:39,695] Trial 2 finished with value: 1.4156484241082454 and parameters: {'k': 67}. Best is trial 2 with value: 1.4156484241082454.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:39,950] Trial 3 finished with value: 1.4173846848629676 and parameters: {'k': 72}. Best is trial 2 with value: 1.4156484241082454.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:40,297] Trial 4 finished with value: 1.417945486817193 and parameters: {'k': 21}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:40,644] Trial 5 finished with value: 1.4160604248444877 and parameters: {'k': 86}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:41,013] Trial 6 finished with value: 1.418170305435756 and parameters: {'k': 41}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:41,703] Trial 7 finished with value: 1.4185406563817986 and parameters: {'k': 85}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:42,043] Trial 8 finished with value: 1.4164456764585818 and parameters: {'k': 10}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:42,333] Trial 9 finished with value: 1.416394540994065 and parameters: {'k': 78}. Best is trial 2 with value: 1.4156484241082454.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:42,615] Trial 10 finished with value: 1.4174538692376295 and parameters: {'k': 59}. Best is trial 2 with value: 1.4156484241082454.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:42,886] Trial 11 finished with value: 1.4154671473496585 and parameters: {'k': 100}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:43,189] Trial 12 finished with value: 1.41624886193992 and parameters: {'k': 97}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:43,731] Trial 13 finished with value: 1.4174726929482582 and parameters: {'k': 54}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:43,997] Trial 14 finished with value: 1.4164922616155053 and parameters: {'k': 100}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:44,269] Trial 15 finished with value: 1.4170145116316892 and parameters: {'k': 60}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:44,553] Trial 16 finished with value: 1.417687089678604 and parameters: {'k': 42}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:44,832] Trial 17 finished with value: 1.4165255707782078 and parameters: {'k': 67}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:45,121] Trial 18 finished with value: 1.4169556595805535 and parameters: {'k': 46}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:45,671] Trial 19 finished with value: 1.4163986384106984 and parameters: {'k': 90}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:45,943] Trial 20 finished with value: 1.4167099268194614 and parameters: {'k': 30}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:46,242] Trial 21 finished with value: 1.4167667225596903 and parameters: {'k': 88}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:46,509] Trial 22 finished with value: 1.4172089328835602 and parameters: {'k': 92}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:46,844] Trial 23 finished with value: 1.4180840661485083 and parameters: {'k': 82}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:47,379] Trial 24 finished with value: 1.417624028374634 and parameters: {'k': 71}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


[I 2025-02-26 21:25:47,637] Trial 25 finished with value: 1.4172637880855017 and parameters: {'k': 100}. Best is trial 11 with value: 1.4154671473496585.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:47,925] Trial 26 finished with value: 1.417949597769956 and parameters: {'k': 92}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:48,232] Trial 27 finished with value: 1.4171480557010767 and parameters: {'k': 64}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:48,499] Trial 28 finished with value: 1.4173128180890935 and parameters: {'k': 80}. Best is trial 11 with value: 1.4154671473496585.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[I 2025-02-26 21:25:48,767] Trial 29 finished with value: 1.4166089480061188 and parameters: {'k': 73}. Best is trial 11 with value: 1.4154671473496585.


Mejores parámetros para KNN: {'k': 100}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4473  1.4215  1.4224  1.4289  1.4064  1.4253  0.0132  
MAE (testset)     1.2514  1.2249  1.2234  1.2192  1.2039  1.2246  0.0154  
Fit time          0.08    0.08    0.08    0.05    0.05    0.07    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)   

El mejor trial hasta ahora tiene:

n_factors = 65
reg_all = 0.0627
lr_all = 0.00205
Mejor métrica = 1.4228
📌 Algunos puntos a considerar:

Rango de parámetros: Los mejores valores de reg_all y lr_all están cercanos a los límites inferiores. Podría valer la pena hacer otra búsqueda con un rango más ajustado en estos valores.
Tendencia de n_factors: Parece que valores en el rango de 60-150 están funcionando bien.
Regularización y tasa de aprendizaje: Los valores óptimos de reg_all y lr_all sugieren que el modelo se beneficia de una regularización más alta y una tasa de aprendizaje baja.

In [None]:
best_trial = study_svd.best_trial
print(f"Mejores hiperparámetros: {best_trial.params}")

import json

with open("best_svd_params.json", "w") as f:
    json.dump(best_trial.params, f)

Mejores hiperparámetros: {'n_factors': 43, 'reg_all': 0.025779006292121587, 'lr_all': 0.0020088913355571893}


In [None]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Modelo base
svd_default = SVD()
base_results = cross_validate(svd_default, data, cv=5, verbose=True)

# Modelo optimizado
svd_tuned = SVD(**best_trial.params)
tuned_results = cross_validate(svd_tuned, data, cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4326  1.4296  1.4312  1.4571  1.4704  1.4442  0.0166  
MAE (testset)     1.2337  1.2260  1.2347  1.2583  1.2775  1.2461  0.0191  
Fit time          0.13    0.09    0.11    0.10    0.11    0.11    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4233  1.4302  1.4224  1.4180  1.4271  1.4242  0.0042  
MAE (testset)     1.2187  1.2277  1.2227  1.2225  1.2280  1.2239  0.0035  
Fit time          0.05    0.06    0.05    0.05    0.08    0.06    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


In [None]:
from surprise import Dataset
from surprise.model_selection import train_test_split

# Cargar datos y separar en entrenamiento y prueba
trainset, testset = train_test_split(data, test_size=0.2)
model = SVD(**best_trial.params)  # Usamos los mejores hiperparámetros encontrados
model.fit(trainset)

df = interactions.copy()
# Obtener todos los usuarios y productos únicos
all_users = list(set(df['user_id']))
all_items = list(set(df['product_id']))

# Crear predicciones para todos los productos que un usuario NO ha evaluado
def get_top_n_recommendations(model, user_id, n=5):
    known_items = set(df[df['user_id'] == user_id]['product_id'])  # Productos ya vistos
    items_to_predict = [item for item in all_items if item not in known_items]

    predictions = [model.predict(user_id, item) for item in items_to_predict]
    predictions.sort(key=lambda x: x.est, reverse=True)  # Ordenar por rating estimado

    return [(pred.iid, pred.est) for pred in predictions[:n]]

# Prueba con un usuario
user_sample = all_users[0]
print(f"Top-5 recomendaciones para el usuario {user_sample}:")
print(get_top_n_recommendations(model, user_sample, n=5))

Top-5 recomendaciones para el usuario 766:
[('1391', 3.3294361440027442), ('1522', 3.32755772867156), ('1328', 3.302215831241221), ('1041', 3.2786967257927535), ('668', 3.2494356308204053)]


In [None]:
# Timestap
df = df.sort_values(by=['user_id', 'timestamp'])
df['interaction_order'] = df.groupby('user_id').cumcount() + 1
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['month'] = df['timestamp'].dt.month
df['time_since_last_interaction'] = df.groupby('user_id')['timestamp'].diff().dt.total_seconds()
df['time_since_last_interaction'] = df['time_since_last_interaction'].fillna(0)
df['interaction_sequence'] = df.groupby('user_id')['product_id'].transform(lambda x: list(x))
df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             50000 non-null  object        
 1   product_id          50000 non-null  object        
 2   tipo_interaccion    50000 non-null  object        
 3   rating              10324 non-null  float64       
 4   comentario          10436 non-null  object        
 5   timestamp           50000 non-null  datetime64[ns]
 6   metodo_pago         50000 non-null  object        
 7   hour                50000 non-null  int32         
 8   dayofweek           50000 non-null  int32         
 9   edad                50000 non-null  int64         
 10  genero              50000 non-null  object        
 11  nivel_ingresos      50000 non-null  object        
 12  nivel_educativo     50000 non-null  object        
 13  intereses           50000 non-null  object    

In [None]:
from sklearn.preprocessing import LabelEncoder

# Codificar variables categóricas
label_encoders = {}
for col in ["category", "nivel_ingresos", "nivel_educativo"]:  # Agrega más según el dataset
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Guardar el codificador para decodificar luego

# Summary