# Modelo V2

In [1]:
# FLAGS
FLAG_DATASET_DL = False

## 1. Carga de datos y librerías

In [2]:
# Actualizar pip
%pip install --upgrade pip
# Dependencias para graficar y manipular datos
%pip install pandas matplotlib tqdm seaborn ipywidgets
# Dependencias para leer archivos Parquet
%pip install pyarrow fastparquet
# Para predicción
%pip install scikit-learn lightfm recommenders

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importar librerías globales

# Manejo de datos y visualización
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm

# Crear directorios
PATH_DATASETS = "datasets"
PATH_DATASETS_FOODCOM = os.path.join(PATH_DATASETS, "foodcom")
os.makedirs(PATH_DATASETS, exist_ok=True)

In [4]:
if FLAG_DATASET_DL:
  # Descargar "Recipes and Reviews" de Food.com
  os.system(f"curl -L -o {PATH_DATASETS_FOODCOM}.zip https://www.kaggle.com/api/v1/datasets/download/irkaal/foodcom-recipes-and-reviews")
  os.system(f"unzip -o {PATH_DATASETS_FOODCOM}.zip -d {PATH_DATASETS_FOODCOM}")
  # Eliminar el zip
  os.remove(f"{PATH_DATASETS_FOODCOM}.zip")

  # Cita: Dominio público (https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews)

In [5]:
# Foodcom dataframes
df_foodcom_recipes = pd.read_parquet(os.path.join(PATH_DATASETS_FOODCOM, "recipes.parquet"))
df_foodcom_reviews = pd.read_parquet(os.path.join(PATH_DATASETS_FOODCOM, "reviews.parquet"))

## 2. Preprocesamiento de datos

### 2.1. Food.com

En df_foodcom_reviews:
- "AuthorId" -> "user_id" (int)
- "RecipeId" -> "recipe_id" (int)
- "Rating" -> "rating" (float)

In [6]:
df_ratings = df_foodcom_reviews.rename(columns={
    'AuthorId': 'user_id',
    'RecipeId': 'recipe_id',
    'Rating'  : 'rating'
})[['user_id', 'recipe_id', 'rating']]

df_ratings['user_id']   = df_ratings['user_id'].astype(np.int32)
df_ratings['recipe_id'] = df_ratings['recipe_id'].astype(np.int32)
df_ratings['rating']    = df_ratings['rating'].astype(np.int8)

# Mostrar información del dataframe de ratings
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1401982 non-null  int32
 1   recipe_id  1401982 non-null  int32
 2   rating     1401982 non-null  int8 
dtypes: int32(2), int8(1)
memory usage: 12.0 MB


En df_foodcom_recipes:
- "RecipeId" -> "recipe_id" (int)

In [7]:
df_recipes = df_foodcom_recipes.rename(columns={
    'RecipeId': 'recipe_id'
}).copy()

df_recipes['recipe_id'] = df_recipes['recipe_id'].astype(int)

# Mostrar información del dataframe de recetas
df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype              
---  ------                      --------------   -----              
 0   recipe_id                   522517 non-null  int64              
 1   Name                        522517 non-null  object             
 2   AuthorId                    522517 non-null  int32              
 3   AuthorName                  522517 non-null  object             
 4   CookTime                    439972 non-null  object             
 5   PrepTime                    522517 non-null  object             
 6   TotalTime                   522517 non-null  object             
 7   DatePublished               522517 non-null  datetime64[us, UTC]
 8   Description                 522512 non-null  object             
 9   Images                      522516 non-null  object             
 10  RecipeCategory              521766 non-null 

## 3. Modelo

In [8]:
# ===============================================================
# Híbrido con LightFM (colaborative + metadata)
# ===============================================================
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
from recommenders.utils.timer import Timer


class FoodRecommenderV2:
    """
    Recomendador híbrido que combina LightFM con metadatos de recetas.
    Utiliza un modelo de LightFM entrenado con WARP (Weighted Approximate-Rank Pairwise).
    """

    def __init__(self, df_ratings: pd.DataFrame, df_recipes: pd.DataFrame, verbose: bool = False, test_percentage: float = 0.25):
        self.verbose = verbose
        # Filtramos los ratings para evitar sparsity extrema
        user_filter = df_ratings['user_id'].value_counts() >= 10
        recipe_filter = df_ratings['recipe_id'].value_counts() >= 10
        self.df_ratings = df_ratings[
            df_ratings['user_id'].isin(user_filter[user_filter].index) &
            df_ratings['recipe_id'].isin(recipe_filter[recipe_filter].index)
        ].copy()

        self.log(f"Ratings filtrados: {len(self.df_ratings):,}")
    
        # Obtenemos los metadatos de recetas
        # TODO: Incluir metadatos de recetas como ingredientes, categorías, etc.
        
        # Creamos el dataset de LightFM
        self.dataset = Dataset()
        self.dataset.fit(
            users=df_ratings['user_id'].unique(),
            items=df_ratings['recipe_id'].unique()
        )
        self.num_users, self.num_items = self.dataset.interactions_shape()
        self.log(f"Dataset creado con {self.num_users:,} usuarios y {self.num_items:,} recetas.")

        # Creamos las interacciones
        self.interactions, self.weights = self.dataset.build_interactions(self.df_ratings[['user_id', 'recipe_id', 'rating']].to_numpy())
        self.log(f"Matriz de interacciones de {self.interactions.get_shape()}.")

        # Division de train/test
        self.train_interactions, self.test_interactions = random_train_test_split(
            self.interactions,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(42)
        )
        self.log(f"División train/test: {self.train_interactions.get_shape()} (train) y {self.test_interactions.get_shape()} (test).")

        # Creamos el modelo de LightFM
        self.model = LightFM(
            loss='warp',
            no_components=30,
            learning_rate=0.05,
            random_state=np.random.RandomState(42)
        )
    
    def fit(self, epochs: int = 10):
        """
        Entrena el modelo de LightFM.
        """
        self.log(f"Entrenando el modelo por {epochs} épocas...")
        self.model.fit(
            self.train_interactions,
            epochs=epochs,
            num_threads=16,
            verbose=self.verbose
        )
        self.log("Entrenamiento completado.")
    
    def evaluate(self) -> dict:
        """
        Evalúa el modelo y retorna las métricas evaluadas.
        """
        self.log("Iniciando evaluación del modelo...")
        results = {}

        with Timer() as test_time:
            results["precision@10"] = precision_at_k(self.model, self.test_interactions,
                                                self.train_interactions, k=10, num_threads=16).mean()
            results["recall@10"] = recall_at_k(self.model, self.test_interactions,
                                                self.train_interactions, k=10, num_threads=16).mean()
        self.log(f"Evaluación de precisión y recall: {test_time.interval:.1f} segundos.")

        
        self.log(f'Precisión@10 en test: {results["precision@10"]:.4f}')
        self.log(f'Recall@10 en test: {results["recall@10"]:.4f}')

        return results

    def recommend(self, user_id, num_items: int = 10) -> pd.DataFrame:
        """
        Recomienda recetas para un usuario dado.
        """
        self.log(f"Recomendando {num_items} recetas para el usuario {user_id}...")
        scores = self.model.predict(user_id, np.arange(self.num_items))
        rid_to_iid_map = self.dataset.mapping()[2]
        recipe_id_map = {v: k for k, v in rid_to_iid_map.items()}
        # Obtenemos los IDs de las recetas
        top_items = np.argsort(-scores)[:num_items]
        # Convertimos los índices a IDs de recetas
        top_items = [recipe_id_map[item] for item in top_items]
        
        # Convertimos a DataFrame
        recommendations = pd.DataFrame({
            'recipe_id': top_items[:num_items],
            'score': scores[np.argsort(-scores)][:num_items]
        })
        
        return recommendations
    
    def log(self, message: str):
        """
        Método para imprimir mensajes de log.
        """
        if self.verbose:
            print(f"[FoodRec] {message}")

# Instanciar y entrenar el recomendador
recommender = FoodRecommenderV2(df_ratings, df_recipes, verbose=True)
recommender.fit(epochs=10)
results = recommender.evaluate()

# Mostrar recomendaciones para un usuario específico
user_id = 0
recommendations = recommender.recommend(user_id, num_items=10)
display(df_recipes[df_recipes['recipe_id'].isin(recommendations['recipe_id'])][['recipe_id', 'Name', 'RecipeCategory']].merge(recommendations, on='recipe_id').sort_values(by='score', ascending=False))

[FoodRec] Ratings filtrados: 530,589
[FoodRec] Dataset creado con 271,907 usuarios y 271,678 recetas.
[FoodRec] Matriz de interacciones de (271907, 271678).
[FoodRec] División train/test: (271907, 271678) (train) y (271907, 271678) (test).
[FoodRec] Entrenando el modelo por 10 épocas...


Epoch: 100%|██████████| 10/10 [00:01<00:00,  6.47it/s]


[FoodRec] Entrenamiento completado.
[FoodRec] Iniciando evaluación del modelo...
[FoodRec] Evaluación de precisión y recall: 70.2 segundos.
[FoodRec] Precisión@10 en test: 0.0156
[FoodRec] Recall@10 en test: 0.0223
[FoodRec] Recomendando 10 recetas para el usuario 0...


Unnamed: 0,recipe_id,Name,RecipeCategory,score
4,45809,Bourbon Chicken,Chicken Breast,1.867039
1,27208,To Die for Crock Pot Roast,One Dish Meal,1.705763
9,89204,Crock-Pot Chicken With Black Beans &amp; Cream...,One Dish Meal,1.695655
2,32204,&quot;Whatever Floats Your Boat&quot; Brownies!,Bar Cookie,1.690492
8,69173,Kittencal's Italian Melt-In-Your-Mouth Meatballs,Meat,1.638588
3,39087,Creamy Cajun Chicken Pasta,Chicken Breast,1.638141
5,54257,"Yes, Virginia There is a Great Meatloaf",Meatloaf,1.629452
6,67256,Best Ever Banana Cake With Cream Cheese Frosting,Dessert,1.606572
0,25690,Pancakes,Breakfast,1.600838
7,68955,Japanese Mum's Chicken,Chicken Thigh & Leg,1.579192


## Experimentos (no finalizados)

In [9]:
# Extraemos un set con todas las keywords que existen en el dataset
keywords_set = set()
for keywords in tqdm(df_recipes["Keywords"].dropna()):
    for k in keywords:
        if isinstance(k, str):
            keywords_set.add(k.lower())
print(f"Total de keywords únicas: {len(keywords_set)}")

# Extraemos un set con todas las categorías de recetas
categories_set = set()
for category in tqdm(df_recipes["RecipeCategory"].dropna()):
    categories_set.add(category.lower())
print(f"Total de categorías únicas: {len(categories_set)}")

# Extraemos un set con todos los ingredientes que existen en el dataset
ingredients_set = set()
for ingredients in tqdm(df_recipes["RecipeIngredientParts"].dropna()):
    for ingredient in ingredients:
        ingredients_set.add(ingredient.lower())
print(f"Total de ingredientes únicos: {len(ingredients_set)}")

# Merge con metadatos de receta

# Creamos en df_recipes la flag is_vegan:
#df_recipes['RecipeCategory'] = df_recipes['RecipeCategory'].fillna('').astype(str)
#df_recipes['is_vegan'] = df_recipes['RecipeCategory'].str.contains(
#    'Vegan', case=False, na=False
#).astype(int)

# Hacemos merge de df_ratings_filt con df_recipes
#df_all = df_ratings_filt.merge(
#    df_recipes[['recipe_id']],
#    on='recipe_id',
#    how='left'
#)

# Comprobamos NA en nutritional cols (si hay, se rellenan con 0)
#df_all[nutri_cols] = df_all[nutri_cols].fillna(0.0)
#df_all['is_vegan'] = df_all['is_vegan'].fillna(0).astype(int)

#df_all.info()

  0%|          | 0/522517 [00:00<?, ?it/s]

Total de keywords únicas: 314


  0%|          | 0/521766 [00:00<?, ?it/s]

Total de categorías únicas: 311


  0%|          | 0/522517 [00:00<?, ?it/s]

Total de ingredientes únicos: 7309
