<a href="https://colab.research.google.com/github/ResistorCat/recsys-project/blob/main/model/LightFM_Foodcom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
FLAG_DATASET_DL = False

## 1. Carga de datos y librerías

In [2]:
# Actualizar pip
%pip install --upgrade pip setuptools wheel
# Dependencias para graficar y manipular datos
%pip install pandas matplotlib tqdm seaborn ipywidgets
# Dependencias para leer archivos Parquet
%pip install pyarrow fastparquet
# Para predicción
%pip install scikit-learn lightfm recommenders

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importar librerías globales

# Manejo de datos y visualización
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm
import zipfile


# Crear directorios
PATH_DATASETS = "datasets"
PATH_DATASETS_FOODCOM = os.path.join(PATH_DATASETS, "foodcom")
os.makedirs(PATH_DATASETS, exist_ok=True)

In [4]:
PATH_FOODCOM_RECIPES = os.path.join(PATH_DATASETS_FOODCOM, "recipes.parquet")
PATH_FOODCOM_REVIEWS = os.path.join(PATH_DATASETS_FOODCOM, "reviews.parquet")

In [5]:
if FLAG_DATASET_DL:
    # Descargar "Recipes and Reviews" de Food.com
    os.system(
        f"curl -L -o {PATH_DATASETS_FOODCOM}.zip https://www.kaggle.com/api/v1/datasets/download/irkaal/foodcom-recipes-and-reviews"
    )
    # os.system(f"unzip -o {PATH_DATASETS_FOODCOM}.zip -d {PATH_DATASETS_FOODCOM}")

    # Cita: Dominio público (https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews)
    zip_path = PATH_DATASETS_FOODCOM + ".zip"

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(PATH_DATASETS_FOODCOM)

    # Eliminar el zip
    os.remove(f"{PATH_DATASETS_FOODCOM}.zip")

    print("✅ Archivos descomprimidos en:", PATH_DATASETS_FOODCOM)


## 2. Preprocesamiento de datos

In [6]:
df_recipes = pd.read_parquet(PATH_FOODCOM_RECIPES)
df_reviews = pd.read_parquet(PATH_FOODCOM_REVIEWS)

# Convertir columnas
df_recipes["RecipeId"] = pd.to_numeric(
    df_recipes["RecipeId"], errors="coerce", downcast="integer"
)

# Eliminar filas NaN
df_recipes.dropna(inplace=True)
df_reviews.dropna(inplace=True)


# Renombrar columnas
def camel_to_snake(name):
    """Convert CamelCase to snake_case"""
    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()


# Rename df_recipes columns
recipes_column_mapping = {"RecipeId": "item_id", "AuthorId": "user_id"}

# Add snake_case mapping for all other columns
for col in df_recipes.columns:
    if col not in recipes_column_mapping:
        recipes_column_mapping[col] = camel_to_snake(col)

df_recipes.rename(columns=recipes_column_mapping, inplace=True)

# Rename df_reviews columns
reviews_column_mapping = {
    "RecipeId": "item_id",
    "AuthorId": "user_id",
    "Rating": "rating",
}

# Add snake_case mapping for all other columns
for col in df_reviews.columns:
    if col not in reviews_column_mapping:
        reviews_column_mapping[col] = camel_to_snake(col)

df_reviews.rename(columns=reviews_column_mapping, inplace=True)

print("✅ Columnas renombradas")
print("Columnas df_recipes:", list(df_recipes.columns))
print("Columnas df_reviews:", list(df_reviews.columns))

display(df_recipes.head(2))
display(df_reviews.head(2))
display(df_recipes.info())
display(df_reviews.info())

✅ Columnas renombradas
Columnas df_recipes: ['item_id', 'name', 'user_id', 'author_name', 'cook_time', 'prep_time', 'total_time', 'date_published', 'description', 'images', 'recipe_category', 'keywords', 'recipe_ingredient_quantities', 'recipe_ingredient_parts', 'aggregated_rating', 'review_count', 'calories', 'fat_content', 'saturated_fat_content', 'cholesterol_content', 'sodium_content', 'carbohydrate_content', 'fiber_content', 'sugar_content', 'protein_content', 'recipe_servings', 'recipe_yield', 'recipe_instructions']
Columnas df_reviews: ['review_id', 'item_id', 'user_id', 'author_name', 'rating', 'review', 'date_submitted', 'date_modified']


Unnamed: 0,item_id,name,user_id,author_name,cook_time,prep_time,total_time,date_published,description,images,...,saturated_fat_content,cholesterol_content,sodium_content,carbohydrate_content,fiber_content,sugar_content,protein_content,recipe_servings,recipe_yield,recipe_instructions
3,41,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
5,43,Best Blackbottom Pie,34879,Barefoot Beachcomber,PT2H,PT20M,PT2H20M,1999-08-21 10:35:00+00:00,Make and share this Best Blackbottom Pie recip...,[],...,10.9,94.3,267.6,58.0,1.8,42.5,7.0,8.0,1 9-inch pie,"[Graham Cracker Crust: In small bowl, combine ..."


Unnamed: 0,review_id,item_id,user_id,author_name,rating,review,date_submitted,date_modified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25 21:44:00+00:00,2000-01-25 21:44:00+00:00
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17 16:49:59+00:00,2001-10-17 16:49:59+00:00


<class 'pandas.core.frame.DataFrame'>
Index: 28648 entries, 3 to 522039
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype              
---  ------                        --------------  -----              
 0   item_id                       28648 non-null  int32              
 1   name                          28648 non-null  object             
 2   user_id                       28648 non-null  int32              
 3   author_name                   28648 non-null  object             
 4   cook_time                     28648 non-null  object             
 5   prep_time                     28648 non-null  object             
 6   total_time                    28648 non-null  object             
 7   date_published                28648 non-null  datetime64[us, UTC]
 8   description                   28648 non-null  object             
 9   images                        28648 non-null  object             
 10  recipe_category               28648 no

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype              
---  ------          --------------    -----              
 0   review_id       1401982 non-null  int32              
 1   item_id         1401982 non-null  int32              
 2   user_id         1401982 non-null  int32              
 3   author_name     1401982 non-null  object             
 4   rating          1401982 non-null  int32              
 5   review          1401982 non-null  object             
 6   date_submitted  1401982 non-null  datetime64[us, UTC]
 7   date_modified   1401982 non-null  datetime64[us, UTC]
dtypes: datetime64[us, UTC](2), int32(4), object(2)
memory usage: 64.2+ MB


None

In [7]:
df_reviews.describe()

Unnamed: 0,review_id,item_id,user_id,rating
count,1401982.0,1401982.0,1401982.0,1401982.0
mean,817973.9,152641.2,155863800.0,4.407951
std,528082.1,130111.2,530511100.0,1.272012
min,2.0,38.0,1533.0,0.0
25%,374386.2,47038.75,133680.0,4.0
50%,771780.5,109327.0,330545.0,5.0
75%,1204126.0,231876.8,818359.0,5.0
max,2090347.0,541298.0,2002902000.0,5.0


In [8]:
df_recipes.describe()

Unnamed: 0,item_id,user_id,aggregated_rating,review_count,calories,fat_content,saturated_fat_content,cholesterol_content,sodium_content,carbohydrate_content,fiber_content,sugar_content,protein_content,recipe_servings
count,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0,28648.0
mean,297214.726578,13920190.0,4.616029,4.885751,339.824033,16.794097,6.623653,61.965781,517.626368,36.376941,2.694586,15.52974,11.805644,13.245008
std,137276.209243,159754900.0,0.693318,24.056908,550.134231,40.842238,13.806888,119.678194,4441.722455,63.201012,7.166734,42.076131,21.291603,194.035461
min,41.0,27.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,195066.25,166642.0,4.5,1.0,147.6,4.8,1.3,6.1,114.1,13.8,0.7,2.5,2.9,4.0
50%,324204.0,359047.5,5.0,2.0,251.7,10.5,3.8,32.5,260.6,27.1,1.6,7.0,5.7,8.0
75%,402770.25,779699.0,5.0,4.0,407.225,20.2,8.2,81.5,558.725,44.2,3.2,18.5,15.0,12.0
max,540899.0,2002451000.0,5.0,2273.0,41770.2,4701.1,841.9,9167.2,704129.6,4320.9,835.7,3623.9,1802.9,32767.0


In [9]:
# Keywords unicas
def get_unique_keywords(df):
    """Obtener keywords unicas de un DataFrame"""
    if "keywords" in df.columns:
        keywords = df["keywords"].dropna().explode().unique()
        return [str(k).lower().replace(" ", "_") for k in keywords]
    return []
unique_keywords = get_unique_keywords(df_recipes)
print(f"✅ Keywords unicas encontradas: {len(unique_keywords)}")
unique_keywords

✅ Keywords unicas encontradas: 272


['beans',
 'vegetable',
 'low_cholesterol',
 'weeknight',
 'broil/grill',
 'oven',
 'dessert',
 'stove_top',
 '<_4_hours',
 'fruit',
 'nuts',
 'berries',
 '<_60_mins',
 'breads',
 'healthy',
 'bread_machine',
 'for_large_groups',
 'small_appliance',
 'easy',
 'poultry',
 'meat',
 'sweet',
 'cookie_&_brownie',
 '<_30_mins',
 'low_protein',
 'winter',
 'christmas',
 'canning',
 'black_beans',
 'brazilian',
 'south_american',
 'free_of...',
 'spicy',
 'lunch/snacks',
 'no_cook',
 'summer',
 'european',
 "st._patrick's_day",
 'apple',
 'onions',
 'lentil',
 'lactose_free',
 'egg_free',
 'kosher',
 'high_in...',
 'potluck',
 'mexican',
 'rice',
 'cheese',
 'southwestern_u.s.',
 'coconut',
 'potato',
 '<_15_mins',
 'chinese',
 'asian',
 'scottish',
 'refrigerator',
 'breakfast',
 'thanksgiving',
 'brunch',
 'vegan',
 'roast',
 'corn',
 'kid_friendly',
 'very_low_carbs',
 'pineapple',
 'tropical_fruits',
 'tex_mex',
 'microwave',
 'cherries',
 'spring',
 'freezer',
 'new_zealand',
 'australia

In [10]:
# Ingredientes unicos
def get_unique_ingredients(df):
    """Obtener ingredientes unicos de un DataFrame"""
    if "recipe_ingredient_parts" in df.columns:
        ingredients = df["recipe_ingredient_parts"].dropna().explode().unique()
        return [str(i).lower().replace(" ", "_") for i in ingredients]
    return []
unique_ingredients = get_unique_ingredients(df_recipes)
print(f"✅ Ingredientes unicos encontrados: {len(unique_ingredients)}")
unique_ingredients

✅ Ingredientes unicos encontrados: 3708


['extra_firm_tofu',
 'eggplant',
 'zucchini',
 'mushrooms',
 'soy_sauce',
 'low_sodium_soy_sauce',
 'olive_oil',
 'maple_syrup',
 'honey',
 'red_wine_vinegar',
 'lemon_juice',
 'garlic_cloves',
 'mustard_powder',
 'black_pepper',
 'graham_cracker_crumbs',
 'sugar',
 'butter',
 'cornstarch',
 'salt',
 'milk',
 'vanilla_extract',
 'water',
 'gelatin',
 'rum',
 'cream_of_tartar',
 'carrots',
 'eggs',
 'white_sugar',
 'all-purpose_flour',
 'baking_powder',
 'baking_soda',
 'cinnamon',
 'nutmeg',
 'golden_raisin',
 "confectioners'_sugar",
 'cream_cheese',
 'light_corn_syrup',
 'almond_paste',
 'flour',
 'egg',
 'corn_syrup',
 'vanilla',
 'brown_sugar',
 'blueberries',
 'buttermilk',
 'margarine',
 'bread_flour',
 'rolled_oats',
 'active_dry_yeast',
 'bread_machine_yeast',
 'plain_flour',
 'golden_syrup',
 'bicarbonate_of_soda',
 'pepper',
 'cayenne',
 'bock_beer',
 'unsalted_butter',
 'cracked_pepper',
 'whole_wheat_flour',
 'banana',
 'chocolate_chips',
 'powdered_sugar',
 'pears',
 'brand

# Modelo Base

In [11]:
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k


class FoodcomRecommender:
    def __init__(self,
                 df_reviews: pd.DataFrame,
                 min_ratings_per_user: int = 5,
                 min_ratings_per_item: int = 5,
                 test_percentage: float = 0.25,
                 no_components: int = 30,
                 learning_rate: float = 0.05,
                 loss: str = 'warp',
                 random_state: int = 42,
                 num_threads: int = 4,
                 verbose: bool = False):
        self.verbose = verbose

        # Filtrado
        user_counts = df_reviews['user_id'].value_counts()
        item_counts = df_reviews['item_id'].value_counts()
        keep_users  = user_counts[user_counts >= min_ratings_per_user].index
        keep_items  = item_counts[item_counts >= min_ratings_per_item].index
        self.df = df_reviews[
            df_reviews['user_id'].isin(keep_users) &
            df_reviews['item_id'].isin(keep_items)
        ].copy()

        # Dataset
        self.dataset = Dataset()
        self.dataset.fit(
            users=self.df['user_id'].unique(),
            items=self.df['item_id'].unique()
        )

        # Interactions
        interaction_tuples = self.df[['user_id','item_id','rating']].to_numpy()
        self.interactions, _ = self.dataset.build_interactions(interaction_tuples)

        # Split
        self.train, self.test = random_train_test_split(
            self.interactions,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(random_state)
        )

        # Modelo
        self.model = LightFM(
            loss=loss,
            no_components=no_components,
            learning_rate=learning_rate,
            random_state=np.random.RandomState(random_state)
        )
        self.num_threads = num_threads

    def fit(self, epochs: int = 10):
        """Entrena el modelo WARP sin sample_weight."""
        self.model.fit(
            self.train,
            epochs=epochs,
            num_threads=self.num_threads,
            verbose=self.verbose
        )

    def evaluate(self, k: int = 10) -> dict:
        prec = precision_at_k(self.model, self.test,
                              train_interactions=self.train,
                              k=k, num_threads=self.num_threads).mean()
        rec = recall_at_k(self.model, self.test,
                          train_interactions=self.train,
                          k=k, num_threads=self.num_threads).mean()
        return {f'precision@{k}': prec, f'recall@{k}': rec}

    def recommend(self, user_id, num_items: int = 10) -> pd.DataFrame:
        n_users, n_items = self.dataset.interactions_shape()
        scores = self.model.predict(user_id,
                                    np.arange(n_items),
                                    num_threads=self.num_threads)
        _, _, idx_to_item, _ = self.dataset.mapping()
        inv_map = {v: k for k, v in idx_to_item.items()}
        top_idx = np.argsort(-scores)[:num_items]
        return pd.DataFrame({
            'item_id': [inv_map[i] for i in top_idx],
            'score':     scores[top_idx]
        })

    def save(self, path: str):
        payload = {
            'model': self.model,
            'dataset': self.dataset,
            'train': self.train,
            'test': self.test,
            'num_threads': self.num_threads,
            'verbose': self.verbose,
        }
        with open(path, 'wb') as f:
            pickle.dump(payload, f)
        if self.verbose:
            print(f"[FoodcomRecommender] Guardado en '{path}'")

    @classmethod
    def load(cls, path: str, verbose: bool = False):
        with open(path, 'rb') as f:
            payload = pickle.load(f)
        rec = cls.__new__(cls)
        rec.model        = payload['model']
        rec.dataset      = payload['dataset']
        rec.train        = payload['train']
        rec.test         = payload['test']
        rec.num_threads  = payload['num_threads']
        rec.verbose      = verbose
        if verbose:
            print(f"[FoodcomRecommender] Cargado desde '{path}'")
        return rec


In [13]:
fcrec_base = FoodcomRecommender(df_reviews[['user_id', 'item_id', 'rating']],
                         min_ratings_per_user=10,
                         min_ratings_per_item=10,
                         test_percentage=0.2,
                         verbose=True)
fcrec_base.fit(epochs=20)
display(fcrec_base.evaluate(k=10))

Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.78it/s]


{'precision@10': np.float32(0.012692364),
 'recall@10': np.float64(0.023115417003666976)}

In [None]:
from itertools import product


# Optimización de hiperparámetros
def search_hyperparameters(
    df_reviews: pd.DataFrame,
    parameters: dict,
    min_ratings_per_user: int = 5,
    min_ratings_per_item: int = 5,
    test_percentage: float = 0.25,
    num_threads: int = 4,
    epochs: int = 15,
    verbose: bool = False,
):
    best_score = -1
    best_params = {}

    for params in tqdm(
        list(product(*parameters.values())), desc="Buscando hiperparámetros"
    ):
        rec = FoodcomRecommender(
            df_reviews,
            min_ratings_per_user=min_ratings_per_user,
            min_ratings_per_item=min_ratings_per_item,
            test_percentage=test_percentage,
            num_threads=num_threads,
            verbose=verbose,
            # Unpacking los parámetros como kwargs
            **dict(zip(parameters.keys(), params)),
        )
        rec.fit(epochs=epochs)
        score = rec.evaluate(k=10)["precision@10"]
        if score > best_score:
            best_score = score
            best_params = dict(zip(parameters.keys(), params))
            if verbose:
                print(f"Score: {best_score:.4f}* | Params: {best_params}")

    return best_params, best_score


# Parámetros a buscar
parameters = {
    "no_components": np.arange(1, 31, 4),
    "learning_rate": np.array([1e-2, 1e-3, 1e-4, 1e-5]),
}
# Búsqueda de hiperparámetros
best_params, best_score = search_hyperparameters(
    df_reviews,
    parameters,
    min_ratings_per_user=5,
    min_ratings_per_item=5,
    test_percentage=0.2,
    num_threads=8,
    epochs=15,
    verbose=False,
)
print(f"Mejores parámetros: {best_params} con score: {best_score:.4f}")

Buscando hiperparámetros:   0%|          | 0/32 [00:00<?, ?it/s]

Mejores parámetros: {'no_components': np.int64(29), 'learning_rate': np.float64(0.01)} con score: 0.0101


# Modelo con metadatos

In [22]:
# Modelo con metadatos basado en Food.com
class FoodcomRecommenderWithMetadata:
    def __init__(
        self,
        df_reviews: pd.DataFrame,
        df_recipes: pd.DataFrame,
        min_ratings_per_user: int = 5,
        min_ratings_per_item: int = 5,
        test_percentage: float = 0.25,
        no_components: int = 30,
        learning_rate: float = 0.05,
        loss: str = "warp",
        random_state: int = 42,
        num_threads: int = 4,
        verbose: bool = False,
    ):
        self.verbose = verbose

        # Filtrado
        user_counts = df_reviews["user_id"].value_counts()
        item_counts = df_reviews["item_id"].value_counts()
        keep_users = user_counts[user_counts >= min_ratings_per_user].index
        keep_items = item_counts[item_counts >= min_ratings_per_item].index

        self.df_reviews = df_reviews[
            df_reviews["user_id"].isin(keep_users)
            & df_reviews["item_id"].isin(keep_items)
        ].copy()

        # Filtrar recipes para que coincidan con items válidos
        self.df_recipes = df_recipes[df_recipes["item_id"].isin(keep_items)].copy()

        # Crear features de items basadas en metadatos de recetas
        self.item_features_map = self._create_item_features()

        # Dataset
        self.dataset = Dataset()
        all_item_features = set()
        for features in self.item_features_map.values():
            all_item_features.update(features)

        self.dataset.fit(
            users=self.df_reviews["user_id"].unique(),
            items=self.df_reviews["item_id"].unique(),
            item_features=list(all_item_features),
        )

        # Interactions
        interaction_tuples = self.df_reviews[
            ["user_id", "item_id", "rating"]
        ].to_numpy()
        self.interactions, _ = self.dataset.build_interactions(interaction_tuples)

        # Item features - only for items that exist in the dataset
        valid_items = set(self.df_reviews["item_id"].unique())
        item_feat_tuples = [
            (item_id, features)
            for item_id, features in self.item_features_map.items()
            if item_id in valid_items
        ]
        self.item_features = self.dataset.build_item_features(item_feat_tuples)

        # Split
        self.train, self.test = random_train_test_split(
            self.interactions,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(random_state),
        )

        # Modelo
        self.model = LightFM(
            loss=loss,
            no_components=no_components,
            learning_rate=learning_rate,
            random_state=np.random.RandomState(random_state),
        )
        self.num_threads = num_threads

    def _create_item_features(self):
        """Crear features basadas en metadatos de recetas"""
        features_map = {}

        for _, recipe in self.df_recipes.iterrows():
            features = []

            # Features categóricas
            if pd.notna(recipe.get("recipe_category")):
                category = str(recipe["recipe_category"]).lower().replace(" ", "_")
                features.append(f"category_{category}")

            # Keywords
            keywords = recipe.get("keywords")
            if isinstance(keywords, np.ndarray):
                keyword_buckets = self._bucket_keywords(recipe["keywords"])
                for keyword_bucket in keyword_buckets:
                    # Normalizar keywords
                    keyword_bucket = str(keyword_bucket).lower().replace(" ", "_")
                    features.append(keyword_bucket)

            # Ingredientes
            #ingredients = recipe.get("recipe_ingredient_parts")
            #if isinstance(ingredients, np.ndarray):
            #    ingredients = [str(i).lower().replace(" ", "_") for i in ingredients]
            #    for ingredient in ingredients:
            #        features.append(f"ingredient_{ingredient}")

            # Features numéricas bucketizadas
            if pd.notna(recipe.get('calories')):
               cal_bucket = self._bucket_calories(recipe['calories'])
               features.append(f"calories_{cal_bucket}")

            if pd.notna(recipe.get('fat_content')):
               fat_bucket = self._bucket_fat(recipe['fat_content'])
               features.append(f"fat_{fat_bucket}")

            if pd.notna(recipe.get('protein_content')):
               protein_bucket = self._bucket_protein(recipe['protein_content'])
               features.append(f"protein_{protein_bucket}")

            # Features de tiempo de cocción
            # if pd.notna(recipe.get('total_time')):
            #    time_bucket = self._bucket_time(recipe['total_time'])
            #    features.append(f"time_{time_bucket}")

            # Rating promedio de la receta
            if pd.notna(recipe.get("aggregated_rating")):
                rating_bucket = self._bucket_rating(recipe["aggregated_rating"])
                features.append(f"avg_rating_{rating_bucket}")

            # Si no hay features, agregar una por defecto
            if not features:
                features.append("no_metadata")

            features_map[recipe["item_id"]] = features

        return features_map

    def _bucket_keywords(self, keywords):
        """Bucketizar keywords"""
        # Diet restriction keywords categorization
        vegan_keywords = [
            "vegan",
            "soy/tofu",
            "tempeh",
            "beans",
            "black_beans",
            "lentil",
            "vegetable",
            "fruit",
            "nuts",
            "grains",
            "oatmeal",
        ]

        vegetarian_keywords = [
            "vegetarian",
            "egg_free",
            "dairy_free_foods",
            "cheese",
            "nuts",
            "beans",
            "vegetable",
            "fruit",
            "grains",
        ]

        gluten_free_keywords = [
            "free_of...",
            "rice",
            "long_grain_rice",
            "short_grain_rice",
            "white_rice",
            "brown_rice",
            "medium_grain_rice",
        ]

        low_carb_keywords = [
            "very_low_carbs",
            "low_cholesterol",
            "high_protein",
            "meat",
            "poultry",
            "chicken",
            "beef_organ_meats",
            "steak",
            "pork",
            "lamb/sheep",
        ]

        dairy_free_keywords = [
            "dairy_free_foods",
            "lactose_free",
            "egg_free",
            "soy/tofu",
            "coconut",
        ]

        kosher_keywords = ["kosher", "no_shell_fish", "beef_liver", "chicken_livers"]

        healthy_keywords = [
            "healthy",
            "low_cholesterol",
            "high_fiber",
            "low_protein",
            "high_protein",
            "fruit",
            "vegetable",
            "grains",
        ]

        allergen_free_keywords = [
            "egg_free",
            "lactose_free",
            "dairy_free_foods",
            "no_shell_fish",
            "free_of...",
        ]
        if isinstance(keywords, np.ndarray):
            keywords = [str(k).lower().replace(" ", "_") for k in keywords]

            buckets = {
                "vegan": any(k in keywords for k in vegan_keywords),
                "vegetarian": any(k in keywords for k in vegetarian_keywords),
                "gluten_free": any(k in keywords for k in gluten_free_keywords),
                "low_carb": any(k in keywords for k in low_carb_keywords),
                "dairy_free": any(k in keywords for k in dairy_free_keywords),
                "kosher": any(k in keywords for k in kosher_keywords),
                "healthy": any(k in keywords for k in healthy_keywords),
                "allergen_free": any(k in keywords for k in allergen_free_keywords),
            }

            return [f"keyword_{k}" for k, v in buckets.items() if v]
        return []

    def _bucket_calories(self, calories):
        """Bucketizar calorías"""
        if calories < 200:
            return "low"
        elif calories < 400:
            return "medium"
        elif calories < 600:
            return "high"
        else:
            return "very_high"

    def _bucket_fat(self, fat):
        """Bucketizar contenido de grasa"""
        if fat < 10:
            return "low"
        elif fat < 20:
            return "medium"
        elif fat < 30:
            return "high"
        else:
            return "very_high"

    def _bucket_protein(self, protein):
        """Bucketizar contenido de proteína"""
        if protein < 10:
            return "low"
        elif protein < 20:
            return "medium"
        elif protein < 30:
            return "high"
        else:
            return "very_high"

    def _bucket_rating(self, rating):
        """Bucketizar rating promedio"""
        if rating < 3.0:
            return "low"
        elif rating < 4.0:
            return "medium"
        elif rating < 4.5:
            return "high"
        else:
            return "excellent"

    def fit(self, epochs: int = 10):
        """Entrena el modelo con metadatos"""
        self.model.fit(
            self.train,
            item_features=self.item_features,
            epochs=epochs,
            num_threads=self.num_threads,
            verbose=self.verbose,
        )

    def evaluate(self, k: int = 10) -> dict:
        prec = precision_at_k(
            self.model,
            self.test,
            train_interactions=self.train,
            item_features=self.item_features,
            k=k,
            num_threads=self.num_threads,
        ).mean()
        rec = recall_at_k(
            self.model,
            self.test,
            train_interactions=self.train,
            item_features=self.item_features,
            k=k,
            num_threads=self.num_threads,
        ).mean()
        return {f"precision@{k}": prec, f"recall@{k}": rec}

    def recommend(self, user_id, num_items: int = 10) -> pd.DataFrame:
        n_users, n_items = self.dataset.interactions_shape()
        scores = self.model.predict(
            user_id,
            np.arange(n_items),
            item_features=self.item_features,
            num_threads=self.num_threads,
        )
        user_id_map, user_features_map, item_id_map, item_features_map = (
            self.dataset.mapping()
        )
        inv_map = {v: k for k, v in item_id_map.items()}
        top_idx = np.argsort(-scores)[:num_items]
        return pd.DataFrame(
            {"item_id": [inv_map[i] for i in top_idx], "score": scores[top_idx]}
        )


fcrec_meta = FoodcomRecommenderWithMetadata(
    df_reviews[["user_id", "item_id", "rating"]],
    df_recipes,
    min_ratings_per_user=5,
    min_ratings_per_item=5,
    test_percentage=0.2,
    verbose=True,
)
fcrec_base.fit(epochs=20)
display(fcrec_base.evaluate(k=10))

Epoch: 100%|██████████| 20/20 [00:04<00:00,  4.74it/s]


{'precision@10': np.float32(0.012135425),
 'recall@10': np.float64(0.02183844790053365)}

In [None]:
from itertools import product


# Optimización de hiperparámetros
def search_hyperparameters_meta(
    df_reviews: pd.DataFrame,
    df_recipes: pd.DataFrame,
    parameters: dict,
    min_ratings_per_user: int = 5,
    min_ratings_per_item: int = 5,
    test_percentage: float = 0.25,
    num_threads: int = 4,
    epochs: int = 15,
    verbose: bool = False,
):
    best_score = -1
    best_params = {}

    for params in tqdm(
        list(product(*parameters.values())), desc="Buscando hiperparámetros"
    ):
        rec = FoodcomRecommenderWithMetadata(
            df_reviews,
            df_recipes,
            min_ratings_per_user=min_ratings_per_user,
            min_ratings_per_item=min_ratings_per_item,
            test_percentage=test_percentage,
            num_threads=num_threads,
            verbose=verbose,
            # Unpacking los parámetros como kwargs
            **dict(zip(parameters.keys(), params)),
        )
        rec.fit(epochs=epochs)
        score = rec.evaluate(k=10)["precision@10"]
        if score > best_score:
            best_score = score
            best_params = dict(zip(parameters.keys(), params))
            if verbose:
                print(f"Score: {best_score:.4f}* | Params: {best_params}")

    return best_params, best_score


# Parámetros a buscar
parameters = {
    "no_components": np.arange(1, 31, 4),
    "learning_rate": np.array([1e-2, 1e-3, 1e-4, 1e-5]),
}
# Búsqueda de hiperparámetros
best_params_meta, best_score_meta = search_hyperparameters_meta(
    df_reviews,
    df_recipes,
    parameters,
    min_ratings_per_user=5,
    min_ratings_per_item=5,
    test_percentage=0.2,
    num_threads=8,
    epochs=15,
    verbose=False,
)
print(f"Mejores parámetros: {best_params_meta} con score: {best_score_meta:.4f}")

Buscando hiperparámetros:   0%|          | 0/32 [00:00<?, ?it/s]

Mejores parámetros: {'no_components': np.int64(25), 'learning_rate': np.float64(0.01)} con score: 0.0097


In [30]:
# Crear y entrenar el modelo con metadatos
print("Creando modelos...")
fcrec_base = FoodcomRecommender(
    df_reviews[["user_id", "item_id", "rating"]],
    min_ratings_per_user=5,
    min_ratings_per_item=5,
    test_percentage=0.2,
    num_threads=8,
    verbose=True,
    **best_params,  # Usar los mejores parámetros encontrados
)
fcrec_meta = FoodcomRecommenderWithMetadata(
    df_reviews[["user_id", "item_id", "rating"]],
    df_recipes,
    min_ratings_per_user=5,
    min_ratings_per_item=5,
    test_percentage=0.2,
    num_threads=8,
    verbose=True,
    **best_params_meta,  # Usar los mejores parámetros encontrados
)

print("Entrenando modelos...")
fcrec_base.fit(epochs=50)
fcrec_meta.fit(epochs=50)
print("Evaluando modelos...")
results_base = fcrec_base.evaluate(k=10)
print(f"Resultados modelo base: {results_base}")
results_meta = fcrec_meta.evaluate(k=10)
print(f"Resultados con metadatos: {results_meta}")

# Comparar con modelo base
print(
    f"Mejora precision: {results_meta['precision@10'] - results_base['precision@10']:.4f}"
)
print(f"Mejora recall: {results_meta['recall@10'] - results_base['recall@10']:.4f}")
print(
    f"Mejora porcentual precision: {(results_meta['precision@10'] - results_base['precision@10']) / results_base['precision@10'] * 100:.2f}%"
)
print(
    f"Mejora porcentual recall: {(results_meta['recall@10'] - results_base['recall@10']) / results_base['recall@10'] * 100:.2f}%"
)

Creando modelos...
Entrenando modelos...


Epoch: 100%|██████████| 50/50 [00:10<00:00,  4.59it/s]
Epoch: 100%|██████████| 50/50 [00:20<00:00,  2.44it/s]


Evaluando modelos...
Resultados modelo base: {'precision@10': np.float32(0.0100793755), 'recall@10': np.float64(0.021877840708197465)}
Resultados con metadatos: {'precision@10': np.float32(0.010256751), 'recall@10': np.float64(0.02199487331789366)}
Mejora precision: 0.0002
Mejora recall: 0.0001
Mejora porcentual precision: 1.76%
Mejora porcentual recall: 0.53%
