# Modelo V1

In [1]:
# FLAGS
FLAG_DATASET_DL = False

## 1. Carga de datos y librerías

In [2]:
# Actualizar pip
%pip install --upgrade pip
# Instalar versión específica de numpy
%pip install numpy~=1.26
# Dependencias para graficar y manipular datos
%pip install pandas matplotlib tqdm seaborn ipywidgets
# Dependencias para leer archivos Parquet
%pip install pyarrow fastparquet
# Para predicción
%pip install scikit-surprise scikit-learn lightfm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importar librerías

# Manejo de datos y visualización
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm

# Surprise
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split, cross_validate

# LightFM (recomendación basada en contenido)
from lightfm import LightFM
from scipy.sparse import coo_matrix
from sklearn.metrics import precision_score, recall_score

# Crear directorios
PATH_DATASETS = "datasets"
PATH_DATASETS_FOODCOM = os.path.join(PATH_DATASETS, "foodcom")
PATH_DATASETS_MEALRECPLUS = os.path.join(PATH_DATASETS, "mealrecplus")
PATH_MEALRECPLUS_META_DATA = os.path.join(PATH_DATASETS_MEALRECPLUS, "MealRecPlus-main/MealRec+/MealRec+H/meta_data/")
PATH_MEALRECPLUS_HEALTHINESS = os.path.join(PATH_DATASETS_MEALRECPLUS, "MealRecPlus-main/MealRec+/MealRec+H/healthiness/")
os.makedirs(PATH_DATASETS, exist_ok=True)

In [4]:
if FLAG_DATASET_DL:
  # Descargar "Recipes and Reviews" de Food.com
  os.system(f"curl -L -o {PATH_DATASETS_FOODCOM}.zip https://www.kaggle.com/api/v1/datasets/download/irkaal/foodcom-recipes-and-reviews")
  os.system(f"unzip -o {PATH_DATASETS_FOODCOM}.zip -d {PATH_DATASETS_FOODCOM}")
  # Eliminar el zip
  os.remove(f"{PATH_DATASETS_FOODCOM}.zip")

  # Cita: Dominio público (https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews)

  # Descargar "MealRecPlus" de WUT-IDEA
  os.system(f"curl -L -o {PATH_DATASETS_MEALRECPLUS}.zip https://github.com/WUT-IDEA/MealRecPlus/archive/refs/heads/main.zip")
  os.system(f"unzip -o {PATH_DATASETS_MEALRECPLUS}.zip -d {PATH_DATASETS_MEALRECPLUS}")
  os.system(f"unzip -o {os.path.join(PATH_MEALRECPLUS_META_DATA, 'course')}.zip -d {os.path.join(PATH_MEALRECPLUS_META_DATA, 'course')}")
  os.system(f"unzip -o {os.path.join(PATH_MEALRECPLUS_META_DATA, 'user_course')}.zip -d {os.path.join(PATH_MEALRECPLUS_META_DATA, 'user_course')}")
  # Eliminar el zip
  os.remove(f"{PATH_DATASETS_MEALRECPLUS}.zip")
  os.remove(os.path.join(PATH_MEALRECPLUS_META_DATA, "course.zip"))
  os.remove(os.path.join(PATH_MEALRECPLUS_META_DATA, "user_course.zip"))
  # Cita: Ming Li, Lin Li, Xiaohui Tao, and Jimmy Xiangji Huang. 2024. MealRec+: A Meal Recommendation Dataset with Meal-Course Affiliation for Personal- ization and Healthiness. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR ’24), July 14–18, 2024, Washington, DC, USA. ACM, New York, NY, USA, 11 pages. https://doi.org/10.1145/3626772.3657857 (https://github.com/WUT-IDEA/MealRecPlus)

In [5]:
# Foodcom dataframes
df_foodcom_recipes = pd.read_parquet(os.path.join(PATH_DATASETS_FOODCOM, "recipes.parquet"))
df_foodcom_reviews = pd.read_parquet(os.path.join(PATH_DATASETS_FOODCOM, "reviews.parquet"))

In [6]:
# Recetas de Foodcom
df_foodcom_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype              
---  ------                      --------------   -----              
 0   RecipeId                    522517 non-null  float64            
 1   Name                        522517 non-null  object             
 2   AuthorId                    522517 non-null  int32              
 3   AuthorName                  522517 non-null  object             
 4   CookTime                    439972 non-null  object             
 5   PrepTime                    522517 non-null  object             
 6   TotalTime                   522517 non-null  object             
 7   DatePublished               522517 non-null  datetime64[us, UTC]
 8   Description                 522512 non-null  object             
 9   Images                      522516 non-null  object             
 10  RecipeCategory              521766 non-null 

In [7]:
# Reviews de Foodcom
df_foodcom_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   ReviewId       1401982 non-null  int32              
 1   RecipeId       1401982 non-null  int32              
 2   AuthorId       1401982 non-null  int32              
 3   AuthorName     1401982 non-null  object             
 4   Rating         1401982 non-null  int32              
 5   Review         1401982 non-null  object             
 6   DateSubmitted  1401982 non-null  datetime64[us, UTC]
 7   DateModified   1401982 non-null  datetime64[us, UTC]
dtypes: datetime64[us, UTC](2), int32(4), object(2)
memory usage: 64.2+ MB


In [8]:
# MealRecPlus dataframes
df_mealrecplus_user_course = pd.read_csv(
    os.path.join(PATH_MEALRECPLUS_META_DATA, "user_course", "user_course.csv"),
    names=["user_id", "course_id", "rating", "dateLastModified"],
    header=None
)
# Convertir la columna 'rating' a tipo numérico, forzando los errores a NaN
df_mealrecplus_user_course['rating'] = pd.to_numeric(df_mealrecplus_user_course['rating'], errors='coerce')
df_mealrecplus_user_course.dropna(subset=['rating'], inplace=True)

# Cargar metadatos de platos
df_mealrecplus_course = pd.read_csv(os.path.join(PATH_MEALRECPLUS_META_DATA, "course", "course.csv"))

# Cargar los índices de usuarios y cursos
df_mealrecplus_user2index = pd.read_csv(os.path.join(PATH_MEALRECPLUS_META_DATA, "user2index.txt"), sep="\t", names=["user_id", "user_index"])
df_mealrecplus_course2index = pd.read_csv(os.path.join(PATH_MEALRECPLUS_META_DATA, "course2index.txt"), sep="\t", names=["course_id", "course_index"])

# Healthiness scores for MealRecPlus
df_mealrecplus_course_fsa = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "course_fsa.txt"), header=None, names=["fsa_score"])
df_mealrecplus_course_who = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "course_who.txt"), header=None, names=["who_score"])

# Puntajes por comida (meal)
df_mealrecplus_meal_fsa = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "meal_fsa.txt"), header=None, names=["fsa_score"])
df_mealrecplus_meal_who = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "meal_who.txt"), header=None, names=["who_score"])

# Puntajes promedio por usuario según sus comidas
df_mealrecplus_user_fsa = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "user_fsa.txt"), header=None, names=["fsa_score"])
df_mealrecplus_user_who = pd.read_csv(os.path.join(PATH_MEALRECPLUS_HEALTHINESS, "user_who.txt"), header=None, names=["who_score"])

  df_mealrecplus_user_course = pd.read_csv(


In [9]:
df_mealrecplus_user_course.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151148 entries, 1 to 151148
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   user_id           151148 non-null  object 
 1   course_id         151148 non-null  object 
 2   rating            151148 non-null  float64
 3   dateLastModified  151148 non-null  object 
dtypes: float64(1), object(3)
memory usage: 5.8+ MB


In [10]:
df_mealrecplus_course.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7280 entries, 0 to 7279
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   course_id           7280 non-null   int64  
 1   course_name         7280 non-null   object 
 2   review_nums         7280 non-null   int64  
 3   category            7280 non-null   object 
 4   aver_rate           7280 non-null   float64
 5   image_url           7280 non-null   object 
 6   ingredients         7280 non-null   object 
 7   cooking_directions  7280 non-null   object 
 8   nutritions          7280 non-null   object 
 9   reviews             7280 non-null   object 
 10  tags                7278 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 625.8+ KB


In [11]:
df_mealrecplus_user2index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   user_id     1575 non-null   int64
 1   user_index  1575 non-null   int64
dtypes: int64(2)
memory usage: 24.7 KB


In [12]:
df_mealrecplus_course2index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7280 entries, 0 to 7279
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   course_id     7280 non-null   int64
 1   course_index  7280 non-null   int64
dtypes: int64(2)
memory usage: 113.9 KB


In [13]:
df_mealrecplus_course_fsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7280 entries, 0 to 7279
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   fsa_score  7280 non-null   int64
dtypes: int64(1)
memory usage: 57.0 KB


In [14]:
df_mealrecplus_course_who.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7280 entries, 0 to 7279
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   who_score  7280 non-null   int64
dtypes: int64(1)
memory usage: 57.0 KB


In [15]:
df_mealrecplus_meal_fsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fsa_score  3817 non-null   float64
dtypes: float64(1)
memory usage: 29.9 KB


In [16]:
df_mealrecplus_meal_who.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   who_score  3817 non-null   float64
dtypes: float64(1)
memory usage: 29.9 KB


In [17]:
df_mealrecplus_user_fsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   fsa_score  1575 non-null   float64
dtypes: float64(1)
memory usage: 12.4 KB


In [18]:
df_mealrecplus_user_who.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   who_score  1575 non-null   float64
dtypes: float64(1)
memory usage: 12.4 KB


In [19]:
display(df_foodcom_recipes.head())
display(df_foodcom_reviews.head())

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39.0,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41.0,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
4,42.0,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,..."


Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25 21:44:00+00:00,2000-01-25 21:44:00+00:00
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17 16:49:59+00:00,2001-10-17 16:49:59+00:00
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25 09:00:00+00:00,2000-02-25 09:00:00+00:00
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13 21:15:00+00:00,2000-03-13 21:15:00+00:00
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28 12:51:00+00:00,2000-03-28 12:51:00+00:00


## 2. Preprocesamiento de datos

### 2.1. Food.com

En df_foodcom_reviews:
- "AuthorId" -> "user_id" (int)
- "RecipeId" -> "recipe_id" (int)
- "Rating" -> "rating" (float)

y mantenemos las columnas "DateSubmitted" y "ReviewId".

In [20]:
df_ratings = df_foodcom_reviews.rename(columns={
    'AuthorId': 'user_id',
    'RecipeId': 'recipe_id',
    'Rating'  : 'rating'
})[['user_id', 'recipe_id', 'rating', 'DateSubmitted', 'ReviewId']]

df_ratings['user_id']   = df_ratings['user_id'].astype(int)
df_ratings['recipe_id'] = df_ratings['recipe_id'].astype(int)
df_ratings['rating']    = df_ratings['rating'].astype(float)

# Mostrar información del dataframe de ratings
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   user_id        1401982 non-null  int64              
 1   recipe_id      1401982 non-null  int64              
 2   rating         1401982 non-null  float64            
 3   DateSubmitted  1401982 non-null  datetime64[us, UTC]
 4   ReviewId       1401982 non-null  int32              
dtypes: datetime64[us, UTC](1), float64(1), int32(1), int64(2)
memory usage: 48.1 MB


En df_foodcom_recipes:
- "RecipeId" -> "recipe_id" (int)

y mantenemos las columnas, anotando como filtro las nutricionales.

In [21]:
df_recipes = df_foodcom_recipes.rename(columns={
    'RecipeId': 'recipe_id'
}).copy()

df_recipes['recipe_id'] = df_recipes['recipe_id'].astype(int)

nutri_cols = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent'
]

### 2.2. Filtrado de datos

In [22]:
# Filtrado inicial para evitar dataset muy disperso

# Contemos cuántos ratings por usuario y por receta
user_counts   = df_ratings['user_id'].value_counts()
recipe_counts = df_ratings['recipe_id'].value_counts()

# Nos quedamos solo con:
# - Usuarios con >= 5 ratings
# - Recetas con >= 5 ratings
users_filt   = user_counts[user_counts >= 5].index
recipes_filt = recipe_counts[recipe_counts >= 5].index

df_ratings_filt = df_ratings[
    df_ratings['user_id'].isin(users_filt) & 
    df_ratings['recipe_id'].isin(recipes_filt)
].copy()

print(f"Ratings originales: {len(df_ratings):,} | Filtrados: {len(df_ratings_filt):,}")


Ratings originales: 1,401,982 | Filtrados: 783,379


### 2.3 Merge de metadatos

Por ahora, solo usaremos una flag para probar el prototipo:
- `is_vegan` (bool): indica si la receta es vegana o no.

In [23]:
# Merge con metadatos de receta

# Creamos en df_recipes la flag is_vegan:
df_recipes['RecipeCategory'] = df_recipes['RecipeCategory'].fillna('').astype(str)
df_recipes['is_vegan'] = df_recipes['RecipeCategory'].str.contains(
    'Vegan', case=False, na=False
).astype(int)

# Hacemos merge de df_ratings_filt con df_recipes
df_all = df_ratings_filt.merge(
    df_recipes[['recipe_id', *nutri_cols,'is_vegan']],
    on='recipe_id',
    how='left'
)

# Comprobamos NA en nutritional cols (si hay, se rellenan con 0)
df_all[nutri_cols] = df_all[nutri_cols].fillna(0.0)
df_all['is_vegan'] = df_all['is_vegan'].fillna(0).astype(int)

In [24]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783379 entries, 0 to 783378
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype              
---  ------               --------------   -----              
 0   user_id              783379 non-null  int64              
 1   recipe_id            783379 non-null  int64              
 2   rating               783379 non-null  float64            
 3   DateSubmitted        783379 non-null  datetime64[us, UTC]
 4   ReviewId             783379 non-null  int32              
 5   Calories             783379 non-null  float64            
 6   FatContent           783379 non-null  float64            
 7   SaturatedFatContent  783379 non-null  float64            
 8   CholesterolContent   783379 non-null  float64            
 9   SodiumContent        783379 non-null  float64            
 10  CarbohydrateContent  783379 non-null  float64            
 11  FiberContent         783379 non-null  float64            
 12  Su

## 3. Baselines

### 3.1. Colaborative Based

In [25]:
# ================================
# Baseline
# ================================


## 4. Modelo

Se probará un prototipo hibrido basado en LightFM

In [68]:
# ========================================================
# Híbrido con LightFM (colaborative + features nutricion)
# ========================================================

# Creamos un índice numérico para usuarios y para recetas (solo del df_all filtrado)
user_cat   = df_all['user_id'].astype('category')
item_cat   = df_all['recipe_id'].astype('category')
user_map   = dict(enumerate(user_cat.cat.categories))
item_map   = dict(enumerate(item_cat.cat.categories))
inv_user_map = {v:k for k,v in user_map.items()}
inv_item_map = {v:k for k,v in item_map.items()}

n_users = len(user_map)
n_items = len(item_map)

# Construimos la matriz de interacción (sparse COO)
rows = df_all['user_id'].map(inv_user_map).values
cols = df_all['recipe_id'].map(inv_item_map).values
data = df_all['rating'].values
interaction_matrix = coo_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Preparamos features de ítem:
ordered_recipe_ids = item_cat.cat.categories
recipes_indexed = df_recipes.set_index('recipe_id')
feature_columns = ['is_vegan', 'Calories']

# Creamos un DataFrame con las características de los ítems
item_features_df = recipes_indexed.reindex(ordered_recipe_ids)[feature_columns]
item_features_df = item_features_df.fillna(0.0)

# Convertimos a sparse matrix
item_features_mat = coo_matrix(item_features_df.values)

# Entrenamos un LightFM con WARP (buen para ranking implícito)
model_lfm = LightFM(loss='warp')
model_lfm.fit(interaction_matrix, 
              item_features=item_features_mat, 
              epochs=10, 
              num_threads=8,
              verbose=True)

# Creamos función para recomendar top-K con LightFM
def recommend_lightfm(model: LightFM, user_id, k=10):
    """
    Dado un user_id (entero original), devolvemos top-k recipe_ids.
    """
    uidx = inv_user_map[user_id]
    scores = model.predict(user_ids=[uidx for _ in range(n_items)],
                           item_ids=np.arange(n_items),
                           item_features=item_features_mat,
                           num_threads=8)
    top_items = np.argsort(-scores)[:k]
    return [item_map[idx] for idx in top_items]

# Ejemplo: recomendaciones para un usuario de prueba
test_user_id = df_all['user_id'].sample(1).iloc[0]
print("Usuario de prueba:", test_user_id)
recomendations = recommend_lightfm(model_lfm, test_user_id, k=10)
print("Recomendaciones LightFM (IDs):", recomendations)
# Mostrar las recetas recomendadas
recommended_recipes = df_recipes[df_recipes['recipe_id'].isin(recomendations)]
print("Recetas recomendadas:")
display(recommended_recipes[['recipe_id', 'Name', 'Description', 'is_vegan']])

Epoch: 100%|██████████| 10/10 [00:11<00:00,  1.12s/it]

Usuario de prueba: 311003
Recomendaciones LightFM (IDs): [4284, 248608, 441092, 53903, 500434, 411314, 363521, 26191, 46879, 477981]
Recetas recomendadas:





Unnamed: 0,recipe_id,Name,Description,is_vegan
2692,4284,Pumpkin Puree,Make and share this Pumpkin Puree recipe from ...,1
22721,26191,The Best All Purpose Cleaner,This is by far the best all purpose cleaner I'...,0
42961,46879,Dates Stuffed with Cream Cheese and Pecans,This is a recipe that my mother makes as a qui...,0
49830,53903,Breastmilk Butter,Make and share this Breastmilk Butter recipe f...,1
238793,248608,Sweet Potato Chews for Dogs,One of my coworkers made these for her Yorkie ...,1
350586,363521,Clorox Anywhere Spray Copycat,Make and share this Clorox Anywhere Spray Copy...,0
396795,411314,A Perfect Eye of Round Roast Beef,"Without fail, my mother-in-law made an eye of ...",0
425339,441092,How to Season Cast Iron Pans,I use this to make my cast iron pans perfect a...,1
460979,477981,Fried Fresh Corn With Bacon Grease,Make and share this Fried Fresh Corn With Baco...,0
482634,500434,Tony Chachere's Creole Seasoning (Copycat),Make and share this Tony Chachere's Creole Sea...,1


## 5. Evaluación

In [70]:
# Definimos un split “leave-one-out” manual: 
# Tomamos aleatoriamente un rating por usuario para test, y el resto para train.
test_interactions = []
train_interactions = []

np.random.seed(1)  # Para reproducibilidad
for u in df_all['user_id'].unique():
    df_u = df_all[df_all['user_id']==u]
    test_idx = np.random.choice(df_u.index, size=1, replace=False)
    for idx in df_u.index:
        if idx == test_idx:
            test_interactions.append((df_u.at[idx, 'user_id'], df_u.at[idx, 'recipe_id']))
        else:
            train_interactions.append((df_u.at[idx, 'user_id'], df_u.at[idx, 'recipe_id']))

# Creamos matrices de train/test en formato sparse
# para LightFM solo necesitamos la matriz binaria (interactions >0), 
# aunque podríamos normalizar ratings a 1/0.
def build_interaction_matrix(interactions, user_map, item_map, shape):
    rows, cols, vals = [], [], []
    for (u,i) in interactions:
        if u in inv_user_map and i in inv_item_map:
            rows.append(inv_user_map[u])
            cols.append(inv_item_map[i])
            vals.append(1.0)  # binario
    return coo_matrix((vals, (rows, cols)), shape=shape)

train_mat = build_interaction_matrix(train_interactions, user_map, item_map, (n_users, n_items))
test_mat  = build_interaction_matrix(test_interactions,  user_map, item_map, (n_users, n_items))

# Entrenamos LightFM en train_mat (dejamos fuera test_mat)
model_lfm_loo = LightFM(loss='warp')
model_lfm_loo.fit(train_mat, item_features=item_features_mat, epochs=10, num_threads=8, verbose=True)

# Funciones de Precision@K / Recall@K para LightFM
def precision_recall_at_k_lightfm(model: LightFM, train_mat, test_mat, k=10):
    n_users, n_items = train_mat.shape
    precisions = []
    recalls    = []
    
    for uidx in tqdm(range(n_users), desc="Calculando Precision/Recall"):
        # Items reales en test para este usuario
        true_items = set(test_mat.getrow(uidx).nonzero()[1])
        if len(true_items)==0:
            continue
        
        # Items que el usuario ya vio en train (para excluirlos)
        train_items = set(train_mat.getrow(uidx).nonzero()[1])
        
        # Score para todos los ítems
        scores = model.predict(user_ids=[uidx for _ in range(n_items)], 
                               item_ids=np.arange(n_items), 
                               item_features=item_features_mat,
                               num_threads=8)
        # Excluir ítems de train
        scores_filtered = [(i, s) for i, s in enumerate(scores) if i not in train_items]
        scores_filtered.sort(key=lambda x: x[1], reverse=True)
        
        # Tomamos top-k
        top_k = [i for (i,_) in scores_filtered[:k]]
        hits = len(set(top_k) & true_items)
        
        precisions.append(hits / k)
        recalls.append(hits / len(true_items))
    
    return np.mean(precisions), np.mean(recalls)

prec_lfm, rec_lfm = precision_recall_at_k_lightfm(model_lfm_loo, train_mat, test_mat, k=10)
print(f"LightFM (WARP) → Precision@10: {prec_lfm:.4f}  /  Recall@10: {rec_lfm:.4f}")

Epoch: 100%|██████████| 10/10 [00:10<00:00,  1.01s/it]


Calculando Precision/Recall:   0%|          | 0/27626 [00:00<?, ?it/s]

LightFM (WARP) → Precision@10: 0.0000  /  Recall@10: 0.0003
