<a href="https://colab.research.google.com/github/ResistorCat/recsys-project/blob/feat%2Fnew_model/LightFM_Meal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
FLAG_DATASET_DL = True

## 1. Carga de datos y librerías

In [2]:
# Actualizar pip
%pip install --upgrade pip
# Dependencias para graficar y manipular datos
%pip install pandas matplotlib tqdm seaborn ipywidgets
# Dependencias para leer archivos Parquet
%pip install pyarrow fastparquet
# Para predicción
%pip install scikit-learn lightfm recommenders torch_geometric


Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp3

In [3]:
# Importar librerías globales

# Manejo de datos y visualización
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm
import zipfile


# Crear directorios
PATH_DATASETS = "datasets"
PATH_DATASETS_MEALRECPLUS = os.path.join(PATH_DATASETS, "mealrecplus")
os.makedirs(PATH_DATASETS, exist_ok=True)

In [4]:
PATH_MEALRECPLUS_META_DATA = "/content/datasets/mealrecplus/MealRecPlus-main/MealRec+/MealRec+H/meta_data/"
PATH_MEALRECPLUS_HEALTHINESS = "/content/datasets/mealrecplus/MealRecPlus-main/MealRec+/MealRec+H/healthiness/"

In [5]:
if FLAG_DATASET_DL:
  # Descargar "MealRecPlus" de WUT-IDEA
  os.system(f"curl -L -o {PATH_DATASETS_MEALRECPLUS}.zip https://github.com/WUT-IDEA/MealRecPlus/archive/refs/heads/main.zip")
  os.system(f"unzip -o {PATH_DATASETS_MEALRECPLUS}.zip -d {PATH_DATASETS_MEALRECPLUS}")
  # Eliminar el zip
  os.remove(f"{PATH_DATASETS_MEALRECPLUS}.zip")
  # Cita: Ming Li, Lin Li, Xiaohui Tao, and Jimmy Xiangji Huang. 2024. MealRec+: A Meal Recommendation Dataset with Meal-Course Affiliation for Personal- ization and Healthiness. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR ’24), July 14–18, 2024, Washington, DC, USA. ACM, New York, NY, USA, 11 pages. https://doi.org/10.1145/3626772.3657857 (https://github.com/WUT-IDEA/MealRecPlus)
  zip_path_course = PATH_MEALRECPLUS_META_DATA+"course.zip"
  zip_path_user_course = PATH_MEALRECPLUS_META_DATA+"user_course.zip"
  extract_path_course = PATH_MEALRECPLUS_META_DATA

  os.makedirs(extract_path_course, exist_ok=True)

  with zipfile.ZipFile(zip_path_course, 'r') as zip_ref:
      zip_ref.extractall(extract_path_course)

  with zipfile.ZipFile(zip_path_user_course, 'r') as zip_ref:
      zip_ref.extractall(extract_path_course)

  print("✅ Archivos descomprimidos en:", extract_path_course)


✅ Archivos descomprimidos en: /content/datasets/mealrecplus/MealRecPlus-main/MealRec+/MealRec+H/meta_data/


## 2. Preprocesamiento de datos

In [6]:
# Cargar interacciones usuario-plato (ratings)
df_user_course = pd.read_csv(
    PATH_MEALRECPLUS_META_DATA+"user_course.csv",
    names=["user_id", "course_id", "rating", "dateLastModified"],
    header=None
)

# Convertir la columna 'rating' a tipo numérico, forzando los errores a NaN
df_user_course['rating'] = pd.to_numeric(df_user_course['rating'], errors='coerce')
df_user_course.dropna(subset=['rating'], inplace=True)

df_course = pd.read_csv(PATH_MEALRECPLUS_META_DATA+"course.csv")

df_user2index = pd.read_csv(PATH_MEALRECPLUS_META_DATA+"user2index.txt", sep="\t", names=["user_id", "user_index"])
df_course2index = pd.read_csv(PATH_MEALRECPLUS_META_DATA+"course2index.txt", sep="\t", names=["course_id", "course_index"])

  df_user_course = pd.read_csv(


In [7]:
df_user_course.head()

Unnamed: 0,user_id,course_id,rating,dateLastModified
1,39,61727,4.0,2001-03-26T07:36:16.653\n
2,39,7612,5.0,2004-08-02T16:48:37.107\n
3,39,12009,4.0,2001-02-28T10:37:25.59\n
4,39,88185,5.0,2009-05-18T14:55:11.517\n
5,39,24445,5.0,2015-05-07T15:42:20.32\n


# Modelo Base

In [8]:
import numpy as np
import pandas as pd
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k


class MealRecPlusRecommender:
    def __init__(self,
                 df_ratings: pd.DataFrame,
                 min_ratings_per_user: int = 5,
                 min_ratings_per_item: int = 5,
                 test_percentage: float = 0.25,
                 no_components: int = 30,
                 learning_rate: float = 0.05,
                 loss: str = 'warp',
                 random_state: int = 42,
                 num_threads: int = 4,
                 verbose: bool = False):
        self.verbose = verbose

        user_counts = df_ratings['user_id'].value_counts()
        item_counts = df_ratings['course_id'].value_counts()
        keep_users  = user_counts[user_counts >= min_ratings_per_user].index
        keep_items  = item_counts[item_counts >= min_ratings_per_item].index
        self.df = df_ratings[
            df_ratings['user_id'].isin(keep_users) &
            df_ratings['course_id'].isin(keep_items)
        ].copy()

        # Dataset
        self.dataset = Dataset()
        self.dataset.fit(
            users=self.df['user_id'].unique(),
            items=self.df['course_id'].unique()
        )

        # Interactions
        interaction_tuples = self.df[['user_id','course_id','rating']].to_numpy()
        self.interactions, _ = self.dataset.build_interactions(interaction_tuples)

        # Split
        self.train, self.test = random_train_test_split(
            self.interactions,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(random_state)
        )

        # Model
        self.model = LightFM(
            loss=loss,
            no_components=no_components,
            learning_rate=learning_rate,
            random_state=np.random.RandomState(random_state)
        )
        self.num_threads = num_threads

    def fit(self, epochs: int = 10):
        """Entrena el modelo WARP sin sample_weight."""
        self.model.fit(
            self.train,
            epochs=epochs,
            num_threads=self.num_threads,
            verbose=self.verbose
        )

    def evaluate(self, k: int = 10) -> dict:
        prec = precision_at_k(self.model, self.test,
                              train_interactions=self.train,
                              k=k, num_threads=self.num_threads).mean()
        rec = recall_at_k(self.model, self.test,
                          train_interactions=self.train,
                          k=k, num_threads=self.num_threads).mean()
        return {f'precision@{k}': prec, f'recall@{k}': rec}

    def recommend(self, user_id, num_items: int = 10) -> pd.DataFrame:
        n_users, n_items = self.dataset.interactions_shape()
        scores = self.model.predict(user_id,
                                    np.arange(n_items),
                                    num_threads=self.num_threads)
        _, _, idx_to_item = self.dataset.mapping()
        inv_map = {v: k for k, v in idx_to_item.items()}
        top_idx = np.argsort(-scores)[:num_items]
        return pd.DataFrame({
            'course_id': [inv_map[i] for i in top_idx],
            'score':     scores[top_idx]
        })

    def save(self, path: str):
        payload = {
            'model': self.model,
            'dataset': self.dataset,
            'train': self.train,
            'test': self.test,
            'num_threads': self.num_threads,
            'verbose': self.verbose,
        }
        with open(path, 'wb') as f:
            pickle.dump(payload, f)
        if self.verbose:
            print(f"[MealRecPlusRec] Guardado en '{path}'")

    @classmethod
    def load(cls, path: str, verbose: bool = False):
        with open(path, 'rb') as f:
            payload = pickle.load(f)
        rec = cls.__new__(cls)
        rec.model        = payload['model']
        rec.dataset      = payload['dataset']
        rec.train        = payload['train']
        rec.test         = payload['test']
        rec.num_threads  = payload['num_threads']
        rec.verbose      = verbose
        if verbose:
            print(f"[MealRecPlusRec] Cargado desde '{path}'")
        return rec


In [9]:
mrp = MealRecPlusRecommender(df_user_course,
                             min_ratings_per_user=10,
                             min_ratings_per_item=10,
                             test_percentage=0.2,
                             verbose=True)
mrp.fit(epochs=20)
print(mrp.evaluate(k=10))


Epoch: 100%|██████████| 20/20 [00:03<00:00,  6.44it/s]


{'precision@10': np.float32(0.09542857), 'recall@10': np.float64(0.07462292327246883)}


# Modelo con metadatos basado en salud

In [10]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

df_meta_course = pd.read_csv(
    PATH_MEALRECPLUS_META_DATA+"course.csv",
    names=["course_id","course_name"]
)

user_fsa = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + "user_fsa.txt",
                       header=None, names=["fsa_score"])
user_who = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + "user_who.txt",
                       header=None, names=["who_score"])
course_fsa = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + "course_fsa.txt",
                         header=None, names=["fsa_score"])
course_who = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + "course_who.txt",
                         header=None, names=["who_score"])

user_fsa["user_id"] = np.arange(len(user_fsa))
user_who["user_id"] = np.arange(len(user_who))
N = len(course_fsa)
df_scores = df_meta_course.iloc[:N][["course_id"]].copy()
df_scores["fsa_score"] = course_fsa["fsa_score"].values
df_scores["who_score"] = course_who["who_score"].values

df_meta_scored = df_meta_course.merge(
    df_scores, on="course_id", how="inner"
)
df_user_scores = pd.merge(user_fsa, user_who, on="user_id")

# bucketizacion
def bucket_fsa(x):
    if   x >= 7: return "u_fsa_high"
    elif x >= 4: return "u_fsa_mid"
    else:        return "u_fsa_low"

def bucket_who(x):
    if   x >= 7: return "u_who_high"
    elif x >= 4: return "u_who_mid"
    else:        return "u_who_low"

df_user_scores["u_fsa_bucket"] = df_user_scores["fsa_score"].apply(bucket_fsa)
df_user_scores["u_who_bucket"] = df_user_scores["who_score"].apply(bucket_who)

item_features_map = {
    row.course_id: [
        f"fsa:{bucket_fsa(row.fsa_score)}",
        f"who:{bucket_who(row.who_score)}"
    ]
    for row in df_meta_scored.itertuples()
}

user_feat_tuples = [
    (row.user_id, [row.u_fsa_bucket, row.u_who_bucket])
    for row in df_user_scores.itertuples()
]


all_users = df_user_course["user_id"].unique()
all_items = df_user_course["course_id"].unique()


all_user_feats = {feat for _, feats in user_feat_tuples for feat in feats}

dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    user_features=list(all_user_feats),

)


interactions, _ = dataset.build_interactions(
    df_user_course[["user_id","course_id","rating"]].to_numpy()
)
train, test = random_train_test_split(interactions,
                                      test_percentage=0.25,
                                      random_state=42)


valid_users = set(df_user_course["user_id"].unique())
user_feat_tuples = [
    (uid, feats)
    for uid, feats in user_feat_tuples
    if uid in valid_users
]

user_features = dataset.build_user_features(user_feat_tuples)

item_feat_tuples = (
    (item, item_features_map.get(item, []))
    for item in all_items
)
item_features = dataset.build_item_features(item_feat_tuples)

model = LightFM(loss="warp",
                no_components=50,
                learning_rate=0.02,
                item_alpha=1e-4,
                user_alpha=1e-4,
                random_state=42)

model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=40,
    num_threads=4,
    verbose=True
)

prec = precision_at_k(model, test,
                      train_interactions=train,
                      user_features=user_features,
                      item_features=item_features,
                      k=10, num_threads=4).mean()

rec = recall_at_k(model, test,
                  train_interactions=train,
                  user_features=user_features,
                  item_features=item_features,
                  k=10, num_threads=4).mean()

print(f"precision@10: {prec:.4f}, recall@10: {rec:.4f}")


Epoch: 100%|██████████| 40/40 [00:12<00:00,  3.20it/s]


precision@10: 0.1353, recall@10: 0.0744


## Mejoras al modelo base


In [14]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
from sklearn.model_selection import ParameterGrid

def create_enhanced_buckets(score, score_type='fsa'):
    if score_type == 'fsa':
        if score >= 13: return f"{score_type}_very_high"
        elif score >= 11: return f"{score_type}_high"
        elif score >= 7: return f"{score_type}_mid_high"
        elif score >= 4: return f"{score_type}_mid"
        elif score >= 1: return f"{score_type}_low"
        else: return f"{score_type}_very_low"
    else:  # WHO
        if score >= 4: return f"{score_type}_high"
        elif score >= 3: return f"{score_type}_mid_high"
        elif score >= 2: return f"{score_type}_mid"
        elif score >= 1: return f"{score_type}_low"
        else: return f"{score_type}_very_low"

df_user_scores["u_fsa_bucket"] = df_user_scores["fsa_score"].apply(
    lambda x: create_enhanced_buckets(x, 'fsa')
)
df_user_scores["u_who_bucket"] = df_user_scores["who_score"].apply(
    lambda x: create_enhanced_buckets(x, 'who')
)

df_user_scores["u_health_profile"] = (
    df_user_scores["u_fsa_bucket"] + "_" + df_user_scores["u_who_bucket"]
)


item_features_map = {}
for row in df_meta_scored.itertuples():
    fsa_bucket = create_enhanced_buckets(row.fsa_score, 'fsa')
    who_bucket = create_enhanced_buckets(row.who_score, 'who')

    item_features_map[row.course_id] = [
        fsa_bucket,
        who_bucket,
        f"{fsa_bucket}_{who_bucket}",
        f"fsa_score_{int(row.fsa_score)}",
        f"who_score_{int(row.who_score)}"
    ]

user_feat_tuples = []
for row in df_user_scores.itertuples():
    features = [
        row.u_fsa_bucket,
        row.u_who_bucket,
        row.u_health_profile,
        f"fsa_score_{int(row.fsa_score)}",
        f"who_score_{int(row.who_score)}"
    ]
    user_feat_tuples.append((row.user_id, features))

all_users = df_user_course["user_id"].unique()
all_items = df_user_course["course_id"].unique()


all_user_feats = {feat for _, feats in user_feat_tuples for feat in feats}
all_item_feats = {feat for feats in item_features_map.values() for feat in feats}

dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    user_features=list(all_user_feats),
    item_features=list(all_item_feats)
)

interactions, _ = dataset.build_interactions(
    df_user_course[["user_id","course_id","rating"]].to_numpy()
)
train, test = random_train_test_split(interactions,
                                      test_percentage=0.25,
                                      random_state=42)
# usuarios validos
valid_users = set(df_user_course["user_id"].unique())
user_feat_tuples = [
    (uid, feats)
    for uid, feats in user_feat_tuples
    if uid in valid_users
]

user_features = dataset.build_user_features(user_feat_tuples)

item_feat_tuples = [
    (item, item_features_map.get(item, []))
    for item in all_items
]
item_features = dataset.build_item_features(item_feat_tuples)

def optimize_hyperparameters(train, test, user_features, item_features):

    param_grid = {
        'no_components': [64, 100, 128],
        'learning_rate': [0.01, 0.05, 0.1],
        'item_alpha': [1e-6, 1e-5, 1e-4],
        'user_alpha': [1e-6, 1e-5, 1e-4],
        'loss': ['warp', 'bpr']
    }

    best_precision = 0
    best_params = None

    for params in ParameterGrid(param_grid):
        model = LightFM(
            loss=params['loss'],
            no_components=params['no_components'],
            learning_rate=params['learning_rate'],
            item_alpha=params['item_alpha'],
            user_alpha=params['user_alpha'],
            random_state=42
        )

        model.fit(
            train,
            user_features=user_features,
            item_features=item_features,
            epochs=30,
            num_threads=4,
            verbose=False
        )

        precision = precision_at_k(
            model, test,
            train_interactions=train,
            user_features=user_features,
            item_features=item_features,
            k=10, num_threads=4
        ).mean()

        if precision > best_precision:
            best_precision = precision
            best_params = params

        print(f"Params: {params}, Precision@10: {precision:.4f}")

    return best_params, best_precision


best_params, best_precision = optimize_hyperparameters(train, test, user_features, item_features)
print(f"Best parameters: {best_params}")
print(f"Best precision: {best_precision:.4f}")


final_model = LightFM(
    loss=best_params['loss'],
    no_components=best_params['no_components'],
    learning_rate=best_params['learning_rate'],
    item_alpha=best_params['item_alpha'],
    user_alpha=best_params['user_alpha'],
    random_state=42
)

final_model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=50,
    num_threads=4,
    verbose=True
)


final_precision = precision_at_k(
    final_model, test,
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10, num_threads=4
).mean()

final_recall = recall_at_k(
    final_model, test,
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10, num_threads=4
).mean()

print(f"Final precision@10: {final_precision:.4f}, recall@10: {final_recall:.4f}")

Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 64, 'user_alpha': 1e-06}, Precision@10: 0.1323
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 64, 'user_alpha': 1e-05}, Precision@10: 0.1329
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 64, 'user_alpha': 0.0001}, Precision@10: 0.1294
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 100, 'user_alpha': 1e-06}, Precision@10: 0.1324
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 100, 'user_alpha': 1e-05}, Precision@10: 0.1359
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 100, 'user_alpha': 0.0001}, Precision@10: 0.1320
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'warp', 'no_components': 128, 'user_alpha': 1e-06}, Precision@10: 0.1375
Params: {'item_alpha': 1e-06, 'learning_rate': 0.01, 'loss': 'w

Epoch: 100%|██████████| 50/50 [00:35<00:00,  1.42it/s]


Final precision@10: 0.1372, recall@10: 0.0749


## Mejor hiperparametros obtenidos

In [15]:
final_model = LightFM(
    loss='warp',
    no_components=128,
    learning_rate=0.01,
    item_alpha=1e-06,
    user_alpha=0.0001,
    random_state=42
)

final_model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=50,
    num_threads=4,
    verbose=True
)
final_precision = precision_at_k(
    final_model, test,
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10, num_threads=4
).mean()

final_recall = recall_at_k(
    final_model, test,
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10, num_threads=4
).mean()

print(f"Final precision@10: {final_precision:.4f}, recall@10: {final_recall:.4f}")

Epoch: 100%|██████████| 50/50 [00:33<00:00,  1.48it/s]


Final precision@10: 0.1388, recall@10: 0.0759


# Mejor bucketizacion

In [12]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler


df_course = pd.read_csv(
    PATH_MEALRECPLUS_META_DATA + 'course.csv',
    header=0,
    names=[
        'course_id', 'course_name', 'review_nums', 'category',
        'aver_rate', 'image_url', 'ingredients', 'cooking_directions',
        'nutritions', 'reviews', 'tags'
    ]
)
df_course[['review_nums', 'aver_rate']] = df_course[['review_nums', 'aver_rate']].apply(pd.to_numeric, errors='coerce')

# One-hot de categoría y sanitización de nombres
cat_df = pd.get_dummies(df_course['category'], prefix='cat')
cat_df.columns = [col.replace('-', '_').replace(' ', '_') for col in cat_df.columns]
df_course = pd.concat([df_course, cat_df], axis=1)

# 2. Parseo y normalización de nutrición
nutri_df = df_course['nutritions'].str.split(';', expand=True)
nutri_expanded = {}
for col in nutri_df:
    pair = nutri_df[col].str.split(':', expand=True)
    if pair.shape[1] == 2:
        key = pair[0].iloc[0]
        nutri_expanded[key] = pair[1].astype(float)
if nutri_expanded:
    df_nutri = pd.DataFrame(nutri_expanded)
    df_nutri[df_nutri.columns] = MinMaxScaler().fit_transform(df_nutri)
    df_course = pd.concat([df_course, df_nutri], axis=1)
else:
    df_nutri = pd.DataFrame()

# 3. Embeddings TF-IDF + SVD de ingredientes + tags
corpus = df_course['ingredients'].fillna('') + ' ' + df_course['tags'].fillna('')
tf_matrix = TfidfVectorizer(max_features=1000, stop_words='english').fit_transform(corpus)
svd = TruncatedSVD(n_components=50, random_state=42)
tf_emb = svd.fit_transform(tf_matrix)

# 4. Healthiness (FSA & WHO)
course_fsa = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + 'course_fsa.txt', header=None, names=['fsa_score'])
course_who = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + 'course_who.txt', header=None, names=['who_score'])
user_fsa   = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + 'user_fsa.txt',   header=None, names=['fsa_score'])
user_who   = pd.read_csv(PATH_MEALRECPLUS_HEALTHINESS + 'user_who.txt',   header=None, names=['who_score'])

# Función buckets
create_buckets = lambda score, t: (
    f"{t}_very_high" if t=='fsa' and score>=13 else
    f"{t}_high"       if t=='fsa' and score>=11 else
    f"{t}_mid_high"   if t=='fsa' and score>=7  else
    f"{t}_mid"        if t=='fsa' and score>=4  else
    f"{t}_low"        if t=='fsa' and score>=1  else
    f"{t}_very_low"   if t=='fsa' else
    f"{t}_high"       if t=='who' and score>=4  else
    f"{t}_mid_high"   if t=='who' and score>=3  else
    f"{t}_mid"        if t=='who' and score>=2  else
    f"{t}_low"        if t=='who' and score>=1  else
    f"{t}_very_low"
)

# Merge healthiness a df_course
df_course = df_course.reset_index().rename(columns={'index':'course_idx'})
df_course = df_course.merge(course_fsa, left_on='course_idx', right_index=True)
df_course = df_course.merge(course_who, left_on='course_idx', right_index=True)
df_course['fsa_bucket'] = df_course['fsa_score'].apply(lambda x: create_buckets(x, 'fsa'))
df_course['who_bucket'] = df_course['who_score'].apply(lambda x: create_buckets(x, 'who'))

# 5. Construcción item_features_map
item_features_map = {}
for row in df_course.itertuples(index=False):
    d = row._asdict()
    feats = [f"revnum_{int(d['review_nums'])}", f"avgrate_{round(d['aver_rate'])}"]
    feats += [c for c in cat_df.columns if d.get(c,0)==1]
    feats += [f"nutri_{nut}_{d.get(nut,0):.3f}" for nut in df_nutri.columns]
    feats += [d['fsa_bucket'], d['who_bucket']]
    feats += [f"ing_svd_{i}_{val:.3f}" for i,val in enumerate(tf_emb[d['course_idx']])]
    item_features_map[d['course_id']] = feats

# 6. Construcción user_features_map (filtrando solo usuarios con interacciones)
df_inter = pd.read_csv(PATH_MEALRECPLUS_META_DATA + 'user_course.csv')
valid_users = set(df_inter['user_id'])

df_us = pd.merge(user_fsa.assign(user_id=user_fsa.index), user_who.assign(user_id=user_who.index), on='user_id')
df_us['fsa_bucket'] = df_us['fsa_score'].apply(lambda x: create_buckets(x,'fsa'))
df_us['who_bucket'] = df_us['who_score'].apply(lambda x: create_buckets(x,'who'))
user_features_map = []
for row in df_us.itertuples(index=False):
    uid, fs, ws = row.user_id, row.fsa_bucket, row.who_bucket
    if uid in valid_users:
        user_features_map.append((uid, [fs, ws, f"fsa_score_{int(row.fsa_score)}", f"who_score_{int(row.who_score)}"]))

# 7. Dataset y entrenamiento
all_users = df_inter['user_id'].unique()
all_items = df_inter['course_id'].unique()

dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    user_features=[f for _,fs in user_features_map for f in fs],
    item_features=[f for fs in item_features_map.values() for f in fs]
)
interactions, _ = dataset.build_interactions(df_inter[['user_id','course_id','rating']].values)
train, test = random_train_test_split(interactions, test_percentage=0.25, random_state=42)

user_features = dataset.build_user_features(user_features_map)
item_features = dataset.build_item_features(list(item_features_map.items()))

model = LightFM(loss='warp', no_components=128, learning_rate=0.01, item_alpha=1e-6, user_alpha=1e-4, random_state=42)
model.fit(train, user_features=user_features, item_features=item_features, epochs=50, num_threads=4, verbose=True)

# 8. Evaluación
print(f"Precision@10: {precision_at_k(model, test, train_interactions=train, user_features=user_features, item_features=item_features, k=10).mean():.4f}")
print(f"Recall@10:    {recall_at_k(model, test, train_interactions=train, user_features=user_features, item_features=item_features, k=10).mean():.4f}")


Epoch: 100%|██████████| 50/50 [17:21<00:00, 20.83s/it]


Precision@10: 0.1403
Recall@10:    0.0760


In [13]:
def recommend_by_tag(preferred_tags, df_course=df_course, top_n=10):
    # Lista de sets de tags por curso
    course_tags = df_course['tags'].fillna('').str.lower().str.split(';')
    course_tags = course_tags.apply(lambda lst: set([t.strip() for t in lst if t]))
    user_set = set([t.lower() for t in preferred_tags])
    # Calcular similitud Jaccard
    jaccard = course_tags.apply(lambda tags: len(tags & user_set) / len(tags | user_set) if tags or user_set else 0)
    # Seleccionar top_n
    idx = jaccard.sort_values(ascending=False).head(top_n).index
    recs = df_course.iloc[idx][['course_id','course_name','tags']].copy()
    recs['jaccard_score'] = jaccard.iloc[idx].values
    return recs.reset_index(drop=True)

# 10. Recomendar híbrido: tag + modelo
def recommend_hybrid(preferred_tags, model, df_course=df_course,
                     dataset=dataset, top_n=10, alpha=0.3):
    # Similitud por tag
    course_tags = df_course['tags'].fillna('').str.lower().str.split(';')
    course_tags = course_tags.apply(lambda lst: set([t.strip() for t in lst if t]))
    user_set = set([t.lower() for t in preferred_tags])
    sim_tag = course_tags.apply(lambda tags: len(tags & user_set) / len(tags | user_set) if tags or user_set else 0).values

    # Mapeo id->idx interno para embeddings
    item_map = dataset._item_id_mapping
    # Similitud modelo
    user_emb = model.user_embeddings.mean(axis=0)
    sim_model = np.zeros(len(df_course))
    for i, cid in enumerate(df_course['course_id']):
        idx = item_map.get(cid)
        if idx is not None:
            sim_model[i] = np.dot(model.item_embeddings[idx], user_emb)
    # Normalizar sim_model
    if sim_model.max() > sim_model.min():
        sim_model = (sim_model - sim_model.min()) / (sim_model.max() - sim_model.min())

    # Filtrado: priorizar sólo cursos que contengan algún tag
    mask = sim_tag > 0
    # Hybrid score
    hybrid = alpha * sim_tag + (1 - alpha) * sim_model
    # Índices ordenados por hybrid en los que mask True
    tagged_idxs = np.where(mask)[0]
    tagged_scores = hybrid[tagged_idxs]
    # Ordenar tagged
    sorted_tagged = tagged_idxs[np.argsort(-tagged_scores)]
    # Si no llegan a top_n, complementar con mejores de sim_model
    if len(sorted_tagged) >= top_n:
        final_idxs = sorted_tagged[:top_n]
    else:
        # escoger top_n-len(sorted) de sim_model donde no mask
        remaining = np.setdiff1d(np.argsort(-sim_model), tagged_idxs)
        needed = top_n - len(sorted_tagged)
        final_idxs = np.concatenate([sorted_tagged, remaining[:needed]])

    recs = df_course.iloc[final_idxs][['course_id','course_name','tags']].copy()
    recs['hybrid_score'] = hybrid[final_idxs]
    recs['tag_score'] = sim_tag[final_idxs]
    recs['model_score'] = sim_model[final_idxs]
    return recs.reset_index(drop=True)

tags = ['low-fat']
recs = recommend_hybrid(tags, model, df_course, top_n=10, alpha=0.7)
print(recs)



   course_id                        course_name  \
0       6776                      Pizza Dough I   
1       9111                    Cranberry Sauce   
2      14469    Jamie's Cranberry Spinach Salad   
3      25209             Oat Applesauce Muffins   
4      89539  Slow-Cooker Chicken Tortilla Soup   
5     141678            Slow Cooker Pulled Pork   
6      24272              Buttery Soft Pretzels   
7      21126            Homemade Tomato Sauce I   
8      13933       Black Bean and Corn Salad II   
9      60492             Apple Butter Pork Loin   

                                                tags  hybrid_score  tag_score  \
0  15-minutes-or-less;healthy;5-ingredients-or-le...      0.332267   0.071429   
1  30-minutes-or-less;low-protein;healthy;condime...      0.322475   0.058824   
2  15-minutes-or-less;low-protein;healthy;salads;...      0.321743   0.076923   
3  60-minutes-or-less;healthy;low-fat;dietary;low...      0.294502   0.166667   
4  healthy;main-dish;soups-stews;