<a href="https://colab.research.google.com/github/ResistorCat/recsys-project/blob/feat%2Fnew_model/model/LightFM_Meal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Switch read by the download cell: when True, the MealRecPlus archive is
# downloaded and unpacked; set to False to reuse files that are already on disk.
FLAG_DATASET_DL = True

## 1. Carga de datos y librerías

In [None]:
# Upgrade pip
%pip install --upgrade pip
# Dependencies for plotting and data manipulation
%pip install pandas matplotlib tqdm seaborn ipywidgets
# Dependencies for reading Parquet files
%pip install pyarrow fastparquet
# Dependencies for prediction
%pip install scikit-learn lightfm recommenders

In [None]:
# Importar librerías globales

# Manejo de datos y visualización
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.auto import tqdm
import zipfile


# Create directories
# Root folder for datasets, relative to the current working directory.
PATH_DATASETS = "datasets"
# Target path for MealRecPlus; the download cell also uses it (with ".zip"
# appended) as the name of the temporary archive.
PATH_DATASETS_MEALRECPLUS = os.path.join(PATH_DATASETS, "mealrecplus")
# Only the root folder is created here.
os.makedirs(PATH_DATASETS, exist_ok=True)

In [None]:
# Derive the MealRec+H folders from PATH_DATASETS_MEALRECPLUS (the folder the
# download cell unpacks the archive into) instead of hard-coding Colab's
# "/content" prefix, so the notebook also works outside Colab.
_PATH_MEALRECPLUS_H = os.path.join(
    PATH_DATASETS_MEALRECPLUS, "MealRecPlus-main", "MealRec+", "MealRec+H"
)
# The final "" component keeps a trailing separator: other cells build file
# paths by plain string concatenation (e.g. PATH_MEALRECPLUS_META_DATA + "course.csv").
PATH_MEALRECPLUS_META_DATA = os.path.join(_PATH_MEALRECPLUS_H, "meta_data", "")
PATH_MEALRECPLUS_HEALTHINESS = os.path.join(_PATH_MEALRECPLUS_H, "healthiness", "")

In [None]:
if FLAG_DATASET_DL:
    # Download "MealRecPlus" from WUT-IDEA.
    # Citation: Ming Li, Lin Li, Xiaohui Tao, and Jimmy Xiangji Huang. 2024.
    # MealRec+: A Meal Recommendation Dataset with Meal-Course Affiliation for
    # Personalization and Healthiness. In Proceedings of the 47th International
    # ACM SIGIR Conference on Research and Development in Information Retrieval
    # (SIGIR '24), July 14-18, 2024, Washington, DC, USA. ACM, New York, NY, USA,
    # 11 pages. https://doi.org/10.1145/3626772.3657857
    # (https://github.com/WUT-IDEA/MealRecPlus)
    mealrecplus_zip = f"{PATH_DATASETS_MEALRECPLUS}.zip"

    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as if it were the archive.
    exit_status = os.system(
        f"curl -L --fail -o {mealrecplus_zip} "
        "https://github.com/WUT-IDEA/MealRecPlus/archive/refs/heads/main.zip"
    )
    if exit_status != 0:
        # Stop here: continuing would only fail later with a less obvious error.
        raise RuntimeError(
            f"MealRecPlus download failed (curl exit status {exit_status})"
        )

    # Unpack with zipfile (already imported) rather than shelling out to
    # `unzip`; existing files are overwritten, like `unzip -o` did.
    with zipfile.ZipFile(mealrecplus_zip, "r") as zip_ref:
        zip_ref.extractall(PATH_DATASETS_MEALRECPLUS)

    # Remove the downloaded zip
    os.remove(mealrecplus_zip)

    # The repository ships course.csv and user_course.csv as nested zips
    # inside the meta_data folder; unpack both next to the other metadata.
    zip_path_course = PATH_MEALRECPLUS_META_DATA + "course.zip"
    zip_path_user_course = PATH_MEALRECPLUS_META_DATA + "user_course.zip"
    extract_path_course = PATH_MEALRECPLUS_META_DATA

    os.makedirs(extract_path_course, exist_ok=True)

    for nested_zip in (zip_path_course, zip_path_user_course):
        with zipfile.ZipFile(nested_zip, "r") as zip_ref:
            zip_ref.extractall(extract_path_course)

    print("✅ Archivos descomprimidos en:", extract_path_course)


## 2. Preprocesamiento de datos

In [None]:
# Load the user-course interactions (ratings). The file is read with
# header=None, so the column names are supplied explicitly.
df_user_course = pd.read_csv(
    PATH_MEALRECPLUS_META_DATA + "user_course.csv",
    header=None,
    names=["user_id", "course_id", "rating", "dateLastModified"],
)

# Coerce 'rating' to a numeric dtype (unparseable values become NaN) and
# discard the rows that could not be parsed.
df_user_course["rating"] = pd.to_numeric(df_user_course["rating"], errors="coerce")
df_user_course = df_user_course.dropna(subset=["rating"])

# Course metadata, read with pandas' default header handling.
df_course = pd.read_csv(PATH_MEALRECPLUS_META_DATA + "course.csv")

# Tab-separated id -> index lookup tables for users and courses.
df_user2index, df_course2index = (
    pd.read_csv(
        PATH_MEALRECPLUS_META_DATA + f"{entity}2index.txt",
        sep="\t",
        names=[f"{entity}_id", f"{entity}_index"],
    )
    for entity in ("user", "course")
)

In [None]:
# Preview the first rows of the ratings table loaded above.
df_user_course.head()

# Modelo Base

In [None]:
import numpy as np
import pandas as pd
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k


class MealRecPlusRecommender:
    """LightFM collaborative-filtering recommender over user-course ratings.

    The constructor filters the ratings, builds the LightFM ``Dataset`` and
    interaction matrix, splits it into train/test parts and creates (but does
    not train) the model. Call :meth:`fit` before :meth:`evaluate` or
    :meth:`recommend`.
    """

    def __init__(self,
                 df_ratings: pd.DataFrame,
                 min_ratings_per_user: int = 5,
                 min_ratings_per_item: int = 5,
                 test_percentage: float = 0.25,
                 no_components: int = 30,
                 learning_rate: float = 0.05,
                 loss: str = 'warp',
                 random_state: int = 42,
                 num_threads: int = 4,
                 verbose: bool = False):
        """Prepare data structures and an untrained model.

        Args:
            df_ratings: Frame with at least the columns ``user_id``,
                ``course_id`` and ``rating``.
            min_ratings_per_user: Users with fewer rows in ``df_ratings`` are dropped.
            min_ratings_per_item: Courses with fewer rows in ``df_ratings`` are dropped.
            test_percentage: Fraction of interactions held out for testing.
            no_components: Passed to ``LightFM`` (``no_components``).
            learning_rate: Passed to ``LightFM`` (``learning_rate``).
            loss: Passed to ``LightFM`` (``loss``).
            random_state: Seed used for both the split and the model.
            num_threads: Thread count used for fit / evaluate / predict.
            verbose: Forwarded to ``LightFM.fit`` and used for save/load messages.
        """
        self.verbose = verbose

        # Filtering. Counts are taken once on the unfiltered frame, so this is
        # a single pass: after both masks are applied a kept user or course may
        # end up with fewer rows than its threshold.
        user_counts = df_ratings['user_id'].value_counts()
        item_counts = df_ratings['course_id'].value_counts()
        keep_users  = user_counts[user_counts >= min_ratings_per_user].index
        keep_items  = item_counts[item_counts >= min_ratings_per_item].index
        self.df = df_ratings[
            df_ratings['user_id'].isin(keep_users) &
            df_ratings['course_id'].isin(keep_items)
        ].copy()

        # Dataset: registers the raw ids and assigns internal indices.
        self.dataset = Dataset()
        self.dataset.fit(
            users=self.df['user_id'].unique(),
            items=self.df['course_id'].unique()
        )

        # Interactions: the second return value (the weights matrix built from
        # 'rating') is discarded.
        interaction_tuples = self.df[['user_id','course_id','rating']].to_numpy()
        self.interactions, _ = self.dataset.build_interactions(interaction_tuples)

        # Split
        self.train, self.test = random_train_test_split(
            self.interactions,
            test_percentage=test_percentage,
            random_state=np.random.RandomState(random_state)
        )

        # Model
        self.model = LightFM(
            loss=loss,
            no_components=no_components,
            learning_rate=learning_rate,
            random_state=np.random.RandomState(random_state)
        )
        self.num_threads = num_threads

    def fit(self, epochs: int = 10):
        """Train the model on the train split, without ``sample_weight``.

        Args:
            epochs: Number of training epochs.
        """
        self.model.fit(
            self.train,
            epochs=epochs,
            num_threads=self.num_threads,
            verbose=self.verbose
        )

    def evaluate(self, k: int = 10) -> dict:
        """Compute mean precision@k and recall@k on the test split.

        The train split is passed as ``train_interactions`` to both metrics.

        Args:
            k: Cut-off of the ranked list.

        Returns:
            Dict with the keys ``precision@{k}`` and ``recall@{k}``.
        """
        prec = precision_at_k(self.model, self.test,
                              train_interactions=self.train,
                              k=k, num_threads=self.num_threads).mean()
        rec = recall_at_k(self.model, self.test,
                          train_interactions=self.train,
                          k=k, num_threads=self.num_threads).mean()
        return {f'precision@{k}': prec, f'recall@{k}': rec}

    def recommend(self, user_id, num_items: int = 10) -> pd.DataFrame:
        """Return the ``num_items`` highest-scoring courses for a user.

        Every known course is scored; courses the user already interacted
        with are not filtered out.

        Args:
            user_id: Raw user id, as found in the ``user_id`` column of the
                ratings frame the recommender was built from.
            num_items: Maximum number of rows to return.

        Returns:
            DataFrame with columns ``course_id`` (raw ids) and ``score``,
            sorted by descending score.

        Raises:
            KeyError: If ``user_id`` is not known to the dataset.
        """
        # Dataset.mapping() returns FOUR dicts:
        # (user id map, user feature map, item id map, item feature map).
        user_map, _, item_map, _ = self.dataset.mapping()
        if user_id not in user_map:
            raise KeyError(f"Unknown user_id: {user_id!r}")

        # LightFM.predict works on internal indices, not raw ids. A plain
        # Python int is passed so predict broadcasts it over all items.
        user_idx = int(user_map[user_id])
        item_indices = np.arange(len(item_map), dtype=np.int32)
        scores = self.model.predict(user_idx,
                                    item_indices,
                                    num_threads=self.num_threads)

        # Invert raw id -> index so ranked indices map back to raw course ids.
        idx_to_item = {idx: item for item, idx in item_map.items()}
        top_idx = np.argsort(-scores)[:num_items]
        return pd.DataFrame({
            'course_id': [idx_to_item[int(i)] for i in top_idx],
            'score':     scores[top_idx]
        })

    def save(self, path: str):
        """Pickle the model, dataset, splits and settings to ``path``.

        The filtered ratings frame (``self.df``) and the full interaction
        matrix are not part of the payload.
        """
        payload = {
            'model': self.model,
            'dataset': self.dataset,
            'train': self.train,
            'test': self.test,
            'num_threads': self.num_threads,
            'verbose': self.verbose,
        }
        with open(path, 'wb') as f:
            pickle.dump(payload, f)
        if self.verbose:
            print(f"[MealRecPlusRec] Guardado en '{path}'")

    @classmethod
    def load(cls, path: str, verbose: bool = False):
        """Rebuild a recommender from a file written by :meth:`save`.

        ``__init__`` is bypassed, so the returned instance has no ``df`` or
        ``interactions`` attribute. The stored ``verbose`` value is ignored in
        favour of the ``verbose`` argument.

        Args:
            path: File produced by :meth:`save`.
            verbose: Verbosity flag for the restored instance.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(path, 'rb') as f:
            payload = pickle.load(f)
        rec = cls.__new__(cls)
        rec.model        = payload['model']
        rec.dataset      = payload['dataset']
        rec.train        = payload['train']
        rec.test         = payload['test']
        rec.num_threads  = payload['num_threads']
        rec.verbose      = verbose
        if verbose:
            print(f"[MealRecPlusRec] Cargado desde '{path}'")
        return rec


In [None]:
mrp = MealRecPlusRecommender(df_user_course,
                             min_ratings_per_user=10,
                             min_ratings_per_item=10,
                             test_percentage=0.2,
                             verbose=True)
mrp.fit(epochs=20)
print(mrp.evaluate(k=10))


# Modelo con metadatos basado en salud

In [15]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

# Course catalogue, read with explicit column names.
# NOTE(review): the earlier load of course.csv relies on pandas' default
# header handling while this one passes names= — confirm which matches the file.
df_meta_course = pd.read_csv(
    PATH_MEALRECPLUS_META_DATA + "course.csv",
    names=["course_id", "course_name"],
)


def _read_score_file(file_name, column):
    """Read a single-column, header-less healthiness file into a DataFrame."""
    return pd.read_csv(
        PATH_MEALRECPLUS_HEALTHINESS + file_name,
        header=None,
        names=[column],
    )


user_fsa = _read_score_file("user_fsa.txt", "fsa_score")
user_who = _read_score_file("user_who.txt", "who_score")
course_fsa = _read_score_file("course_fsa.txt", "fsa_score")
course_who = _read_score_file("course_who.txt", "who_score")

# Users are identified by their row position in the score files.
# NOTE(review): presumably this position matches the user_id values used in
# the ratings table — confirm against user2index.txt.
user_fsa["user_id"] = np.arange(len(user_fsa))
user_who["user_id"] = np.arange(len(user_who))

# Pair the first N catalogue rows with the N course scores, by position.
N = len(course_fsa)
df_scores = df_meta_course.iloc[:N][["course_id"]].assign(
    fsa_score=course_fsa["fsa_score"].values,
    who_score=course_who["who_score"].values,
)

# Courses with both scores attached / users with both scores attached.
df_meta_scored = df_meta_course.merge(df_scores, on="course_id", how="inner")
df_user_scores = user_fsa.merge(user_who, on="user_id")

# ————————————————————————————————————————————————————————————————
# 2) Bucketización de los promedios de usuario
# ————————————————————————————————————————————————————————————————
def bucket_fsa(x):
    """Map an FSA score to a coarse label.

    Returns "u_fsa_high" for x >= 7, "u_fsa_mid" for x >= 4 and "u_fsa_low"
    otherwise (including values such as NaN that satisfy neither comparison).
    """
    # Thresholds are checked from the highest down; the first match wins.
    for lower_bound, label in ((7, "u_fsa_high"), (4, "u_fsa_mid")):
        if x >= lower_bound:
            return label
    return "u_fsa_low"

def bucket_who(x):
    """Map a WHO score to a coarse label.

    Returns "u_who_high" for x >= 7, "u_who_mid" for x >= 4 and "u_who_low"
    otherwise (including values such as NaN that satisfy neither comparison).
    """
    # Thresholds are checked from the highest down; the first match wins.
    for lower_bound, label in ((7, "u_who_high"), (4, "u_who_mid")):
        if x >= lower_bound:
            return label
    return "u_who_low"

# Attach the bucket labels to every user row.
df_user_scores["u_fsa_bucket"] = df_user_scores["fsa_score"].apply(bucket_fsa)
df_user_scores["u_who_bucket"] = df_user_scores["who_score"].apply(bucket_who)

# course_id -> list of item feature labels derived from the course scores.
item_features_map = {
    row.course_id: [
        f"fsa:{bucket_fsa(row.fsa_score)}",
        f"who:{bucket_who(row.who_score)}"
    ]
    for row in df_meta_scored.itertuples()
}

# (user_id, [feature labels]) pairs, one per user row.
user_feat_tuples = [
    (row.user_id, [row.u_fsa_bucket, row.u_who_bucket])
    for row in df_user_scores.itertuples()
]


all_users = df_user_course["user_id"].unique()
all_items = df_user_course["course_id"].unique()

# Collect the unique user and item feature labels.
all_user_feats = {feat for _, feats in user_feat_tuples for feat in feats}
all_item_feats = {feat for feats in item_features_map.values() for feat in feats}

# Item features must be registered here too: build_item_features raises
# "not in feature mapping" for any label that fit() has not seen.
# sorted() keeps the label -> index assignment identical across kernel
# restarts (set iteration order of strings is not stable between processes).
dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    user_features=sorted(all_user_feats),
    item_features=sorted(all_item_feats),
)


interactions, _ = dataset.build_interactions(
    df_user_course[["user_id","course_id","rating"]].to_numpy()
)
train, test = random_train_test_split(interactions,
                                      test_percentage=0.25,
                                      random_state=42)

# Keep only feature tuples whose user id appears in the ratings table;
# build_user_features rejects ids that were not passed to fit().
valid_users = set(df_user_course["user_id"].unique())
user_feat_tuples = [
    (uid, feats)
    for uid, feats in user_feat_tuples
    if uid in valid_users
]

user_features = dataset.build_user_features(user_feat_tuples)

# Courses without score metadata get an empty feature list.
item_feat_tuples = [
    (item, item_features_map.get(item, []))
    for item in all_items
]
item_features = dataset.build_item_features(item_feat_tuples)

model = LightFM(loss="warp",
                no_components=50,
                learning_rate=0.02,
                item_alpha=1e-4,
                user_alpha=1e-4,
                random_state=42)

model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=40,
    num_threads=4,
    verbose=True
)

prec = precision_at_k(model, test,
                      train_interactions=train,
                      user_features=user_features,
                      item_features=item_features,
                      k=10, num_threads=4).mean()

rec = recall_at_k(model, test,
                  train_interactions=train,
                  user_features=user_features,
                  item_features=item_features,
                  k=10, num_threads=4).mean()

print(f"precision@10: {prec:.4f}, recall@10: {rec:.4f}")


Epoch: 100%|██████████| 40/40 [00:12<00:00,  3.19it/s]


precision@10: 0.1351, recall@10: 0.0731


In [12]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install torch_geometric



## Mejoras al modelo base


In [16]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
from sklearn.model_selection import ParameterGrid

# Enhanced feature engineering
def create_enhanced_buckets(score, score_type='fsa'):
    """Return a granular bucket label of the form ``"<score_type>_<level>"``.

    For ``score_type == 'fsa'`` the lower bounds are 13 / 11 / 7 / 4 / 1
    (very_high, high, mid_high, mid, low). Any other ``score_type`` uses the
    WHO bounds 4 / 3 / 2 / 1 (high, mid_high, mid, low). Scores below every
    bound, or not comparable as >= (e.g. NaN), map to ``very_low``.
    """
    if score_type == 'fsa':
        cutoffs = (
            (13, "very_high"),
            (11, "high"),
            (7, "mid_high"),
            (4, "mid"),
            (1, "low"),
        )
    else:  # WHO
        cutoffs = (
            (4, "high"),
            (3, "mid_high"),
            (2, "mid"),
            (1, "low"),
        )

    # Bounds are listed from highest to lowest; the first one reached wins.
    level = next(
        (name for lower_bound, name in cutoffs if score >= lower_bound),
        "very_low",
    )
    return f"{score_type}_{level}"

# Apply enhanced bucketing (overwrites the coarse bucket columns).
df_user_scores["u_fsa_bucket"] = df_user_scores["fsa_score"].apply(
    lambda x: create_enhanced_buckets(x, 'fsa')
)
df_user_scores["u_who_bucket"] = df_user_scores["who_score"].apply(
    lambda x: create_enhanced_buckets(x, 'who')
)

# Combined FSA+WHO label so the model can learn the pair as one feature.
df_user_scores["u_health_profile"] = (
    df_user_scores["u_fsa_bucket"] + "_" + df_user_scores["u_who_bucket"]
)

# Enhanced item features: course_id -> list of labels.
item_features_map = {}
for row in df_meta_scored.itertuples():
    fsa_bucket = create_enhanced_buckets(row.fsa_score, 'fsa')
    who_bucket = create_enhanced_buckets(row.who_score, 'who')

    item_features_map[row.course_id] = [
        fsa_bucket,
        who_bucket,
        f"{fsa_bucket}_{who_bucket}",  # Combined feature
        f"fsa_score_{int(row.fsa_score)}",  # Score truncated to int as feature
        f"who_score_{int(row.who_score)}"   # Score truncated to int as feature
    ]

# Enhanced user features: (user_id, [labels]) pairs.
user_feat_tuples = []
for row in df_user_scores.itertuples():
    features = [
        row.u_fsa_bucket,
        row.u_who_bucket,
        row.u_health_profile,
        f"fsa_score_{int(row.fsa_score)}",
        f"who_score_{int(row.who_score)}"
    ]
    user_feat_tuples.append((row.user_id, features))

all_users = df_user_course["user_id"].unique()
all_items = df_user_course["course_id"].unique()

# Extract all unique features
all_user_feats = {feat for _, feats in user_feat_tuples for feat in feats}
all_item_feats = {feat for feats in item_features_map.values() for feat in feats}

# sorted() instead of list(): iteration order of a set of strings changes
# between kernel restarts, which would change the label -> index assignment
# and make the seeded models below non-reproducible.
dataset = Dataset()
dataset.fit(
    users=all_users,
    items=all_items,
    user_features=sorted(all_user_feats),
    item_features=sorted(all_item_feats)
)

interactions, _ = dataset.build_interactions(
    df_user_course[["user_id","course_id","rating"]].to_numpy()
)
train, test = random_train_test_split(interactions,
                                      test_percentage=0.25,
                                      random_state=42)

# Keep only feature tuples whose user id appears in the ratings table.
valid_users = set(df_user_course["user_id"].unique())
user_feat_tuples = [
    (uid, feats)
    for uid, feats in user_feat_tuples
    if uid in valid_users
]

user_features = dataset.build_user_features(user_feat_tuples)

# Courses without score metadata get an empty feature list.
item_feat_tuples = [
    (item, item_features_map.get(item, []))
    for item in all_items
]
item_features = dataset.build_item_features(item_feat_tuples)

def optimize_hyperparameters(train, test, user_features, item_features,
                             param_grid=None, epochs=30, k=10,
                             num_threads=4, random_state=42):
    """Grid search over LightFM hyperparameters, ranked by mean precision@k.

    Each combination is trained on ``train`` and scored on ``test``. Because
    ``test`` drives the selection, the returned precision is optimistic for
    that same split; pass a separate validation split as ``test`` for an
    unbiased estimate.

    Args:
        train: Training interaction matrix.
        test: Interaction matrix used to score each configuration.
        user_features: User feature matrix passed to fit and evaluation.
        item_features: Item feature matrix passed to fit and evaluation.
        param_grid: Mapping of LightFM keyword -> list of values. Defaults to
            the built-in grid (162 combinations).
        epochs: Training epochs per configuration.
        k: Cut-off for precision@k.
        num_threads: Threads used for training and evaluation.
        random_state: Seed passed to every ``LightFM`` instance.

    Returns:
        ``(best_params, best_precision)``. ``best_params`` is ``None`` only
        when the grid yields no combinations.
    """
    if param_grid is None:
        param_grid = {
            'no_components': [64, 100, 128],
            'learning_rate': [0.01, 0.05, 0.1],
            'item_alpha': [1e-6, 1e-5, 1e-4],
            'user_alpha': [1e-6, 1e-5, 1e-4],
            'loss': ['warp', 'bpr']
        }

    best_precision = 0
    best_params = None

    for params in ParameterGrid(param_grid):
        model = LightFM(random_state=random_state, **params)

        model.fit(
            train,
            user_features=user_features,
            item_features=item_features,
            epochs=epochs,
            num_threads=num_threads,
            verbose=False
        )

        precision = precision_at_k(
            model, test,
            train_interactions=train,
            user_features=user_features,
            item_features=item_features,
            k=k, num_threads=num_threads
        ).mean()

        # Always accept the first configuration, so best_params is set even
        # when every configuration scores a precision of 0.
        if best_params is None or precision > best_precision:
            best_precision = precision
            best_params = params

        print(f"Params: {params}, Precision@{k}: {precision:.4f}")

    return best_params, best_precision

# Run hyperparameter optimization
best_params, best_precision = optimize_hyperparameters(
    train, test, user_features, item_features
)
print(f"Best parameters: {best_params}")
print(f"Best precision: {best_precision:.4f}")

# Train the final model with the winning configuration.
final_model = LightFM(
    random_state=42,
    **{
        name: best_params[name]
        for name in ("loss", "no_components", "learning_rate",
                     "item_alpha", "user_alpha")
    },
)
final_model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=50,  # More epochs for the final model than during the search
    num_threads=4,
    verbose=True,
)

# Evaluate the final model; both metrics share the same arguments.
_final_eval_kwargs = dict(
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10,
    num_threads=4,
)
final_precision = precision_at_k(final_model, test, **_final_eval_kwargs).mean()
final_recall = recall_at_k(final_model, test, **_final_eval_kwargs).mean()

print(f"Final precision@10: {final_precision:.4f}, recall@10: {final_recall:.4f}")

NameError: name 'best_params' is not defined

## Mejores hiperparámetros obtenidos

In [18]:
# Hyperparameters hard-coded in this cell, so the grid search need not be re-run.
_best_hyperparameters = {
    "loss": 'warp',
    "no_components": 128,
    "learning_rate": 0.01,
    "item_alpha": 1e-06,
    "user_alpha": 0.0001,
}
final_model = LightFM(random_state=42, **_best_hyperparameters)

final_model.fit(
    train,
    user_features=user_features,
    item_features=item_features,
    epochs=50,
    num_threads=4,
    verbose=True,
)

# Both metrics are computed with the same arguments.
_final_eval_kwargs = dict(
    train_interactions=train,
    user_features=user_features,
    item_features=item_features,
    k=10,
    num_threads=4,
)
final_precision = precision_at_k(final_model, test, **_final_eval_kwargs).mean()
final_recall = recall_at_k(final_model, test, **_final_eval_kwargs).mean()

print(f"Final precision@10: {final_precision:.4f}, recall@10: {final_recall:.4f}")

Epoch: 100%|██████████| 50/50 [00:41<00:00,  1.20it/s]


Final precision@10: 0.1372, recall@10: 0.0750
