### Modelo de Similitud Semántica + Popularidad + Similitud en Features Numéricas

In [18]:
# Import libraries
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import os
from scipy import spatial
import pickle
from tqdm import tqdm
import json
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_openai import ChatOpenAI
from openai import OpenAI
from typing import List, Optional
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the recipe dataset from its pickle file and show its (rows, columns) shape
food_pickle_path = "/Users/juanmontreuil/Desktop/AI Recipes/food.pkl"
data = pd.read_pickle(food_pickle_path)
data.shape

(53064, 36)

In [4]:
# Summarize column names, non-null counts and dtypes of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53064 entries, 157706 to 141979
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   name               53064 non-null  object        
 1   id                 53064 non-null  object        
 2   minutes            53064 non-null  int64         
 3   contributor_id     53064 non-null  object        
 4   submitted          53064 non-null  datetime64[ns]
 5   tags               53064 non-null  object        
 6   nutrition          53064 non-null  object        
 7   n_steps            53064 non-null  int64         
 8   steps              53064 non-null  object        
 9   description        53064 non-null  object        
 10  ingredients        53064 non-null  object        
 11  n_ingredients      53064 non-null  int64         
 12  calories           53064 non-null  float64       
 13  total_fat          53064 non-null  float64       
 14  sugar

Preparación de Embeddings

In [9]:
# Build the text that gets embedded: recipe name, tags and description joined by
# single spaces. Missing values become empty strings before concatenation.
name_text, tags_text, description_text = (
    data[column].fillna('') for column in ("name", "tags", "description")
)
data["label"] = name_text + " " + tags_text + " " + description_text

In [19]:
# TF-IDF text embedding: fit the vocabulary on the recipe labels (English stop
# words removed) and encode every label as one row of the resulting matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['label'])

In [None]:
# Persist the TF-IDF matrix so it can be reloaded without refitting
with open("/Users/juanmontreuil/Desktop/AI Recipes/text_emb.pkl", "wb") as emb_file:
    pickle.dump(tfidf_matrix, emb_file)

In [26]:
# Persist the fitted TF-IDF vectorizer; it is needed later to embed user queries
with open("/Users/juanmontreuil/Desktop/AI Recipes/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [22]:
# Columns fed to the scaler for the numeric-feature component of the score.
# Order matters: it fixes the column order of the scaled feature matrix.
feature_cols = [
    # preparation effort
    'minutes',
    'n_steps',
    # nutrition values
    'calories',
    'protein',
    'total_fat',
    'sodium',
    'saturated_fat',
    'carbohydrates',
    # tag-style columns (presumably 0/1 flags — confirm against the dataset)
    'dairy-free',
    'gluten-free',
    'healthy',
    'fast-cooking',
    'vegetarian',
    'vegan',
    'large-portion',
    'low-saturated-fat',
]

In [23]:
# Scale numerical features: MinMaxScaler rescales every column in feature_cols
# (to the [0, 1] range with its default settings) so that columns with large
# magnitudes do not dominate the others.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data[feature_cols])

Get Input

In [24]:
# Dataset: re-read the same pickle that was loaded at the top of the notebook.
# NOTE(review): this replaces the frame that had the "label" column added, and
# `scaled_features` was computed from the earlier frame, so the recommender
# relies on both loads yielding rows in the same order.
data = pd.read_pickle("/Users/juanmontreuil/Desktop/AI Recipes/food.pkl")
data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,low-cholesterol,low-saturated-fat,fast-cooking,vegan,vegetarian,large-portion,low-fat,recipe_id,average_rating,votes
157706,red pepper asiago gougere cheese bites,468748,95,166642,2011-11-22,"['time-to-make', 'course', 'main-ingredient', ...","[22.0, 2.0, 0.0, 0.0, 1.0, 4.0, 0.0]",14,"['heat oven to 400 degrees f', 'line baking sh...",these are cheesy and savory puff pastry bites....,...,0,0,0,0,0,0,0,468748,5.0,1
73274,elena s chicken francese,91969,20,9590,2004-05-27,"['30-minutes-or-less', 'time-to-make', 'course...","[383.6, 29.0, 2.0, 26.0, 75.0, 33.0, 4.0]",9,"['place flour in a shallow bowl', 'roll chicke...","yummy, and quick weeknight meal. one of our fa...",...,0,0,0,0,0,0,0,91969,4.0,1
206616,western skillet,150908,45,121795,2006-01-08,"['60-minutes-or-less', 'time-to-make', 'course...","[693.4, 42.0, 34.0, 51.0, 73.0, 64.0, 24.0]",7,"['in a large skillet , cook beef over medium h...",qc j/a 2005,...,0,0,0,0,0,0,0,150908,4.0,2
29931,butterscotch toffee cookies,288112,25,668077,2008-02-23,"['30-minutes-or-less', 'time-to-make', 'course...","[156.9, 12.0, 53.0, 3.0, 2.0, 16.0, 6.0]",8,['mix shortening and both sugars with electric...,"in preparation for a bake sale, i changed a re...",...,0,0,0,0,0,0,0,288112,4.0,1
9300,au gratin hash browns casserole,94740,70,126418,2004-06-30,"['time-to-make', 'course', 'preparation', 'cas...","[441.1, 49.0, 4.0, 32.0, 26.0, 70.0, 8.0]",6,"['oven@ 350', 'in a large bowl combine first 5...","i think i found this recipe in my ""goody bag"" ...",...,0,0,0,0,0,0,0,94740,5.0,1


In [27]:
# Reload the persisted artifacts: the TF-IDF matrix and the fitted vectorizer.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open("/Users/juanmontreuil/Desktop/AI Recipes/text_emb.pkl", "rb") as emb_file:
    text_emb = pickle.load(emb_file)

with open("/Users/juanmontreuil/Desktop/AI Recipes/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

Función output

In [28]:
def recommend_for_new_user(query, n=3, alpha=0.6, beta=0.3, return_scores=False):
    """Rank recipes for a free-text query using a hybrid score.

    The score of every recipe is
    ``alpha * text_similarity + beta * feature_score + (1 - alpha - beta) * popularity``.

    Parameters
    ----------
    query : str
        Query text; it is vectorized with the module-level ``tfidf`` vectorizer.
    n : int, default 3
        Number of recipes to return.
    alpha : float, default 0.6
        Weight of the TF-IDF cosine similarity between the query and each recipe.
    beta : float, default 0.3
        Weight of the numeric-feature score.
    return_scores : bool, default False
        Accepted for interface compatibility; it currently has no effect.

    Returns
    -------
    pandas.DataFrame
        The ``n`` best-scoring recipes, best first, with a fresh 0..n-1 index and
        the columns ``nombre``, ``descripción``, ``ingredientes``,
        ``instrucciones`` and ``calificación_promedio`` (rounded to 2 decimals).

    Notes
    -----
    Reads the module-level ``tfidf``, ``text_emb``, ``scaled_features`` and
    ``data``; their rows must be in the same order because the result is
    selected positionally. ``data`` is not modified: rounding is applied to the
    returned copy only, so repeated calls rank on the same ratings.
    """
    # Embed the query
    query_emb = tfidf.transform([query])

    # Semantic similarity between the query and every recipe
    sim_text = cosine_similarity(query_emb, text_emb).flatten()

    # Numeric feature score.
    # NOTE(review): this is the row-wise mean of the scaled features, so it does
    # not depend on the query — confirm that this is the intended signal.
    sim_features = scaled_features.mean(axis=1)

    # Popularity: average rating (missing values -> mean rating), scaled by the maximum
    ratings = data["average_rating"]
    pop_score = ratings.fillna(ratings.mean()).to_numpy()
    max_rating = pop_score.max()
    if max_rating > 0:  # avoid 0/0 when every rating is zero
        pop_score = pop_score / max_rating

    # Final score and positions of the n best recipes (highest score first)
    hybrid_score = alpha * sim_text + beta * sim_features + (1 - alpha - beta) * pop_score
    top_idx = np.argsort(hybrid_score)[::-1][:n]

    # Select the output columns in their final order; work on a copy so that
    # rounding the rating never touches the shared `data` frame
    output_columns = ["name", "description", "ingredients", "steps", "average_rating"]
    result = data.iloc[top_idx][output_columns].copy()
    result["average_rating"] = result["average_rating"].round(2)

    result = result.rename(columns={
        'name': 'nombre',
        'description': 'descripción',
        'ingredients': 'ingredientes',
        'average_rating': 'calificación_promedio',
        'steps': 'instrucciones',
    })

    return result.reset_index(drop=True)

LLM para traducir y resumir

In [29]:
# Load variables from a local .env file into the environment and read the OpenAI key.
# NOTE(review): OPENAI_API_KEY is not passed to ChatOpenAI below; the client is
# presumably picking the key up from the environment — confirm.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize the chat model shared by both chains
llm = ChatOpenAI(model="gpt-4.1-2025-04-14")

# Prompt that translates one recipe (passed as JSON text in {var1}) into Spanish
translate_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an assistant that translates the name, description, ingredients, instructions, and average_rating of food recipes into Spanish. Respect the JSON format strictly. Also, ensure that you organize the recipe's instructions step by step in numeric bullets. Do not include any comments about your chain of thought."),
    ("user", "Translate the following recipe:\n{var1}")
])

# Prompt that returns English text unchanged and translates any other language to English
detect_language_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an assistant that detects the language of a given text. If the text is in English, return the exact text without any modifications or additional commentary. If the text is in any other language, translate it to English and return only the translated text in a straightforward manner. Do not include any comments about your chain of thought."),
    ("user", "Detect the language of the following text:\n{query}")
])

# Chains: prompt -> model -> plain string output
translate_chain = translate_prompt | llm | StrOutputParser()
detect_language_chain = detect_language_prompt | llm | StrOutputParser()


Función final + LLM

In [30]:
# Recommend recipes for a query in any language and translate them to Spanish
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if cleaned.startswith("```") and cleaned.endswith("```"):
        cleaned = cleaned[3:-3]
        # Drop an optional language tag (e.g. "json") on the opening fence line
        first_line, newline, rest = cleaned.partition("\n")
        if newline and first_line.strip().isalpha():
            cleaned = rest
    return cleaned.strip()


def recommend(query, n=3, alpha=0.6, beta=0.3, return_scores=False):
    """Recommend recipes for ``query`` and translate each one into Spanish.

    Parameters
    ----------
    query : str
        User query in any language; it is passed through
        ``detect_language_chain`` to obtain English text before ranking.
    n, alpha, beta, return_scores
        Forwarded unchanged to ``recommend_for_new_user``.

    Returns
    -------
    list
        One parsed JSON object per recommended recipe, in ranking order. The
        same list is also printed as indented JSON.

    Raises
    ------
    json.JSONDecodeError
        If a translation is not valid JSON after removing a surrounding
        Markdown code fence.
    """
    # Detect language and translate to English if necessary
    query = detect_language_chain.invoke({"query": query})

    # Get recommendations as a list of one dict per recipe
    recommendations = recommend_for_new_user(query, n, alpha, beta, return_scores)
    recommendations_json = recommendations.to_dict(orient="records")

    # Translate the recipes one by one and parse each answer back into Python data
    translated_recipes = []
    for recipe in recommendations_json:
        translated_json = translate_chain.invoke({"var1": json.dumps(recipe, ensure_ascii=False)})
        # Chat models often wrap JSON in a ``` fence, which json.loads rejects
        translated_recipes.append(json.loads(_strip_code_fence(translated_json)))

    print(json.dumps(translated_recipes, indent=2, ensure_ascii=False))
    return translated_recipes


Test model

In [31]:
# Smoke test with a Spanish query ("sweet Japanese food"); recommend() prints the
# translated recipes as JSON.
user_query = "comida japonesa dulce"
recommendations = recommend(user_query, n=3, return_scores=True)
recommendations

[
  {
    "nombre": "salsa japonesa de sésamo para mojar",
    "descripción": "salsa servida con barbacoa japonesa.",
    "ingredientes": "['salsa de soja', 'mirin', 'azúcar', 'aceite de sésamo', 'semillas de sésamo', 'ajo']",
    "instrucciones": [
      "1. Coloca todos los ingredientes en un bol.",
      "2. Mezcla bien."
    ],
    "calificación_promedio": 5.0
  },
  {
    "nombre": "ensalada japonesa de pepino (sunomono)",
    "descripción": "¡Excelente ensalada japonesa para acompañar cualquier comida japonesa, tailandesa o china!",
    "ingredientes": [
      "pepinos",
      "sal",
      "vinagre de arroz",
      "azúcar",
      "salsa de soja",
      "jengibre fresco"
    ],
    "instrucciones": [
      "1. Corta los pepinos en rodajas finas.",
      "2. Colócalos en un bol y espolvorea con sal.",
      "3. Deja reposar a temperatura ambiente durante 30 minutos o hasta que los pepinos estén suaves.",
      "4. Escurre y exprime el líquido sobrante.",
      "5. En un bol para s