### Modelo de Similitud Semántica + Popularidad + Similitud en Features Numéricas

In [66]:
# Import libraries
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import os
import pickle
import json
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_openai import ChatOpenAI

In [13]:
# Load the dataset from its pickle file and display its (rows, columns) shape.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_pickle(
    filepath_or_buffer="/Users/juanmontreuil/Desktop/AI Recipes/food.pkl"
)
data.shape

(212256, 36)

In [16]:
# Print a per-column summary of the frame (column names, non-null counts, dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212256 entries, 0 to 212255
Data columns (total 37 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   name               212256 non-null  object        
 1   id                 212256 non-null  object        
 2   minutes            212256 non-null  int64         
 3   contributor_id     212256 non-null  object        
 4   submitted          212256 non-null  datetime64[ns]
 5   tags               212256 non-null  object        
 6   nutrition          212256 non-null  object        
 7   n_steps            212256 non-null  int64         
 8   steps              212256 non-null  object        
 9   description        212256 non-null  object        
 10  ingredients        212256 non-null  object        
 11  n_ingredients      212256 non-null  int64         
 12  calories           212256 non-null  float64       
 13  total_fat          212256 non-null  float64 

Preparación de Embeddings

In [15]:
# Build the text that gets embedded: name, tags and description joined by single
# spaces, with missing values treated as empty strings.
data["label"] = (
    data["name"].fillna('')
    .add(" ").add(data["tags"].fillna(''))
    .add(" ").add(data["description"].fillna(''))
)

# Encode every label with the all-MiniLM-L6-v2 sentence-transformer model.
# This is the expensive step of the notebook (see the progress bar output below).
embedder = SentenceTransformer('all-MiniLM-L6-v2')
text_emb = embedder.encode(data["label"].to_list(), show_progress_bar=True)

Batches: 100%|██████████| 6633/6633 [13:22<00:00,  8.26it/s]


In [56]:
# Persist the computed embeddings and the embedder object as pickle files so they
# can be reloaded later without re-encoding.
# NOTE(review): absolute, machine-specific paths — consider a configurable data directory.
with open("/Users/juanmontreuil/Desktop/AI Recipes/text_emb.pkl", mode="wb") as emb_file:
    pickle.dump(text_emb, emb_file)
with open("/Users/juanmontreuil/Desktop/AI Recipes/embedder.pkl", mode="wb") as model_file:
    pickle.dump(embedder, model_file)

In [8]:
# Columns of `data` that are scaled and used as per-recipe features
feature_cols = [
    # Preparation effort
    'minutes', 'n_steps',
    # Nutrition values
    'calories', 'protein', 'total_fat', 'sodium', 'saturated_fat', 'carbohydrates',
    # Tag-style columns (presumably 0/1 flags — verify against the dataset)
    'dairy-free', 'gluten-free', 'healthy', 'fast-cooking',
    'vegetarian', 'vegan', 'large-portion', 'low-saturated-fat',
]

In [9]:
# Min-max scale the selected feature columns (MinMaxScaler with its default settings).
# The fitted scaler is kept so the same transformation can be reapplied later.
scaler = MinMaxScaler().fit(data[feature_cols])
scaled_features = scaler.transform(data[feature_cols])

Get Input

In [59]:
# Reload the dataset from the pickle file and preview its first five rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_pickle(
    filepath_or_buffer="/Users/juanmontreuil/Desktop/AI Recipes/food.pkl"
)
data.head(n=5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,low-cholesterol,low-saturated-fat,fast-cooking,vegan,vegetarian,large-portion,low-fat,recipe_id,average_rating,votes
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,...,0,0,0,0,1,0,0,137739,5.0,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,...,0,0,0,0,0,0,0,31490,4.666667,3
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,...,0,0,0,0,0,0,0,112140,4.0,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...",...,0,0,0,0,0,0,0,59389,4.5,2
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,...,0,0,0,0,1,0,0,44061,5.0,1


In [60]:
# Reload the cached embeddings and the pickled embedder object from disk.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles produced by a trusted source (here: the files written by this notebook).
with open("/Users/juanmontreuil/Desktop/AI Recipes/text_emb.pkl", mode="rb") as emb_file:
    text_emb = pickle.load(emb_file)
with open("/Users/juanmontreuil/Desktop/AI Recipes/embedder.pkl", mode="rb") as model_file:
    embedder = pickle.load(model_file)

Función output

In [61]:
def recommend_for_new_user(query, n=3, alpha=0.6, beta=0.3, return_scores=False):
    """Return the top-``n`` recipes for a free-text query using a hybrid score.

    The score of each recipe is::

        alpha * sim_text + beta * sim_features + (1 - alpha - beta) * pop_score

    where ``sim_text`` is the cosine similarity between the query embedding and
    the recipe embedding, ``sim_features`` is the row-wise mean of
    ``scaled_features`` (it does not depend on the query), and ``pop_score`` is
    ``average_rating`` divided by its maximum (missing ratings are replaced by
    the mean rating).

    Reads the notebook globals ``embedder``, ``text_emb``, ``scaled_features``
    and ``data``; none of them is modified.

    Parameters
    ----------
    query : str
        Text describing what the user is looking for.
    n : int, default 3
        Number of recipes to return.
    alpha : float, default 0.6
        Weight of the text similarity.
    beta : float, default 0.3
        Weight of the feature term. The popularity weight is ``1 - alpha - beta``.
    return_scores : bool, default False
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    pandas.DataFrame
        One row per recommended recipe, best first, with columns ``nombre``,
        ``descripción``, ``ingredientes``, ``instrucciones`` and
        ``calificación_promedio`` (rating rounded to two decimals).

    Raises
    ------
    ValueError
        If ``text_emb``, ``scaled_features`` and ``data`` do not have the same
        number of rows.
    """
    # Embed the query
    query_emb = embedder.encode([query])

    # Semantic similarity between the query and every recipe
    sim_text = cosine_similarity(query_emb, text_emb).flatten()

    # Feature term: mean of the scaled feature columns of each recipe
    sim_features = scaled_features.mean(axis=1)

    # The three score vectors are combined positionally, so they must line up
    # with the rows of `data` (cells of this notebook can be run out of order).
    n_rows = len(data)
    if len(sim_text) != n_rows or len(sim_features) != n_rows:
        raise ValueError(
            "text_emb, scaled_features and data must have the same number of rows "
            f"(got {len(sim_text)}, {len(sim_features)} and {n_rows})"
        )

    # Popularity: rating normalized by the maximum rating
    ratings = data["average_rating"]
    pop_score = ratings.fillna(ratings.mean()).to_numpy()
    pop_score = pop_score / pop_score.max()

    # Final score and indices of the n best recipes (highest score first)
    hybrid_score = alpha * sim_text + beta * sim_features + (1 - alpha - beta) * pop_score
    top_idx = np.argsort(hybrid_score)[::-1][:n]

    # Select the output columns in their final order. Round the rating on the
    # copy only: rounding the global frame in place would change the popularity
    # scores of every subsequent call.
    result = data.iloc[top_idx][["name", "description", "ingredients", "steps", "average_rating"]].copy()
    result["average_rating"] = result["average_rating"].round(2)

    result = result.rename(columns={'name': 'nombre', 'description': 'descripción', 'ingredients': 'ingredientes', 'average_rating': 'calificación_promedio', 'steps': 'instrucciones'})

    return result.reset_index(drop=True)

LLM para traducir y resumir

In [62]:
# Load variables from a .env file and read the OpenAI key from the environment.
# NOTE(review): OPENAI_API_KEY is not passed to ChatOpenAI below; presumably the
# client picks the key up from the environment itself — confirm.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat model shared by both chains
llm = ChatOpenAI(model="gpt-4.1-2025-04-14")

# Prompt that asks the model to translate one recipe (passed as {var1}) into Spanish
translate_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an assistant that translates the name, description, ingredients, instructions, and average_rating of food recipes into Spanish. Respect the JSON format strictly. Also, ensure that you organize the recipe's instructions step by step in numeric bullets. Do not include any comments about your chain of thought."),
        ("user", "Translate the following recipe:\n{var1}"),
    ]
)

# Prompt that returns the user text ({query}) unchanged if it is English,
# or its English translation otherwise
detect_language_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an assistant that detects the language of a given text. If the text is in English, return the exact text without any modifications or additional commentary. If the text is in any other language, translate it to English and return only the translated text in a straightforward manner. Do not include any comments about your chain of thought."),
        ("user", "Detect the language of the following text:\n{query}"),
    ]
)

# Chains: prompt -> chat model -> plain string output
translate_chain = translate_prompt.pipe(llm, StrOutputParser())
detect_language_chain = detect_language_prompt.pipe(llm, StrOutputParser())


Función final + LLM

In [63]:
# Translate to Spanish + format the recommendations with the LLM
def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from an LLM reply."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        # Drop the optional language tag that follows the opening fence
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    return cleaned.strip()


def recommend(query, n=3, alpha=0.6, beta=0.3, return_scores=False):
    """Recommend recipes for a query in any language and translate them to Spanish.

    The query is first passed through ``detect_language_chain`` (which returns
    it in English), then ``recommend_for_new_user`` selects the recipes, and
    each recipe is sent as JSON to ``translate_chain``. The translated recipes
    are printed as pretty JSON and also returned.

    Parameters
    ----------
    query : str
        User request, in any language.
    n, alpha, beta, return_scores
        Forwarded unchanged to ``recommend_for_new_user``.

    Returns
    -------
    list[dict]
        One dictionary per recommended recipe, parsed from the model's reply.

    Raises
    ------
    json.JSONDecodeError
        If a reply of the translation chain is not valid JSON (after removing
        an optional Markdown code fence).
    """
    # Detect language and translate to English if necessary
    query = detect_language_chain.invoke({"query": query})

    # Get recommendations as a list of records (one dict per recipe)
    recommendations = recommend_for_new_user(query, n, alpha, beta, return_scores)
    records = recommendations.to_dict(orient="records")

    # Translate each recipe with the LLM chain
    translated_recipes = []
    for recipe in records:
        reply = translate_chain.invoke({"var1": json.dumps(recipe, ensure_ascii=False)})
        # Chat models often wrap JSON in a ```json fence; strip it before parsing
        translated_recipes.append(json.loads(_strip_code_fence(reply)))

    print(json.dumps(translated_recipes, indent=2, ensure_ascii=False))

    # Return the parsed recipes so callers can use them (previously returned None)
    return translated_recipes


Test model

In [65]:
# End-to-end check of the pipeline with a query written in Spanish
user_query = "comida japonesa dulce"
recommendations = recommend(query=user_query, n=3, return_scores=True)
recommendations

[
  {
    "nombre": "ya taki sushi",
    "descripción": "una combinación fantástica descubierta por accidente un día. Esto también se puede poner dentro de bolas de arroz.",
    "ingredientes": [
      "arroz para sushi",
      "nori",
      "batata",
      "setas shiitake",
      "salsa de soja",
      "mirin",
      "aceite de sésamo"
    ],
    "instrucciones": [
      "1. Hierve la batata hasta que esté tierna.",
      "2. Cocina al vapor las setas hasta que estén tiernas.",
      "3. Coloca la batata, las setas y los líquidos en un bol y mézclalos bien, puedes machacarlos o batirlos juntos.",
      "4. Si tienes una esterilla para sushi, ponla sobre la mesa; si no, coloca un paño de cocina limpio y cúbrelo con film transparente.",
      "5. Coloca una hoja de nori encima con el lado brillante hacia abajo, si no está tostada puedes pasarla brevemente por una llama.",
      "6. Pon una capa de arroz para sushi encima.",
      "7. Para sushi delgado, extiende el arroz en una capa de 