In [56]:
import os
import requests
import xml.etree.ElementTree as ET
import re
import fitz 
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

import faiss

  from .autonotebook import tqdm as notebook_tqdm


### Llenar el txt de páginas

In [20]:
# Cargar el archivo .xml
tree = ET.parse('sitemap_1.xml')
root = tree.getroot()
print(root)

# Es la ruta del root
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Regex para obtener solo las urls de recetas
pattern = re.compile(r'^https://www\.allrecipes\.com/recipe/\d+/.+/$')

# Buscar desde la raíz guardada en namespace, todos los elementos con url
urls = root.findall('ns:url/ns:loc', namespaces=namespace)

# Limiar las url que se guardan
max_urls = 500
count = 1

# Escribir en urls.txt
with open('urls.txt', 'w', encoding='utf-8') as file:
    for url in urls:
        url_text = url.text.strip()
        if pattern.match(url_text):
            file.write(url_text + '\n')
            count += 1
            if count >= max_urls:
                break  # salir cuando llegue a max_urls

<Element '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' at 0x0000017982D2F7E0>


### Obtener los archivos en base al txt

In [21]:
# Dirección del txt de donde lee y la carpeta donde guarda
urls_file = "urls.txt"
output_dir = "saved_html"

# Para que el servidor pinese que la solicitud es de navegador (no funcionó sin esto)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}
# Usar el encabezad http
with open(urls_file, "r", encoding="utf-8") as f:
    url_list = [line.strip() for line in f if line.strip()]

# Recorrer las url
for i, url in enumerate(url_list, start=1):
    try:
        response = requests.get(url, headers=headers, timeout=10) # Hacer solicitud
        response.raise_for_status()                               # Si la solicitud da error, tirar excepción 
        file_name = os.path.join(output_dir, f"page_{i}.html")    # Crear el archivo donde se va a guardar
        # Guardar el archivo
        with open(file_name, "w", encoding="utf-8") as f:         
            f.write(response.text)
        print(f"Saved: {file_name}")
    except Exception as e:
        print(f"Error with {url}: {e}")


Saved: saved_html\page_1.html
Saved: saved_html\page_2.html
Saved: saved_html\page_3.html
Saved: saved_html\page_4.html
Saved: saved_html\page_5.html
Saved: saved_html\page_6.html
Saved: saved_html\page_7.html
Saved: saved_html\page_8.html
Saved: saved_html\page_9.html
Saved: saved_html\page_10.html
Saved: saved_html\page_11.html
Saved: saved_html\page_12.html
Saved: saved_html\page_13.html
Saved: saved_html\page_14.html
Saved: saved_html\page_15.html
Saved: saved_html\page_16.html
Saved: saved_html\page_17.html
Saved: saved_html\page_18.html
Saved: saved_html\page_19.html
Saved: saved_html\page_20.html
Saved: saved_html\page_21.html
Saved: saved_html\page_22.html
Saved: saved_html\page_23.html
Saved: saved_html\page_24.html
Saved: saved_html\page_25.html
Saved: saved_html\page_26.html
Saved: saved_html\page_27.html
Saved: saved_html\page_28.html
Saved: saved_html\page_29.html
Saved: saved_html\page_30.html
Saved: saved_html\page_31.html
Saved: saved_html\page_32.html
Saved: saved_html

### Extraer la información de los html

In [22]:

def extract_information(html):
    soup = BeautifulSoup(html, "html.parser")

    # Recuperar título del platillo
    title = soup.find("meta", {"property": "og:title"})["content"]

    # Recuperar descripción del platillo
    description = soup.find("meta", {"property": "og:description"})["content"]

    # Recuperar los ingredientes
    ingredient_section = soup.find_all("li", class_="mm-recipes-structured-ingredients__list-item")
    ingredients = [ingredient.text.strip() for ingredient in ingredient_section]

    # Recuperar la valoración del platillo
    valoration_section = soup.find("div", attrs={
        "data-tracking-category": "User Recipe Action",
        "class": "comp mm-recipes-review-bar__rating mntl-text-block text-label-300"
    })
    valoration = valoration_section.text.strip() if valoration_section else None

    # Inicializar variables de time y servings
    time = None
    servings = None

    # Buscar 'Total Time' y 'Servings'
    for item in soup.select(".mm-recipes-details__item"):
        label = item.select_one(".mm-recipes-details__label").text.strip().rstrip(':')
        value = item.select_one(".mm-recipes-details__value").text.strip()

        if label == "Total Time":
            time = value
        if label == "Servings":
            servings = value

    return {
        "title": title,
        "description": description,
        "ingredients": ingredients,
        "valoration": valoration,
        "time": time,
        "servings": servings
    }


### Construir corpus en dataframe

In [25]:
folder_path = "./saved_html"

# Guardar el corpus
corpus = []

# Iterar sobre cada archivo .html
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            html = file.read()
            info = extract_information(html)
            corpus.append(info)
    except Exception as e:
        print(f"Error en {filename}: {e}")

# Crear dataframe 
df = pd.DataFrame(corpus)

df

Unnamed: 0,title,description,ingredients,valoration,time,servings
0,Grilled Italian Sausage,A cast iron skillet is all you'll need to make...,"[1 (12 ounce) package Italian sausage links, 2...",4.0,25 mins,3
1,Watermelon and Tomato Feta Salad,This watermelon and tomato feta salad recipe i...,"[1 (4 pound) seedless watermelon, peeled and c...",4.1,20 mins,8
2,Coffee Ice Cream,This coffee ice cream recipe steeps coffee in ...,"[2 cups heavy cream, divided, ¾ cup coffee bea...",5.0,18 hrs 45 mins,8
3,Stuffed Eggplant Parmesan,This stuffed eggplant recipe features a fillin...,"[2 medium eggplants, cut in half lengthwise an...",4.6,1 hr 5 mins,4
4,Radish Green Frittata,Radish greens and leeks are sautéed and then f...,"[1 tablespoon olive oil, 2 ounces leeks, white...",4.0,40 mins,4
...,...,...,...,...,...,...
494,Easy Grilled Chicken Wings,These easy grilled chicken wings are perfect f...,"[20 chicken wings, 2 tablespoons olive oil, o...",5.0,40 mins,20
495,Apricot Lavender Jam,This delicious apricot jam with lavender makes...,"[4 ½ cups white sugar, divided, 1 tablespoon d...",5.0,1 day 8 hrs 55 mins,128
496,Summer Strawberry Buckle,This simple strawberry buckle is the ultimate ...,"[¾ cup white sugar, ¼ cup butter, softened, 1 ...",4.5,45 mins,8
497,Chef John's Stuffed Peppers,This is the best stuffed peppers recipe that s...,"[1 cup uncooked long grain white rice, 2 cups ...",4.8,1 hr 50 mins,8


### Preprocesamiento

In [49]:
df['raw'] = df['title'] + '. ' + df['description'] + ' Ingredients: ' + df['ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

df

Unnamed: 0,title,description,ingredients,valoration,time,servings,raw
0,Grilled Italian Sausage,A cast iron skillet is all you'll need to make...,"[1 (12 ounce) package Italian sausage links, 2...",4.0,25 mins,3,Grilled Italian Sausage. A cast iron skillet i...
1,Watermelon and Tomato Feta Salad,This watermelon and tomato feta salad recipe i...,"[1 (4 pound) seedless watermelon, peeled and c...",4.1,20 mins,8,Watermelon and Tomato Feta Salad. This waterme...
2,Coffee Ice Cream,This coffee ice cream recipe steeps coffee in ...,"[2 cups heavy cream, divided, ¾ cup coffee bea...",5.0,18 hrs 45 mins,8,Coffee Ice Cream. This coffee ice cream recipe...
3,Stuffed Eggplant Parmesan,This stuffed eggplant recipe features a fillin...,"[2 medium eggplants, cut in half lengthwise an...",4.6,1 hr 5 mins,4,Stuffed Eggplant Parmesan. This stuffed eggpla...
4,Radish Green Frittata,Radish greens and leeks are sautéed and then f...,"[1 tablespoon olive oil, 2 ounces leeks, white...",4.0,40 mins,4,Radish Green Frittata. Radish greens and leeks...
...,...,...,...,...,...,...,...
494,Easy Grilled Chicken Wings,These easy grilled chicken wings are perfect f...,"[20 chicken wings, 2 tablespoons olive oil, o...",5.0,40 mins,20,Easy Grilled Chicken Wings. These easy grilled...
495,Apricot Lavender Jam,This delicious apricot jam with lavender makes...,"[4 ½ cups white sugar, divided, 1 tablespoon d...",5.0,1 day 8 hrs 55 mins,128,Apricot Lavender Jam. This delicious apricot j...
496,Summer Strawberry Buckle,This simple strawberry buckle is the ultimate ...,"[¾ cup white sugar, ¼ cup butter, softened, 1 ...",4.5,45 mins,8,Summer Strawberry Buckle. This simple strawber...
497,Chef John's Stuffed Peppers,This is the best stuffed peppers recipe that s...,"[1 cup uncooked long grain white rice, 2 cups ...",4.8,1 hr 50 mins,8,Chef John's Stuffed Peppers. This is the best ...


In [78]:
# Diccionario de fracciones Unicode a float
unicode_fractions = {
    '½': 0.5,
    '⅓': 1/3,
    '⅔': 2/3,
    '¼': 0.25,
    '¾': 0.75,
    '⅕': 0.2,
    '⅖': 0.4,
    '⅗': 0.6,
    '⅘': 0.8,
    '⅙': 1/6,
    '⅚': 5/6,
    '⅛': 0.125,
    '⅜': 0.375,
    '⅝': 0.625,
    '⅞': 0.875,
}

# Función para convertir números fraccionarios en unicode a decimales
def convert_unicode_fractions(text):
    def replace_match(match):
        parts = match.group(0).split()
        if len(parts) == 2:
            # Ej. "4 ½"
            whole = int(parts[0])
            fraction = unicode_fractions.get(parts[1], 0)
            decimal = whole + fraction
        else:
            # Ej. "½"
            decimal = unicode_fractions.get(parts[0], 0)
        
        return f"{round(decimal, 2):.2f}"  # redondeo y formato a 2 decimales

    # Buscar patrones como "4 ½" o "½"
    pattern = re.compile(r'(\d+\s)?[{}]'.format(''.join(unicode_fractions.keys())))
    return pattern.sub(replace_match, text)

In [57]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_doc(doc):
    tokens = regexp_tokenize(doc.lower(), r'\w+')                       # tokenizar
    tokens = [token for token in tokens if token not in stop_words]     # eliminar stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens]                  # Lematización
    return ' '.join(tokens)

In [83]:
df["raw_w/o_unicode"] = df["raw"].apply(convert_unicode_fractions)
df["preprocessed"] = df["raw_w/o_unicode"].apply(preprocess_doc)
df

Unnamed: 0,title,description,ingredients,valoration,time,servings,raw,raw_w/o_unicode,preprocessed
0,Grilled Italian Sausage,A cast iron skillet is all you'll need to make...,"[1 (12 ounce) package Italian sausage links, 2...",4.0,25 mins,3,Grilled Italian Sausage. A cast iron skillet i...,Grilled Italian Sausage. A cast iron skillet i...,grilled italian sausage cast iron skillet need...
1,Watermelon and Tomato Feta Salad,This watermelon and tomato feta salad recipe i...,"[1 (4 pound) seedless watermelon, peeled and c...",4.1,20 mins,8,Watermelon and Tomato Feta Salad. This waterme...,Watermelon and Tomato Feta Salad. This waterme...,watermelon tomato feta salad watermelon tomato...
2,Coffee Ice Cream,This coffee ice cream recipe steeps coffee in ...,"[2 cups heavy cream, divided, ¾ cup coffee bea...",5.0,18 hrs 45 mins,8,Coffee Ice Cream. This coffee ice cream recipe...,Coffee Ice Cream. This coffee ice cream recipe...,coffee ice cream coffee ice cream recipe steep...
3,Stuffed Eggplant Parmesan,This stuffed eggplant recipe features a fillin...,"[2 medium eggplants, cut in half lengthwise an...",4.6,1 hr 5 mins,4,Stuffed Eggplant Parmesan. This stuffed eggpla...,Stuffed Eggplant Parmesan. This stuffed eggpla...,stuffed eggplant parmesan stuffed eggplant rec...
4,Radish Green Frittata,Radish greens and leeks are sautéed and then f...,"[1 tablespoon olive oil, 2 ounces leeks, white...",4.0,40 mins,4,Radish Green Frittata. Radish greens and leeks...,Radish Green Frittata. Radish greens and leeks...,radish green frittata radish green leek sautée...
...,...,...,...,...,...,...,...,...,...
494,Easy Grilled Chicken Wings,These easy grilled chicken wings are perfect f...,"[20 chicken wings, 2 tablespoons olive oil, o...",5.0,40 mins,20,Easy Grilled Chicken Wings. These easy grilled...,Easy Grilled Chicken Wings. These easy grilled...,easy grilled chicken wing easy grilled chicken...
495,Apricot Lavender Jam,This delicious apricot jam with lavender makes...,"[4 ½ cups white sugar, divided, 1 tablespoon d...",5.0,1 day 8 hrs 55 mins,128,Apricot Lavender Jam. This delicious apricot j...,Apricot Lavender Jam. This delicious apricot j...,apricot lavender jam delicious apricot jam lav...
496,Summer Strawberry Buckle,This simple strawberry buckle is the ultimate ...,"[¾ cup white sugar, ¼ cup butter, softened, 1 ...",4.5,45 mins,8,Summer Strawberry Buckle. This simple strawber...,Summer Strawberry Buckle. This simple strawber...,summer strawberry buckle simple strawberry buc...
497,Chef John's Stuffed Peppers,This is the best stuffed peppers recipe that s...,"[1 cup uncooked long grain white rice, 2 cups ...",4.8,1 hr 50 mins,8,Chef John's Stuffed Peppers. This is the best ...,Chef John's Stuffed Peppers. This is the best ...,chef john stuffed pepper best stuffed pepper r...


### Carga del modelo de embedding

In [84]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [86]:
query = "pepperoni pizza"
query_emb = model.encode(query)
print(query_emb)

[-1.11345619e-01  4.19089608e-02 -4.07155789e-02  8.55102763e-02
 -6.86945096e-02  2.94043496e-02  9.83209349e-03  4.03417237e-02
  1.40147125e-02 -1.23610355e-01  5.37547953e-02 -9.69437584e-02
  3.45474184e-02  1.31831132e-02  2.65819840e-02 -8.78858864e-02
  1.04220927e-01  3.53138102e-03  2.85466458e-03 -7.79616088e-03
 -8.13834295e-02 -4.84555773e-02 -4.30481434e-02 -4.60939715e-03
  5.44216558e-02  1.38844177e-01  4.71816175e-02  3.81750800e-02
 -3.88631523e-02 -4.86818664e-02 -1.02784187e-02  4.66574319e-02
  1.21300012e-01  9.57045052e-03 -2.98526441e-03 -1.25743626e-02
  5.94118387e-02 -4.19300906e-02  1.40979111e-01 -9.82427318e-03
  2.74148677e-02 -1.25679746e-02  9.49107707e-02 -6.40578642e-02
  8.01342577e-02  2.29159873e-02 -6.19957503e-03 -1.13470657e-02
  2.03016195e-02  1.77209750e-02 -5.15672229e-02  3.87585424e-02
 -1.82473902e-02  2.42547765e-02  2.59807035e-02 -7.13633746e-02
  6.92288391e-03 -7.29555078e-03  2.36795358e-02  1.20054130e-02
 -2.32913513e-02 -1.26494

In [87]:
embeddings = model.encode(df['preprocessed'].tolist())
df['embeddings'] = embeddings.tolist()
print(embeddings)

[[-1.1293749e-01 -1.6892465e-02 -1.1840083e-03 ...  2.2141743e-02
  -7.6258980e-02 -1.4191865e-02]
 [-7.6816857e-02 -1.2062738e-03 -1.9378030e-03 ...  9.9157862e-02
   3.0194011e-02 -3.1865072e-03]
 [-3.6427129e-02 -5.2296214e-02  7.5514086e-02 ...  4.1464675e-02
  -1.6839100e-02 -1.5912566e-01]
 ...
 [ 2.7657492e-02 -4.4129960e-02  3.4469545e-02 ... -5.5293165e-02
   2.3899996e-03 -7.9435892e-02]
 [-9.4266221e-02  1.8770717e-03  3.2128070e-02 ...  2.0284139e-02
  -2.1241929e-02  2.5361167e-02]
 [-7.9798952e-02 -6.9448970e-02  2.7667699e-02 ...  4.4131983e-02
  -2.3582913e-02 -6.2495892e-05]]


### Implementación FAISS para VectorDB

In [88]:
dimension = embeddings.shape[1]
print(dimension)

index = faiss.IndexFlatL2(dimension)  # Distancia euclideana

index.add(np.array(embeddings))       # Agregar los embeddings al índice
print(index)

384
<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000017A643C3000> >


### Búsqueda con FAISS

In [89]:
K = 10
D, I = index.search(np.array([query_emb], dtype='float32'), K)

# Solo imprimir para ver que onda
for i in I[0]:
    print(df["title"][i])

print(I)

Pepperoni Pizza Pasta Salad
Stuffed Cherry Peppers
Pizza Sauce with Fresh Tomatoes
Rachel's Pizza Pasta Salad
Chef John's Stuffed Peppers
Kelsey's Favorite Stuffed Green Peppers
Breaded Zucchini Pizza Bites
Italian Meatball Sandwich Casserole
Greek Watermelon Pizza
Best Tomato Pie
[[ 68  19  38 411 497 367 291 288 490  64]]


### LLM

In [None]:
# Objeto tipo cliente de OpenAi
client = OpenAI(api_key="***")

In [92]:
print(I[0])
context = "\n\n".join(df.loc[I[0], "raw_w/o_unicode"].values)
print(context)

[ 68  19  38 411 497 367 291 288 490  64]
Pepperoni Pizza Pasta Salad. If you like pepperoni pizza with bell peppers and mushrooms, you'll love this pasta salad. Customize it to match your favorite pizza! Ingredients: 12 ounces penne pasta, 1 small green bell pepper, chopped, 1 small red bell pepper, chopped, 1  tomato, chopped, 1 (4 ounce) can sliced mushrooms, drained, 0.25 cup sliced black olives, 1 (3 ounce) package pepperoni slices, cut into quarters, 0.50 cup mayonnaise, 0.50 cup Italian-style salad dressing, 2 cloves garlic, minced, 1 teaspoon dried oregano, 0.50 teaspoon salt, 0.25 teaspoon ground black pepper, 1 cup shredded mozzarella cheese, 2 tablespoons grated Parmesan cheese

Stuffed Cherry Peppers. These stuffed cherry peppers are filled with prosciutto and provolone cheese for a delicious bite-sized Italian appetizer bursting with flavor. Ingredients: 1 cup extra virgin olive oil, 12  fresh cherry peppers, 6 ounces sharp provolone cheese, cubed, 6 ounces prosciutto, thi

In [93]:
prompt = f"""Eres una aplicación de tipo Retrieval-Augmented Generation (RAG) que siempre responde en español.
Tu tarea es ayudar al usuario a encontrar la receta que necesita, utilizando únicamente el contexto proporcionado. 
La respuesta debe incluir el nombre de la receta, una breve descripción y los ingredientes.
Si la información solicitada no se encuentra en el contexto, responde con: "Lo siento, no encontré una receta relacionada con tu consulta."

Contexto:
{context}

Pregunta del usuario:
{query}
"""

In [94]:
response = client.responses.create(
    model="gpt-4.1",
    input= prompt
)

print(response.output_text)

Nombre de la receta: Pepperoni Pizza Pasta Salad

Descripción: Si te gusta la pizza de pepperoni con pimientos y champiñones, te encantará esta ensalada de pasta. ¡Puedes personalizarla para que coincida con tus ingredientes favoritos de pizza!

Ingredientes:
- 12 onzas de pasta penne
- 1 pimiento verde pequeño, picado
- 1 pimiento rojo pequeño, picado
- 1 tomate, picado
- 1 lata (4 onzas) de champiñones en rodajas, escurridos
- 1/4 taza de aceitunas negras en rodajas
- 1 paquete (3 onzas) de rodajas de pepperoni, cortadas en cuartos
- 1/2 taza de mayonesa
- 1/2 taza de aderezo tipo italiano para ensalada
- 2 dientes de ajo, picados
- 1 cucharadita de orégano seco
- 1/2 cucharadita de sal
- 1/4 cucharadita de pimienta negra molida
- 1 taza de queso mozzarella rallado
- 2 cucharadas de queso parmesano rallado
