# Web Scrapping

Vickiann Jiménez

In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re


In [111]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


In [115]:
url = "https://www.allrecipes.com/recipes-a-z-6735880"
html = urlopen(url)

In [116]:
soup = BeautifulSoup(html, 'lxml')
type(soup)


bs4.BeautifulSoup

In [117]:
# Título de la página
title = soup.title
print(title)


<title>Recipes A-Z | Allrecipes.com</title>


Links encontrados en la página

In [118]:
recipies_links= soup.find_all('a', href = True)
print(recipies_links)

[<a class="mntl-skip-to-content mntl-text-link" data-tracking-container="true" href="#main" id="mntl-skip-to-content_1-0" rel="nocaes"><span class="link__wrapper">Skip to content</span></a>, <a aria-label="Visit Allrecipes' homepage" href="/" id="header-logo_1-0" rel="home nocaes"> <div class="is-screenreader-only">Allrecipes</div>
<svg class="icon icon-logo">
<use href="#icon-logo" xlink:href="#icon-logo" xmlns:xlink="http://www.w3.org/1999/xlink"></use>
</svg></a>, <a class="mntl-utility-nav__sublist-link" href="https://www.allrecipes.com/authentication/login?regSource=3675&amp;relativeRedirectUrl=%2Frecipes-a-z-6735880" rel="nofollow nocaes"> <svg class="icon icon-account">
<use href="#icon-account" xlink:href="#icon-account" xmlns:xlink="http://www.w3.org/1999/xlink"></use>
</svg>
<span class="mntl-utility-nav__sublist-link-text">
Log In
</span>
</a>, <a class="mntl-utility-nav__sublist-link" href="https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2Frecipes-a-z-

Todos los links encontrados en la página

In [119]:

all_links = soup.find_all("a")
for link in all_links:
    print(link.get("href"))


#main
/
https://www.allrecipes.com/authentication/login?regSource=3675&relativeRedirectUrl=%2Frecipes-a-z-6735880
https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2Frecipes-a-z-6735880
/account/profile
/favorites
/account/add-recipe
https://websupport.meredith.com/
https://www.magazines.com/allrecipes-magazine.html?utm_source=allrecipes.com&utm_medium=owned&utm_campaign=i111arr1w2661
https://w1.buysub.com/servlet/CSGateway?cds_mag_code=ALR
https://www.magazines.com/allrecipes-magazine.html
https://www.magazines.com/customer-care
#
/sweepstakes
/
https://www.allrecipes.com/recipes/17562/dinner/
https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/
https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/
https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/
https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/
https://www.allrecipes.com/recipes/17889/e

Encontrar los urls con /recipes/

In [120]:
# Filtrar enlaces que contienen la palabra "recipes"
filtered_links = [link['href'] for link in all_links if '/recipes/' in link['href']]

# Mostrar el número de enlaces filtrados y algunos ejemplos
print(f"Cantidad de enlaces que contienen 'recipes': {len(filtered_links)}")
print("Algunos enlaces filtrados:", filtered_links[:10])  # Mostrar los primeros 10 enlaces


Cantidad de enlaces que contienen 'recipes': 480
Algunos enlaces filtrados: ['https://www.allrecipes.com/recipes/17562/dinner/', 'https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/', 'https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/', 'https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/', 'https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/', 'https://www.allrecipes.com/recipes/17889/everyday-cooking/family-friendly/family-dinners/', 'https://www.allrecipes.com/recipes/94/soups-stews-and-chili/', 'https://www.allrecipes.com/recipes/16099/everyday-cooking/comfort-food/', 'https://www.allrecipes.com/recipes/80/main-dish/', 'https://www.allrecipes.com/recipes/22992/everyday-cooking/sheet-pan-dinners/']


In [121]:
# Crear el DataFrame con tres columnas: 'categoria' y 'url'
data = {
    "categoria": [re.search(r'/([^/]+)/$', link).group(1) for link in filtered_links],  # Extraer la categoría
    "url": filtered_links
}

df = pd.DataFrame(data)

# Ajustar configuración de pandas para no truncar las URLs
pd.set_option('display.max_colwidth', None)  # No truncar las cadenas largas en las columnas

# Mostrar el DataFrame
print(df.head())  # Mostrar los primeros 5 registros

         categoria  \
0           dinner   
1      main-dishes   
2    one-pot-meals   
3   quick-and-easy   
4  30-minute-meals   

                                                                                                    url  
0                                                      https://www.allrecipes.com/recipes/17562/dinner/  
1  https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/  
2                              https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/  
3                              https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/  
4              https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/  


En cada enlace encontrar los subenlaces, buscar las recetas 

In [122]:
# Crear un diccionario para almacenar los subenlaces encontrados en cada página
all_sub_links = {}
total_extracted_links = 0  # Contador para el total de enlaces extraídos
max_links = 600  # Límite máximo de enlaces a extraer

# Recorrer los enlaces filtrados
for link in filtered_links:
    if total_extracted_links >= max_links:  # Detener si alcanzamos el límite
        break

    try:
        print(f"Visitando: {link}")
        sub_html = urlopen(link)  # Abrir el enlace
        sub_soup = BeautifulSoup(sub_html, 'lxml')

        # Encontrar todos los elementos <a> que son recetas
        recipe_items = sub_soup.find_all('a', class_='mntl-card-list-items')

        sub_links = []
        for item in recipe_items:
            # Verificar si el contenedor tiene rating
            if item.find('div', class_='mntl-recipe-star-rating'):  # Verifica que contiene estrellas de rating
                if item.has_attr('href'):  # Verificar que tenga atributo href
                    sub_links.append(item['href'])

        # Limitar los enlaces si estamos cerca del límite total
        remaining_links = max_links - total_extracted_links
        if len(sub_links) > remaining_links:
            sub_links = sub_links[:remaining_links]

        # Guardar los enlaces encontrados en un diccionario
        all_sub_links[link] = sub_links
        total_extracted_links += len(sub_links)  # Incrementar el contador
        print(f"Enlaces encontrados con rating en {link}: {len(sub_links)}")
    except Exception as e:
        print(f"Error al visitar {link}: {e}")

# Mostrar resultados
print(f"\nTotal de enlaces extraídos con rating: {total_extracted_links}")
for main_link, sub_links in all_sub_links.items():
    print(f"\nEnlace principal: {main_link}")
    print(f"Enlaces encontrados: {len(sub_links)}")
    print("Ejemplo de enlaces:", sub_links[:5])  # Mostrar los primeros 5 enlaces

Visitando: https://www.allrecipes.com/recipes/17562/dinner/
Enlaces encontrados con rating en https://www.allrecipes.com/recipes/17562/dinner/: 48
Visitando: https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/
Enlaces encontrados con rating en https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/: 67
Visitando: https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/
Enlaces encontrados con rating en https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/: 60
Visitando: https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/
Enlaces encontrados con rating en https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/: 48
Visitando: https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/
Enlaces encontrados con rating en https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-min

In [123]:
# Combinar todos los subenlaces en una lista plana
recipe_links = [sub_link for sub_links in all_sub_links.values() for sub_link in sub_links]

print(f"Total de enlaces recopilados: {len(recipe_links)}")
print("Ejemplo de enlaces:", recipe_links[:5])  # Mostrar los primeros 5 enlaces

Total de enlaces recopilados: 552
Ejemplo de enlaces: ['https://www.allrecipes.com/chicken-bacon-ranch-tater-tot-casserole-recipe-8751351', 'https://www.allrecipes.com/homemade-spaghetti-o-s-recipe-8695137', 'https://www.allrecipes.com/sloppy-joe-cornbread-casserole-recipe-8672792', 'https://www.allrecipes.com/buffalo-chicken-tater-tot-casserole-recipe-8672248', 'https://www.allrecipes.com/trout-tacos-recipe-8651121']


Extraer los datos de las páginas de recetas encontradas

In [124]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Lista para almacenar los datos extraídos
recipes_data = []

for link in recipe_links:
    try:
        print(f"Procesando: {link}")
        html = urlopen(link)  # Abrir el enlace
        soup = BeautifulSoup(html, 'lxml')

        # Extraer Recipe ID desde el enlace (al final o entre diagonales)
        recipe_id_match = re.search(r'/(\d+)/|(\d+)$', link)
        recipe_id = recipe_id_match.group(1) if recipe_id_match and recipe_id_match.group(1) else (
            recipe_id_match.group(2) if recipe_id_match and recipe_id_match.group(2) else 'Unknown'
        )
        # Extraer nombre de la receta
        recipe_name_tag = soup.find('h1', class_='article-heading text-headline-400')
        recipe_name = recipe_name_tag.text.strip() if recipe_name_tag else 'Unknown'

        # Extraer lista de ingredientes
        ingredients_list = soup.find('ul', class_='mm-recipes-structured-ingredients__list')
        ingredients = [
            li.text.strip() for li in ingredients_list.find_all('li')
        ] if ingredients_list else ['No ingredients found']

        # Extraer pasos de preparación
        steps_div = soup.find('div', class_='mm-recipes-steps')
        preparation_steps = [
            step.text.strip() for step in steps_div.find_all('p')
        ] if steps_div else ['No preparation steps found']

        # Almacenar los datos en una lista
        recipes_data.append({
            'Recipe ID': recipe_id,
            'Recipe Name': recipe_name,
            'Ingredients': ingredients,
            'Preparation Steps': preparation_steps,
            'URL': link
        })

    except Exception as e:
        print(f"Error procesando {link}: {e}")

# Crear un DataFrame con los datos extraídos
df_recipes = pd.DataFrame(recipes_data)

# Guardar el DataFrame en un archivo CSV
df_recipes.to_csv('extracted_recipes.csv', index=False)

# Mostrar el DataFrame directamente en la consola
print(df_recipes.head())  # Muestra las primeras 5 filas


Procesando: https://www.allrecipes.com/chicken-bacon-ranch-tater-tot-casserole-recipe-8751351
Procesando: https://www.allrecipes.com/homemade-spaghetti-o-s-recipe-8695137
Procesando: https://www.allrecipes.com/sloppy-joe-cornbread-casserole-recipe-8672792
Procesando: https://www.allrecipes.com/buffalo-chicken-tater-tot-casserole-recipe-8672248
Procesando: https://www.allrecipes.com/trout-tacos-recipe-8651121
Procesando: https://www.allrecipes.com/slow-cooker-honey-garlic-chicken-noodles-recipe-8629517
Procesando: https://www.allrecipes.com/teriyaki-salmon-bowl-recipe-8624279
Procesando: https://www.allrecipes.com/marry-me-chicken-tortellini-recipe-8601280
Procesando: https://www.allrecipes.com/copycat-mcdonalds-filet-o-fish-sandwich-recipe-8580443
Procesando: https://www.allrecipes.com/parmesan-crusted-baked-fish-recipe-8584575
Procesando: https://www.allrecipes.com/chicken-apple-sausage-sheet-pan-dinner-recipe-8558130
Procesando: https://www.allrecipes.com/recipe/212498/easy-chicken-a

Guardar los datos en un archivo CSV

In [23]:
# Cargar el archivo CSV
df_recipes = pd.read_csv('extracted_recipes.csv')

# Eliminar duplicados basándose en el Recipe ID
# La opción `keep='first'` mantiene la primera aparición del duplicado
df_cleaned = df_recipes.drop_duplicates(subset=['Recipe ID'], keep='first')

# Guardar el DataFrame depurado en un nuevo archivo CSV
df_cleaned.to_csv('cleaned_recipes.csv', index=False)

# Mostrar resultados
print(f"Total original de recetas: {len(df_recipes)}")
print(f"Total después de eliminar duplicados: {len(df_cleaned)}")

Total original de recetas: 552
Total después de eliminar duplicados: 498


In [4]:
# Cargar datos
df_data_recipes = pd.read_csv('cleaned_recipes.csv')

## Preprocesamiento de Datos

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = stop_words = set(stopwords.words('english'))

In [16]:
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenizar palabras
    tokens = word_tokenize(text)
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens


In [17]:
df_data_recipes.head()

Unnamed: 0,Recipe ID,Recipe Name,Ingredients,Preparation Steps,URL
0,8751351,Chicken Bacon Ranch Tater Tot Casserole,"['cooking spray', '1 (32 ounce) package frozen...",['Gather all ingredients. Preheat oven to 450 ...,https://www.allrecipes.com/chicken-bacon-ranch...
1,8695137,Homemade Spaghetti-O's,['16 ounces ring-shaped pasta \xa0(Anelli Sici...,['Fill a large pot with lightly salted water a...,https://www.allrecipes.com/homemade-spaghetti-...
2,8672792,Sloppy Joe Cornbread Casserole,"['cooking spray', '2 pounds ground chuck', '2 ...",['Gather all ingredients. Preheat the oven to ...,https://www.allrecipes.com/sloppy-joe-cornbrea...
3,8672248,Buffalo Chicken Tater Tot Casserole,"['cooking spray', '1 (10.5 ounce) can cream of...",['Gather all ingredients. Preheat oven to 425 ...,https://www.allrecipes.com/buffalo-chicken-tat...
4,8651121,Trout Tacos,"['16 white corn tortillas', '1/2 pound purple ...",['Preheat the oven to 400 degrees F (200 degre...,https://www.allrecipes.com/trout-tacos-recipe-...


In [18]:
df_data_recipes['df_final'] = df_data_recipes['Preparation Steps'].apply(
    lambda x: preprocess_text(x) if isinstance(x, str) else ""
)


In [58]:
df_data_recipes.head()

Unnamed: 0,Recipe ID,Recipe Name,Ingredients,Preparation Steps,URL,df_final,Preparation Steps Embeddings
0,8751351,Chicken Bacon Ranch Tater Tot Casserole,"['cooking spray', '1 (32 ounce) package frozen...",['Gather all ingredients. Preheat oven to 450 ...,https://www.allrecipes.com/chicken-bacon-ranch...,"[gather, ingredients, preheat, oven, 450, degr...","[-0.013500303, -0.07850488, -0.028869798, -0.0..."
1,8695137,Homemade Spaghetti-O's,['16 ounces ring-shaped pasta \xa0(Anelli Sici...,['Fill a large pot with lightly salted water a...,https://www.allrecipes.com/homemade-spaghetti-...,"[fill, large, pot, lightly, salted, water, bri...","[-0.025534445, 0.04242385, 0.0067807348, -0.01..."
2,8672792,Sloppy Joe Cornbread Casserole,"['cooking spray', '2 pounds ground chuck', '2 ...",['Gather all ingredients. Preheat the oven to ...,https://www.allrecipes.com/sloppy-joe-cornbrea...,"[gather, ingredients, preheat, oven, 350, degr...","[-0.06077512, -0.049491934, 0.019328661, -0.05..."
3,8672248,Buffalo Chicken Tater Tot Casserole,"['cooking spray', '1 (10.5 ounce) can cream of...",['Gather all ingredients. Preheat oven to 425 ...,https://www.allrecipes.com/buffalo-chicken-tat...,"[gather, ingredients, preheat, oven, 425, degr...","[-0.014817715, -0.09954384, -0.02537461, -0.08..."
4,8651121,Trout Tacos,"['16 white corn tortillas', '1/2 pound purple ...",['Preheat the oven to 400 degrees F (200 degre...,https://www.allrecipes.com/trout-tacos-recipe-...,"[preheat, oven, 400, degrees, f, 200, degrees,...","[-0.05037915, 0.0353466, -0.0022075884, -0.040..."


## Recuperación con ChromaDB

In [37]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# Cargar el modelo de embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [39]:
# Generar embeddings para la columna "Preparation Steps"
df_data_recipes['Preparation Steps Embeddings'] = df_data_recipes['Preparation Steps'].apply(
    lambda x: model.encode(x) if isinstance(x, str) else np.zeros((384,))
)

In [40]:
# Crear la lista de embeddings
embeddings = np.array(df_data_recipes['Preparation Steps Embeddings'].tolist())

In [71]:
def insert_documents_to_chromadb(df):
    """
    Inserta documentos en ChromaDB utilizando los datos del DataFrame.

    Args:
        df (pd.DataFrame): DataFrame que contiene las recetas y sus metadatos.

    Returns:
        None
    """
    # Generar IDs únicos para cada documento
    ids = df['Recipe ID'].astype(str).tolist()

    # Convertir las listas de palabras de 'df_final' a cadenas de texto
    documents = [" ".join(words) if isinstance(words, list) else "" for words in df['df_final']]

    # Generar metadatos a partir de las columnas relevantes
    metadatas = [
        {
            "Recipe ID": row['Recipe ID'],
            "Recipe Name": row['Recipe Name'],
            "Ingredients": row['Ingredients'],
            "Preparation Steps": row['Preparation Steps'],
            "URL": row['URL']
        }
        for _, row in df.iterrows()
    ]

    # Usar embeddings de la columna "Preparation Steps Embeddings"
    embeddings = np.array(df['Preparation Steps Embeddings'].tolist())

    # Verificar que los datos sean consistentes
    if len(ids) != len(documents) or len(ids) != len(embeddings) or len(ids) != len(metadatas):
        print("Error: Longitudes de datos no coinciden.")
        return

    # Insertar los documentos en la colección de ChromaDB
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeddings.tolist(),
        metadatas=metadatas
    )
    print(f"Insertados {len(documents)} documentos en ChromaDB.")


In [73]:
# Insertar los documentos en ChromaDB
insert_documents_to_chromadb(df_data_recipes)

Insert of existing embedding ID: 8751351
Insert of existing embedding ID: 8695137
Insert of existing embedding ID: 8672792
Insert of existing embedding ID: 8672248
Insert of existing embedding ID: 8651121
Insert of existing embedding ID: 8629517
Insert of existing embedding ID: 8624279
Insert of existing embedding ID: 8601280
Insert of existing embedding ID: 8580443
Insert of existing embedding ID: 8584575
Insert of existing embedding ID: 8558130
Insert of existing embedding ID: 212498
Insert of existing embedding ID: 7567945
Insert of existing embedding ID: 7558696
Insert of existing embedding ID: 7508957
Insert of existing embedding ID: 7485475
Insert of existing embedding ID: 83646
Insert of existing embedding ID: 158799
Insert of existing embedding ID: 8509102
Insert of existing embedding ID: 8508920
Insert of existing embedding ID: 255462
Insert of existing embedding ID: 245210
Insert of existing embedding ID: 215231
Insert of existing embedding ID: 268494
Insert of existing embed

Insertados 498 documentos en ChromaDB.


In [74]:
print(f"Tamaño de la colección: {collection.count()}")


Tamaño de la colección: 996


### Realizar Consultas


In [96]:
def search_chromadb(query, top_k=5):
    """
    Realiza una consulta en ChromaDB y devuelve los resultados en un DataFrame.

    Args:
        query (str): Texto de la consulta.
        top_k (int): Número de resultados más cercanos a devolver.

    Returns:
        pd.DataFrame: DataFrame con los resultados de la consulta.
    """
    # Generar embedding para la consulta
    query_embedding = model.encode(query)

    # Buscar los vectores más cercanos
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["metadatas", "distances"]
    )

    # Verificar la estructura de los resultados
    if not results.get("metadatas") or not results.get("distances"):
        print("Error: No se encontraron resultados o los resultados no contienen las claves necesarias.")
        print("Resultados devueltos:", results)
        return pd.DataFrame()

    # Formatear los resultados en un DataFrame
    result_list = []
    for metadata_list, distance_list in zip(results["metadatas"], results["distances"]):
        for metadata, distance in zip(metadata_list, distance_list):
            # Extraer valores de los metadatos
            recipe_id = metadata.get("Recipe ID", "N/A")
            recipe_name = metadata.get("Recipe Name", "N/A")
            ingredients = metadata.get("Ingredients", "N/A")
            preparation_steps = metadata.get("Preparation Steps", "N/A")
            url = metadata.get("URL", "N/A")

            # Agregar al resultado
            result_list.append({
                "Recipe ID": recipe_id,
                "Recipe Name": recipe_name,
                "Ingredients": ingredients,
                "Preparation Steps": preparation_steps,
                "URL": url,
                "Distance": distance
            })

    # Crear el DataFrame
    if not result_list:
        print("No se encontraron resultados.")
        return pd.DataFrame()

    df_results = pd.DataFrame(result_list)

    # Ordenar por la columna "Distance"
    if "Distance" in df_results.columns:
        df_results = df_results.sort_values(by="Distance", ascending=True)

    return df_results


In [97]:
query = "How to make spaghetti?"
df_results = search_chromadb(query, top_k=5)

if not df_results.empty:
    print(df_results)
else:
    print("No se encontraron resultados para la consulta.")


   Recipe ID                                  Recipe Name  \
0    8750088  Creamy Gochujang Spaghetti With Ground Beef   
1    8750088  Creamy Gochujang Spaghetti With Ground Beef   
2     222582                              Baked Spaghetti   
3     222582                              Baked Spaghetti   
4    8763564                Mongolian Ground Beef Noodles   

                                         Ingredients  \
0                                                N/A   
1  ['12 ounces spaghetti', '1 pound ground beef',...   
2                                                N/A   
3  ['1 (16 ounce) package spaghetti', '1 pound gr...   
4                                                N/A   

                                   Preparation Steps  \
0  ['Bring a large pot of salted water to a boil,...   
1  ['Bring a large pot of salted water to a boil,...   
2  ['Gather all ingredients. Preheat the oven to ...   
3  ['Gather all ingredients. Preheat the oven to ...   
4  ['Bring a lar

In [81]:
results = collection.query(
    query_embeddings=[model.encode(query)],
    n_results=5,
    include=["metadatas", "distances"]
)
print(results)


{'ids': [['429', '8750088', '405', '222582', '423']], 'embeddings': None, 'documents': None, 'uris': None, 'data': None, 'metadatas': [[{'Preparation Steps': "['Bring a large pot of salted water to a boil, and cook spaghetti, stirring occasionally, until tender with a bite, about 12 minutes. Scoop out 1/2 cup pasta water; set aside. Drain spaghetti.', 'Cook and stir ground beef, bell pepper, and onion together in a large skillet, until meat and vegetables are browned, 6 to 7 minutes. Add garlic and cook until fragrant, about 1 minute. Drain any excess grease and discard.', 'Add tomato paste and gochujang to meat, bring to a simmer, and cook, stirring constantly, about 3 to 4 minutes. Add heavy whipping cream, and cook and stir until fully incorporated. Season with salt and pepper.', 'Add cooked spaghetti to meat mixture; stir in Parmesan cheese. If sauce is too thick, add reserved pasta water, 2 tablespoons at a time, to thin to desired consistency.']", 'Recipe ID': 8750088, 'Recipe Na

In [86]:
print(f"Tamaño de la colección: {collection.count()}")


Tamaño de la colección: 996


In [87]:
collections = chroma_client.list_collections()
print(collections)


['recipes_preparation_collection']


In [88]:
results = collection.query(
    query_embeddings=[model.encode(query)],
    n_results=5,
    include=["metadatas", "distances"]
)
print("Resultados:", results)


Resultados: {'ids': [['429', '8750088', '405', '222582', '423']], 'embeddings': None, 'documents': None, 'uris': None, 'data': None, 'metadatas': [[{'Preparation Steps': "['Bring a large pot of salted water to a boil, and cook spaghetti, stirring occasionally, until tender with a bite, about 12 minutes. Scoop out 1/2 cup pasta water; set aside. Drain spaghetti.', 'Cook and stir ground beef, bell pepper, and onion together in a large skillet, until meat and vegetables are browned, 6 to 7 minutes. Add garlic and cook until fragrant, about 1 minute. Drain any excess grease and discard.', 'Add tomato paste and gochujang to meat, bring to a simmer, and cook, stirring constantly, about 3 to 4 minutes. Add heavy whipping cream, and cook and stir until fully incorporated. Season with salt and pepper.', 'Add cooked spaghetti to meat mixture; stir in Parmesan cheese. If sauce is too thick, add reserved pasta water, 2 tablespoons at a time, to thin to desired consistency.']", 'Recipe ID': 8750088