# RAG con Azure AI Search

# Librerías


In [None]:
import gc
import json
import os
import pandas as pd
import pickle

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from datasets import load_dataset
from dotenv import load_dotenv
from math import ceil
from openai import AzureOpenAI
from tqdm import tqdm

# Variables de entorno

In [None]:
# Read settings from the local .env file; override=True lets the file win over
# variables that are already set in the process environment.
load_dotenv(override=True)

# --- Azure AI Search settings -------------------------------------------
az_search_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT_URL")
az_search_index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
az_search_api_key = os.environ.get("AZURE_SEARCH_API_KEY")

# --- Azure OpenAI settings (embeddings) ---------------------------------
az_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT_URL")
az_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
az_openai_embeddings_deployment_name = os.environ.get(
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"
)

# Client shared by every cell that needs embeddings
az_openai_client = AzureOpenAI(
    api_version="2024-05-01-preview",
    api_key=az_openai_api_key,
    azure_endpoint=az_openai_endpoint,
)

# --- Local files ---------------------------------------------------------
# Cached copy of the dataset and how many records to keep from it
dataset_file = "recetas_de_la_abuela_dataset.json"
dataset_records = 1000

# JSONL file that will hold the records merged with their embeddings
output_file = "merged_output.jsonl"

# Descargamos el dataset

In [None]:
# Download only once: later runs reuse the local JSON copy.
if not os.path.exists(dataset_file):
    # Load the dataset from Hugging Face
    dataset = load_dataset("somosnlp/RecetasDeLaAbuela", "version_1")

    # Take only the train split
    dataset_train = dataset["train"]

    # Keep the first `dataset_records` rows, clamped to the split size so that
    # select() is never handed an out-of-range index.
    dataset_train_small = dataset_train.select(
        range(min(dataset_records, len(dataset_train)))
    )

    # Convert each sample to a plain dictionary so it can be serialised
    dataset_train_list = [dict(sample) for sample in dataset_train_small]

    # Save as a proper JSON array
    with open(dataset_file, "w", encoding="utf-8") as f:
        json.dump(dataset_train_list, f, ensure_ascii=False, indent=4)
    # Report the number of records actually written, not the number requested
    print(f"Dataset with {len(dataset_train_list)} records saved to {dataset_file}")
else:
    print(f"Dataset already downloaded as {dataset_file}")

# Cargamos el dataset en un dataframe

In [None]:
# Load the cached JSON array into a dataframe
df = pd.read_json(dataset_file)

# Quick overview of the dataset.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()
print(df.head())

## Missing values

In [None]:
# Per-column count of null entries
missing_values_summary = df.isna().sum()

# Last expression of the cell: displayed by the notebook
missing_values_summary

## Seleccionamos y renombramos columnas

In [None]:
selected_columns = ["Id", "Nombre", "URL", "Ingredientes", "Pasos", "Valor nutricional"]

# Keep only the columns of interest; column names are lower-cased and their
# spaces replaced by underscores in a single renaming pass.
df_subset = df[selected_columns].rename(
    columns=lambda name: name.lower().replace(" ", "_")
)
# Release the source dataframe from memory
del df
df_subset.head()

## Procesamos columnas

In [None]:
def _count_ingredients(value):
    """Return the number of comma-separated pieces in *value* (0 for non-strings)."""
    if not isinstance(value, str):
        return 0
    return len(value.split(','))


# Derived column: number of comma-separated pieces in the ingredients text
df_subset['numero_ingredientes'] = df_subset['ingredientes'].apply(_count_ingredients)

# The id column is converted to string
df_subset['id'] = df_subset['id'].astype(str)
df_subset.head()

## Vectorizamos campos de interés

In [8]:
# Number of rows sent per embeddings request and stored per chunk file
BATCH_SIZE = 200

# Directory to store embedding files (created on first use)
TMP_DIR = "tmp"
os.makedirs(TMP_DIR, exist_ok=True)

# Text columns to embed, and the suffix that names their vector counterparts
fields_to_vectorize = ["nombre", "pasos"]
vector_field_name_suffix = "_vector"

# e.g. "nombre" -> "nombre_vector"
vector_fields = [name + vector_field_name_suffix for name in fields_to_vectorize]

def batch_compute_embeddings(
    texts,
    client=az_openai_client,
    model_name=az_openai_embeddings_deployment_name
):
    """
    Request embeddings for a list of strings in a single call.

    texts: list of strings to embed.
    client: object exposing `embeddings.create(input=..., model=...)`.
    model_name: value passed as `model` to the embeddings call.

    Returns a list with the `embedding` attribute of every item in the
    response's `data`, in the order the response lists them.
    """
    embeddings = []
    for entry in client.embeddings.create(input=texts, model=model_name).data:
        embeddings.append(entry.embedding)
    return embeddings


def _dump_pickle_atomically(obj, path):
    """Pickle *obj* to *path* through a temporary file, so *path* never holds a partial write."""
    partial_path = f"{path}.part"
    with open(partial_path, "wb") as f:
        pickle.dump(obj, f)
    # os.replace swaps the finished file into place in one step
    os.replace(partial_path, path)


def vectorize_column_in_chunks(df, column_name, batch_size=BATCH_SIZE, tmp_dir=TMP_DIR):
    """
    Compute (or resume computing) the embeddings of one dataframe column.

    1. If a *completed* file for this column exists in tmp_dir
       (tmp_dir/{column_name}_embeddings.pkl), load it and return its content
       without computing anything.
    2. Otherwise look for chunk files tmp_dir/{column_name}/chunk_{i}.pkl and
       compute only the chunks that are missing.
    3. Merge all chunk files, in chunk order, into the completed file and
       return the merged list. The chunk files are kept on disk.

    df: DataFrame containing `column_name`.
    column_name: name of the text column to embed; null values are embedded
        as empty strings.
    batch_size: number of rows per embeddings request / chunk file.
    tmp_dir: root directory of the on-disk cache.

    Returns a list with one embedding per row of `df`.

    NOTE: the cache is keyed on file names only. If the data or `batch_size`
    changes, remove the cached files first; existing files are reused as-is.
    NOTE: the .pkl files are unpickled without validation, so they must only
    ever come from this notebook's own earlier runs.
    """

    # 1) If a fully completed embeddings file exists, skip everything:
    completed_file = os.path.join(tmp_dir, f"{column_name}_embeddings.pkl")
    if os.path.exists(completed_file):
        print(f"[SKIP] Found existing completed file for '{column_name}': {completed_file}")
        with open(completed_file, "rb") as f:
            return pickle.load(f)

    # 2) Otherwise, we need to generate (or resume) chunked files in tmp_dir/{column_name}/
    col_dir = os.path.join(tmp_dir, column_name)
    os.makedirs(col_dir, exist_ok=True)

    # Figure out how many total chunks we need
    n = len(df)
    total_chunks = (n + batch_size - 1) // batch_size  # ceiling division

    # Collect the indices of the chunks already on disk.
    # Temporary "*.part" files do not end in ".pkl" and are therefore ignored.
    prefix, suffix = "chunk_", ".pkl"
    existing_chunks = set()
    for fname in os.listdir(col_dir):
        if fname.startswith(prefix) and fname.endswith(suffix):
            # e.g. "chunk_3.pkl" -> chunk index = 3
            try:
                existing_chunks.add(int(fname[len(prefix):-len(suffix)]))
            except ValueError:
                # Not one of our chunk files; leave it alone
                continue

    # Compute and store every chunk that is still missing
    print(f"[COMPUTE] Embeddings for '{column_name}' (missing chunks).")
    for i in tqdm(range(total_chunks), desc=f"Vectorizing '{column_name}'"):
        if i in existing_chunks:
            # Skip if this chunk file already exists
            continue

        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, n)

        batch_texts = df[column_name].iloc[start_idx:end_idx].fillna("").tolist()
        batch_embeddings = batch_compute_embeddings(batch_texts)

        # Written atomically so an interrupted run cannot leave a truncated
        # chunk that a later run would mistake for a finished one.
        _dump_pickle_atomically(batch_embeddings, os.path.join(col_dir, f"chunk_{i}.pkl"))

    # 3) Merge the chunks in order and persist the completed file
    all_embeddings = []
    for i in range(total_chunks):
        with open(os.path.join(col_dir, f"chunk_{i}.pkl"), "rb") as f:
            all_embeddings.extend(pickle.load(f))

    _dump_pickle_atomically(all_embeddings, completed_file)
    return all_embeddings

def run_vectorization(df, df_fields_to_vectorize=fields_to_vectorize):
    """
    Vectorize every requested column of `df`, one after another.

    df: DataFrame holding the columns to embed.
    df_fields_to_vectorize: column names handed, in order, to
        vectorize_column_in_chunks.
    """
    for position, field in enumerate(df_fields_to_vectorize, start=1):
        # Progress header: 1-based position within the field list
        print(f"\nProcessing field {position}/{len(df_fields_to_vectorize)}: {field}")
        vectorize_column_in_chunks(df, field)
    print("\nAll columns processed!")

run_vectorization(df_subset, df_fields_to_vectorize=fields_to_vectorize)

NameError: name 'df_subset' is not defined

## Salvamos dataset a fichero

In [None]:
def create_json_chunked(
    df,
    column_to_dir_map,
    output_json,
    batch_size=BATCH_SIZE
):
    """
    Write `df` to a JSON Lines file, attaching the chunked embeddings to each row.

    df: DataFrame with the 'regular' columns.
    column_to_dir_map: dict of { "vec_column_name": "path/to/chunked_embeddings_dir" }
                       Example:
                          {
                            "nombre_vector": "tmp/nombre",
                            "ingredientes_vector": "tmp/ingredientes",
                            ...
                          }
                       Each directory must contain chunk_{i}.pkl files.
    output_json: File path for the final .jsonl output (overwritten).
    batch_size: Number of rows/embeddings per chunk. It must match the size
                used when the chunk files were written.

    Raises:
        FileNotFoundError: a required chunk file does not exist.
        ValueError: a chunk file does not hold exactly one embedding per row
                    of the matching dataframe slice.
    """
    total_rows = len(df)
    total_chunks = ceil(total_rows / batch_size)

    with open(output_json, "w", encoding="utf-8") as f_out:
        for chunk_idx in tqdm(range(total_chunks), desc="Exporting rows"):
            start_idx = chunk_idx * batch_size
            end_idx = min(start_idx + batch_size, total_rows)

            # Rows of this slice as a list of dicts (records)
            records = df.iloc[start_idx:end_idx].to_dict("records")

            # Load each embedding chunk for this slice
            chunk_embeddings_map = {}
            for vec_col, col_dir in column_to_dir_map.items():
                chunk_file = os.path.join(col_dir, f"chunk_{chunk_idx}.pkl")
                if not os.path.exists(chunk_file):
                    raise FileNotFoundError(f"Chunk file not found: {chunk_file}")

                # NOTE: unpickled without validation; only load locally produced files
                with open(chunk_file, "rb") as emb_f:
                    embeddings_chunk = pickle.load(emb_f)

                # A size mismatch means the chunk was produced with another
                # batch size or dataset; pairing by position would then attach
                # vectors to the wrong rows, so stop instead.
                if len(embeddings_chunk) != len(records):
                    raise ValueError(
                        f"Chunk file {chunk_file} holds {len(embeddings_chunk)} embeddings "
                        f"but rows {start_idx}-{end_idx - 1} contain {len(records)} records"
                    )

                chunk_embeddings_map[vec_col] = embeddings_chunk

            # local_offset is the index of the row within this chunk
            for local_offset, row_dict in enumerate(records):
                # Attach embedding vectors
                for vec_col, emb_list in chunk_embeddings_map.items():
                    row_dict[vec_col] = emb_list[local_offset]

                # Write JSON line
                f_out.write(json.dumps(row_dict, ensure_ascii=False) + "\n")

            # Release memory for this chunk
            del chunk_embeddings_map
            del records

            # (Optional) Collect garbage every few chunks
            if chunk_idx % 10 == 0:
                gc.collect()

    print(f"Finished exporting to {output_json}")


# Map every "<field>_vector" output column to the directory holding its chunked
# embeddings. Iterating fields_to_vectorize (rather than the directory listing)
# keeps the column order deterministic; only fields whose chunk directory
# exists under TMP_DIR are included.
vector_column_and_files = {
    field + vector_field_name_suffix: os.path.join(TMP_DIR, field)
    for field in fields_to_vectorize
    if os.path.isdir(os.path.join(TMP_DIR, field))
}
print(f"Embeddings in {vector_column_and_files}")
# Reuse BATCH_SIZE so the exported slices line up with the chunk files written
# during vectorization instead of relying on a duplicated literal.
create_json_chunked(df_subset, vector_column_and_files, output_file, batch_size=BATCH_SIZE)

## Cargamos datos en Azure AI Search

In [12]:
# Initialize client
search_client = SearchClient(
    endpoint=az_search_endpoint,
    index_name=az_search_index_name,
    credential=AzureKeyCredential(az_search_api_key),
)

# Load the documents from a jsonl file (blank lines, if any, are skipped)
print(f"Loading documents from {output_file}...")
with open(output_file, "r", encoding="utf-8") as f:
    documents = [json.loads(line) for line in f if line.strip()]

# Upload the documents to the index in blocks
batch_size = 300
print(f"Uploading {len(documents)} documents in batches of {batch_size}...")
for i in range(0, len(documents), batch_size):
    print(f"Uploading batch {i // batch_size + 1}...")
    # Named `batch` so the Hugging Face `dataset` variable is not overwritten
    batch = documents[i:i + batch_size]
    result = search_client.upload_documents(documents=batch)

    # upload_documents returns one indexing result per document; surface the
    # rejected ones instead of discarding the outcome silently.
    failed = [item for item in result if not item.succeeded]
    if failed:
        print(
            f"  {len(failed)} of {len(batch)} documents were not indexed; "
            f"first failure: key={failed[0].key} error={failed[0].error_message}"
        )

Loading documents from merged_output.jsonl...
Uploading 5000 documents in batches of 300...
Uploading batch 1...
Uploading batch 2...
Uploading batch 3...
Uploading batch 4...
Uploading batch 5...
Uploading batch 6...
Uploading batch 7...
Uploading batch 8...
Uploading batch 9...
Uploading batch 10...
Uploading batch 11...
Uploading batch 12...
Uploading batch 13...
Uploading batch 14...


HttpResponseError: () Storage quota has been exceeded for this service. You must either delete documents first, or use a higher SKU for additional quota.
Code: 
Message: Storage quota has been exceeded for this service. You must either delete documents first, or use a higher SKU for additional quota.

## Buscamos en esos datos

In [None]:
def vector_search(
    query: str,
    search_client=search_client,
    az_openai_client=az_openai_client,
    embedding_model_name=az_openai_embeddings_deployment_name,
    max_docs=3,
    vector_fields=vector_fields,
    select_fields=("nombre", "url", "pasos", "valor_nutricional"),
):
    """
    Run a pure vector search for `query` against the search index.

    query: text to search for; it is embedded with `embedding_model_name`.
    search_client: client whose `search` method is called.
    az_openai_client: client used to embed the query.
    embedding_model_name: value passed as `model` to the embeddings call.
    max_docs: used both as `k_nearest_neighbors` and as `top`.
    vector_fields: names of the vector fields to search (joined with commas).
    select_fields: field names to return. An iterable of names, a single
        comma-separated string, or None (forwarded as None).

    Returns whatever `search_client.search` returns (an iterable of results).
    """
    # Embed the query text
    query_embeddings = az_openai_client.embeddings.create(
        input=query, model=embedding_model_name
    ).data[0].embedding
    vector_query = VectorizedQuery(
        vector=query_embeddings,
        k_nearest_neighbors=max_docs,
        fields=",".join(vector_fields),
    )

    # Forward `select` as a real list of field names; strings and None pass
    # through unchanged so existing callers keep working.
    if select_fields is None or isinstance(select_fields, str):
        select = select_fields
    else:
        select = list(select_fields)

    results = search_client.search(
        search_text=None,  # vector-only query
        vector_queries=[vector_query],
        select=select,
        top=max_docs,
    )

    return results

query = "lomo de cerdo"
results = vector_search(query, max_docs=1)

for result in results:
    print(result['nombre'])


ENROLLADO DE CERDO RELLENO CON TACU TACU
