# Ejemplo de código de búsqueda vectorial de Azure Cognitive Search con Azure OpenAI
Este código muestra cómo usar Azure Cognitive Search con OpenAI y el SDK de Azure para Python.
## Requisitos previos

In [1]:
#! pip install azure-core
#! pip install openai
#! pip install azure-search-documents --pre
#! pip install PyPDF2
#! pip install tenacity
#! pip install openai[datalib]
#! pip install python-dotenv

## Importar bibliotecas requeridas

In [2]:
import json
import openai
import os
import sys
import time
import PyPDF2
import fnmatch
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ScoringProfile,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

## Establecimiento de variables de entorno


In [3]:
# Load secrets and configuration from the .env file
load_dotenv("../.env")


def _mask_secret(secret, visible=5):
    """Return a printable, partially hidden form of a secret.

    Args:
        secret: The secret string, or None when the variable is not set.
        visible: Number of leading and trailing characters left readable.

    Returns:
        "<not set>" for a missing/empty secret, a fully starred string when the
        secret is too short to hide anything meaningful, otherwise the first and
        last `visible` characters joined by '...'.
    """
    if not secret:
        return "<not set>"
    if len(secret) <= 2 * visible:
        # Showing both ends of a short secret would reveal all of it
        return "*" * len(secret)
    return secret[:visible] + '...' + secret[-visible:]


# OpenAI API
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL")
print("OpenAI API key: {}".format(_mask_secret(openai.api_key)))
print("OpenAI API base: {}".format(openai.api_base))
print("OpenAI API version: {}".format(openai.api_version))
print("OpenAI API type: {}".format(openai.api_type))

# Azure Search API
search_service_name = os.getenv("SEARCH_SERVICE_NAME")
search_service_key = os.getenv("SEARCH_SERVICE_KEY")
search_index_name = os.getenv("SEARCH_INDEX_NAME")
search_endpoint = "https://{}.search.windows.net/".format(search_service_name)
search_vector_config_name = os.getenv("SEARCH_VECTOR_CONFIG_NAME")
search_semantic_config_name = os.getenv("SEARCH_SEMANTIC_CONFIG_NAME")
print("Azure Search service name: {}".format(search_service_name))
print("Azure Search service key: {}".format(_mask_secret(search_service_key)))
print("Azure Search index name: {}".format(search_index_name))
print("Azure Search endpoint: {}".format(search_endpoint))
print("Azure Search vector config name: {}".format(search_vector_config_name))
print("Azure Search semantic config name: {}".format(search_semantic_config_name))

# Other variables
data_directory = os.getenv("FILEPATH_TO_DATA")
index_schema = os.getenv("FILEPATH_TO_INDEX_SCHEMA")
print("Data directory: {}".format(data_directory))
print("Index schema: {}".format(index_schema))

# Fail here with a readable message instead of later with an opaque SDK error.
# Only the values this cell itself depends on (the two keys and the service
# name used to build the endpoint) are treated as mandatory.
_missing_settings = [
    name
    for name, value in (
        ("OPENAI_API_KEY", openai.api_key),
        ("SEARCH_SERVICE_NAME", search_service_name),
        ("SEARCH_SERVICE_KEY", search_service_key),
    )
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

OpenAI API key: 1c7f2...3613f
OpenAI API base: https://wsl-openai-canada.openai.azure.com/
OpenAI API version: 2023-03-15-preview
OpenAI API type: azure
Azure Search service name: wsl-cog-search-test-2
Azure Search service key: 9GUM9...z1hrb
Azure Search index name: test-index-10
Azure Search endpoint: https://wsl-cog-search-test-2.search.windows.net/
Azure Search vector config name: test-search-vector-config
Azure Search semantic config name: test-search-semantic-config
Data directory: ../data
Index schema: ./index_schema.json


### Clase para creación de Clientes para Azure Search

In [4]:
class CreateClient(object):
    """Factory for Azure Cognitive Search clients that share one endpoint and key.

    Attributes are stored as given; `credentials` wraps `key` in an
    AzureKeyCredential that both client kinds reuse.
    """

    def __init__(self, endpoint, key, index_name):
        """Remember the connection settings and build the shared credential.

        Args:
            endpoint: URL of the search service.
            key: API key for the search service.
            index_name: Name of the index the search client is bound to.
        """
        self.endpoint, self.index_name, self.key = endpoint, index_name, key
        self.credentials = AzureKeyCredential(key)

    def create_search_client(self):
        """Return a SearchClient bound to `index_name`.

        Use this client to upload documents to the index.
        """
        client_settings = {
            "endpoint": self.endpoint,
            "index_name": self.index_name,
            "credential": self.credentials,
        }
        return SearchClient(**client_settings)

    def create_admin_client(self):
        """Return a SearchIndexClient.

        Use this client to create, manage, and delete an index.
        """
        admin = SearchIndexClient(endpoint=self.endpoint, credential=self.credentials)
        return admin

### Método para creación y carga del "Index" de búsqueda
Un "index" de búsqueda es similar a una tabla de base de datos que contiene los datos en los que se puede realizar una búsqueda.

In [5]:
def create_schema_from_json_and_upload(index_name, admin_client):
    """Define the search index in code and create or update it in Azure.

    Despite the name, no JSON file is read here: the field list, vector search
    settings, and semantic settings are all built inline. The vector and
    semantic configuration names come from the module-level variables
    `search_vector_config_name` and `search_semantic_config_name`.

    Args:
        index_name: Name of the index to create or update.
        admin_client: Client exposing `create_or_update_index(index)`.

    Returns:
        The value returned by `admin_client.create_or_update_index`.

    Raises:
        Exception: If the service call fails (the original error is chained as
            `__cause__`) or if it returns a falsy result.
    """
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    scoring_profiles = []

    # Configure the data fields of the index.
    # NOTE(review): the SearchableField helper appears to build string fields
    # regardless of the `type` argument, so the DateTimeOffset/Int32 types below
    # may be ignored by the SDK - confirm against the installed
    # azure-search-documents version before relying on typed filtering/sorting.
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        SearchableField(name="filename", type=SearchFieldDataType.String),
        SearchableField(name="author", type=SearchFieldDataType.String),
        SearchableField(name="created_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="last_modified_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="page_number", type=SearchFieldDataType.Int32),
        SearchableField(name="total_pages", type=SearchFieldDataType.Int32),
        SearchableField(name="content", type=SearchFieldDataType.String),
        # vector_search_dimensions must equal the length of the vectors uploaded
        # in the "embeddings" field (presumably the embedding model's output
        # size - TODO confirm for the configured model).
        SearchField(name="embeddings", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_configuration=search_vector_config_name),
    ]

    # Vector search settings: a single HNSW configuration using cosine distance
    vector_search = VectorSearch(
        algorithm_configurations=[
            HnswVectorSearchAlgorithmConfiguration(
                name=search_vector_config_name,
                kind="hnsw",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ]
    )

    # Semantic settings: filename as the title field, content as the content field
    semantic_config = SemanticConfiguration(
        name=search_semantic_config_name,
        prioritized_fields=PrioritizedFields(
            title_field=SemanticField(field_name="filename"),
            prioritized_content_fields=[SemanticField(field_name="content")]
        )
    )
    semantic_settings = SemanticSettings(configurations=[semantic_config])

    # Assemble the search index definition
    index = SearchIndex(
        name=index_name,
        fields=fields,
        scoring_profiles=scoring_profiles,
        semantic_settings=semantic_settings,
        vector_search=vector_search,
        cors_options=cors_options,
    )

    # Create the index in Azure. Only the service call is guarded, and the
    # original error is chained so the real cause stays visible in the traceback.
    try:
        upload_schema = admin_client.create_or_update_index(index)
    except Exception as exc:
        raise Exception("Unexpected Error. Schema upload failed.") from exc

    if not upload_schema:
        raise Exception("Schema upload failed.")

    print("Schema uploaded successfully.")
    return upload_schema

### Métodos para generación de embeddings y conversión de fechas

In [6]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(5))
def generate_embeddings(text):
    """Return the embedding vector for `text` from the configured embedding model.

    The request goes through `openai.Embedding.create` using the module-level
    `embedding_model` as the engine. The tenacity decorator retries a failing
    call, stopping after 5 attempts, with a randomized exponential wait
    configured with min=1 and max=20.

    Args:
        text: Input passed to the embeddings endpoint.

    Returns:
        The 'embedding' entry of the first item in the response's "data" list.
    """
    api_response = openai.Embedding.create(engine=embedding_model, input=text)
    first_item = api_response["data"][0]
    return first_item['embedding']

def convert_date_to_iso8601(date):
    """Format a date/datetime as an ISO 8601 UTC string ("YYYY-MM-DDTHH:MM:SSZ").

    Timezone-aware datetimes are converted to UTC first, so the trailing "Z"
    is truthful. Naive values carry no offset to convert from and are formatted
    as-is (i.e. they are assumed to already be in UTC).

    Args:
        date: A datetime-like object exposing `strftime`, or None.

    Returns:
        The formatted string, or None when `date` cannot be formatted (for
        example when it is None); a message is printed in that case.
    """
    # Local import: the notebook's import cell does not bring in datetime
    from datetime import timezone

    try:
        tzinfo = getattr(date, "tzinfo", None)
        if tzinfo is not None and date.utcoffset() is not None:
            # Normalize to UTC; formatting local time with a literal "Z" would
            # silently shift the instant by the original offset.
            date = date.astimezone(timezone.utc)
        date = date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception:
        # Best-effort conversion: report and signal failure with None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print("Error converting date to ISO8601 format")
        date = None
    return date

### Método para convertir un conjunto de archivos PDF y subirlos al index

In [7]:
def convert_pdfs_and_upload_to_index(root_dir, client, batch_size=1000):
    """Turn every page of every PDF under `root_dir` into a search document and upload it.

    For each PDF found by walking `root_dir`, one document is built per page,
    containing the page text, its embedding, and file-level metadata. Files
    whose creation or modification date cannot be converted to ISO 8601 are
    skipped entirely, and pages whose embedding cannot be generated are skipped
    individually. Documents are uploaded in batches of at most `batch_size`.

    Args:
        root_dir: Directory that is searched recursively for '*.pdf' files.
        client: Object exposing `upload_documents(documents=...)`.
        batch_size: Maximum number of documents per upload call. The default of
            1000 matches the per-request document limit of Azure Cognitive Search.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Collect the PDF paths up front so the progress total counts PDFs only,
    # not every file that happens to live under root_dir.
    pdf_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, files in os.walk(root_dir)
        for filename in fnmatch.filter(files, '*.pdf')
    ]
    total_files = len(pdf_paths)
    data_list = []

    for count, pdf_file in enumerate(pdf_paths, start=1):
        filename = os.path.basename(pdf_file)
        print(f"Processing {filename} {count}/{total_files}")

        # Read the PDF
        with open(pdf_file, 'rb') as fileobj:
            pdf = PyPDF2.PdfReader(fileobj)

            # A PDF may carry no metadata at all; treat that as "no author, no dates"
            info = pdf.metadata
            author = info.author if info is not None else None
            raw_created = info.creation_date if info is not None else None
            raw_modified = info.modification_date if info is not None else None

            # Dates are per file, so validate them once, before paying for any
            # embedding calls on pages that would be discarded anyway.
            created_date = convert_date_to_iso8601(raw_created)
            mod_date = convert_date_to_iso8601(raw_modified) if created_date else None
            if not created_date or not mod_date:
                print(f"Skipping {pdf_file}: missing or invalid creation/modification date")
                continue

            num_pages = len(pdf.pages)

            # Walk through every page of the PDF
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()

                try:
                    embeddings = generate_embeddings(text)
                except Exception:
                    print(f"Error generating embeddings for {pdf_file}")
                    continue

                # One document per page; the id combines the file counter and
                # the zero-based page index, so it is unique within a run.
                data_list.append({
                    "id": str(count) + "_" + str(i),
                    "filename": filename,
                    "author": author,
                    # Page numbers are sent as strings, as in the original notebook
                    "page_number": str(i + 1),
                    "total_pages": str(num_pages),
                    "content": text,
                    "embeddings": embeddings,
                    "created_date": created_date,
                    "last_modified_date": mod_date,
                })

    # Upload the data to Azure in bounded batches; nothing is sent when no
    # document was produced.
    if not data_list:
        print("No documents to upload.")
    for start in range(0, len(data_list), batch_size):
        client.upload_documents(documents=data_list[start:start + batch_size])
    print("Done!")

### Método para realizar una búsqueda vectorial

In [8]:
def perform_vector_search(search_client, user_query, k):
    """Run a pure vector query against the index and print each hit.

    The query text is embedded with `generate_embeddings` and matched against
    the "embeddings" field; no keyword search text is sent.

    Args:
        search_client: Client exposing `search(...)`.
        user_query: Text to embed and search for.
        k: Value passed as `k` to the Vector query object.
    """
    # Embed the user's query and wrap it as a vector query on the embeddings field
    query_embedding = generate_embeddings(user_query)
    query_vector = Vector(value=query_embedding, k=k, fields="embeddings")

    selected_fields = ["filename", "content", "page_number", "total_pages"]
    hits = search_client.search(
        search_text=None,
        vectors=[query_vector],
        select=selected_fields,
    )

    # Show the search results (content is selected but intentionally not printed here)
    for hit in hits:
        print(f"Filename: {hit['filename']}")
        print(f"Score: {hit['@search.score']}")
        print(f"page_number: {hit['page_number']}")
        print(f"total_pages: {hit['total_pages']}")

### Creación de Clientes de Búsqueda de Cognitive Search

In [9]:
# Build both Azure Cognitive Search clients from the shared endpoint, key, and index name
base_client = CreateClient(
    endpoint=search_endpoint,
    key=search_service_key,
    index_name=search_index_name,
)
search_client = base_client.create_search_client()  # bound to the index
admin_client = base_client.create_admin_client()  # index-level operations

### Convertir PDFs y Subirlos al Index

In [10]:
# Create the index in Azure
schema = create_schema_from_json_and_upload(search_index_name, admin_client)
# Load the data into Azure
convert_pdfs_and_upload_to_index(data_directory, search_client)
print("Upload complete")

Schema uploaded successfully.
Processing DE03234-23.pdf 1/159
Processing DE03276-23.pdf 2/159
Processing DE03279-23.pdf 3/159
Processing DE03280-23.pdf 4/159
Processing DE03307-23.pdf 5/159
Processing DE03308-23.pdf 6/159
Processing DE03327-23.pdf 7/159
Processing DE03336-23.pdf 8/159
Processing DE03347-23.pdf 9/159
Processing DE03348-23.pdf 10/159
Processing DE03360-23.pdf 11/159
Processing DE03363-23.pdf 12/159
Processing DE03367-23.pdf 13/159
Processing DE03368-23.pdf 14/159
Processing DE03382-23.pdf 15/159
Processing DE03383-23.pdf 16/159
Processing DE03392-23.pdf 17/159
Processing DE03401-23.pdf 18/159
Processing DE03404-23.pdf 19/159
Processing DE03407-23.pdf 20/159
Processing DE03409-23.pdf 21/159
Processing DE03425-23.pdf 22/159
Processing DE03427-23.pdf 23/159
Processing DE03428-23.pdf 24/159
Processing DE03430-23.pdf 25/159
Processing DE03433-23.pdf 26/159
Processing DE03439-23.pdf 27/159
Processing DE03440-23.pdf 28/159
Processing DE03441-23.pdf 29/159
Processing DE03450-23.

### Realizar una Búsqueda Vectorial

In [11]:
# Vector search: embed the query text and print the hits, passing k=3
perform_vector_search(search_client=search_client,
                      user_query="2021",
                      k=3)

Filename: DE05054-23.pdf
Score: 0.8320727
page_number: 9
total_pages: 12
Filename: DE05054-23.pdf
Score: 0.8285331
page_number: 12
total_pages: 12
Filename: DE05054-23.pdf
Score: 0.8277373
page_number: 10
total_pages: 12


### Limpiar index utilizado en Cognitive Search

In [12]:
# # Clean up Azure resources
# admin_client.delete_index(search_index_name)
# print("Index deleted")

Index deleted
