## Búsqueda Semántica con Azure Search API

### Importación de las librerías básicas necesarias

In [None]:
import json
import openai
import os
import sys
import time
import PyPDF2
import fnmatch
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ScoringProfile,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

### Obtención de variables desde el archivo .env

In [None]:
# Load secrets and config from .env file
load_dotenv("../.env")


def _mask_secret(secret):
    """Return a printable preview of *secret* that never reveals all of it.

    A missing (None or empty) value yields "<not set>" instead of raising
    TypeError on slicing. Values of 10 characters or fewer are fully masked,
    because showing the first 5 and last 5 characters would print the whole
    secret.
    """
    if not secret:
        return "<not set>"
    if len(secret) <= 10:
        return "*" * len(secret)
    return secret[:5] + '...' + secret[-5:]


# OpenAI API: module-level settings of the `openai` package, read from the environment
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL")
print(f"OpenAI API key: {_mask_secret(openai.api_key)}")
print(f"OpenAI API base: {openai.api_base}")
print(f"OpenAI API version: {openai.api_version}")
print(f"OpenAI API type: {openai.api_type}")

# Azure Search API: the endpoint URL is derived from the service name
search_service_name = os.getenv("SEARCH_SERVICE_NAME")
search_service_key = os.getenv("SEARCH_SERVICE_KEY")
search_index_name = os.getenv("SEARCH_INDEX_NAME")
search_endpoint = f"https://{search_service_name}.search.windows.net/"
search_vector_config_name = os.getenv("SEARCH_VECTOR_CONFIG_NAME")
search_semantic_config_name = os.getenv("SEARCH_SEMANTIC_CONFIG_NAME")
print(f"Azure Search service name: {search_service_name}")
print(f"Azure Search service key: {_mask_secret(search_service_key)}")
print(f"Azure Search index name: {search_index_name}")
print(f"Azure Search endpoint: {search_endpoint}")
print(f"Azure Search vector config name: {search_vector_config_name}")
print(f"Azure Search semantic config name: {search_semantic_config_name}")

# Other variables: locations of the PDF data and of the index schema file
data_directory = os.getenv("FILEPATH_TO_DATA")
index_schema = os.getenv("FILEPATH_TO_INDEX_SCHEMA")
print(f"Data directory: {data_directory}")
print(f"Index schema: {index_schema}")

### Creación de clase "Cliente" para la conexión con la API de Azure Search

In [None]:
# Instantiate a client
class CreateClient:
    """Hold the endpoint, API key and index name and build Azure Search clients from them."""

    def __init__(self, endpoint, key, index_name):
        """Store the connection settings and wrap *key* in an AzureKeyCredential."""
        self.endpoint, self.index_name, self.key = endpoint, index_name, key
        self.credentials = AzureKeyCredential(key)

    def create_search_client(self):
        """Return a SearchClient for the stored endpoint and index.

        Use this to upload docs to the index.
        """
        client_settings = {
            "endpoint": self.endpoint,
            "index_name": self.index_name,
            "credential": self.credentials,
        }
        return SearchClient(**client_settings)

    def create_admin_client(self):
        """Return a SearchIndexClient for the stored endpoint.

        This is used to create, manage, and delete an index.
        """
        admin_settings = {
            "endpoint": self.endpoint,
            "credential": self.credentials,
        }
        return SearchIndexClient(**admin_settings)

### Creación y carga del esquema de búsqueda

In [None]:
# Create Search Index
def create_schema_from_json_and_upload(index_name, admin_client):
    """Build the search index definition and create it through *admin_client*.

    Despite the name, no JSON file is read here: the fields, the semantic
    configuration and the CORS options are all defined in code below. The
    semantic configuration is named after the module-level
    ``search_semantic_config_name``.

    Args:
        index_name: Name given to the new index.
        admin_client: Object exposing ``create_index(index)``; the notebook
            passes the client returned by ``CreateClient.create_admin_client()``.

    Returns:
        Whatever ``admin_client.create_index`` returned (previously nothing
        was returned, so callers that ignore the result are unaffected).

    Raises:
        Exception: If ``create_index`` raises (the original error is chained
            as ``__cause__``) or returns a falsy result.
    """
    # Define CORS options for the index
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)

    # No scoring profiles are defined for this index
    scoring_profiles = []

    # Define the fields for the search index.
    # NOTE(review): SearchableField is meant for string fields; the non-string
    # `type` arguments below may be ignored by the SDK helper (the upload code
    # sends these values as strings) - confirm against the SDK version in use.
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        SearchableField(name="filename", type=SearchFieldDataType.String),
        SearchableField(name="author", type=SearchFieldDataType.String),
        SearchableField(name="created_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="last_modified_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="page_number", type=SearchFieldDataType.Int32),
        SearchableField(name="total_pages", type=SearchFieldDataType.Int32),
        SearchableField(name="content", type=SearchFieldDataType.String),
    ]

    # Semantic configuration: "filename" is the title, "content" the body
    semantic_config = SemanticConfiguration(
        name=search_semantic_config_name,
        prioritized_fields=PrioritizedFields(
            title_field=SemanticField(field_name="filename"),
            prioritized_content_fields=[SemanticField(field_name="content")]
        )
    )

    # Create the semantic settings with the configuration
    semantic_settings = SemanticSettings(configurations=[semantic_config])

    # Create the SearchIndex object
    index = SearchIndex(
        name=index_name,
        fields=fields,
        scoring_profiles=scoring_profiles,
        semantic_settings=semantic_settings,
        cors_options=cors_options,
    )

    # Only the service call sits inside the try, so the explicit failure below
    # is not re-wrapped, and KeyboardInterrupt/SystemExit are no longer caught.
    try:
        created_index = admin_client.create_index(index)
    except Exception as err:
        raise Exception("Unexpected Error. Schema upload failed.") from err

    if not created_index:
        raise Exception("Schema upload failed.")

    print("Schema uploaded successfully.")
    return created_index


### Conversión de fecha a formato ISO8601

In [None]:
def convert_date_to_iso8601(date):
    """Format *date* as an ISO 8601 UTC timestamp (``YYYY-MM-DDTHH:MM:SSZ``).

    Timezone-aware datetimes are converted to UTC first, so the trailing
    ``Z`` is truthful; naive values are formatted as-is. The conversion is
    deterministic and best-effort, so it is not wrapped in a retry: a value
    that cannot be formatted is reported and ``None`` is returned.

    Args:
        date: Object with a ``strftime`` method (e.g. ``datetime.datetime``).

    Returns:
        The formatted string, or ``None`` if *date* could not be converted.
    """
    from datetime import timezone  # local import keeps this cell self-contained

    try:
        # Normalize aware datetimes to UTC before stamping the "Z" suffix
        if getattr(date, "tzinfo", None) is not None:
            date = date.astimezone(timezone.utc)
        return date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except (AttributeError, TypeError, ValueError, OverflowError):
        print("Error converting date to ISO8601 format")
        return None

### Conversión y carga del contenido de PDFs al índice de búsqueda

In [None]:
# Convert PDFs page by page and upload them to the index
def convert_pdfs_and_upload_to_index(root_dir, client, batch_size=1000):
    """Extract the text of every PDF page under *root_dir* and upload it.

    One document is produced per page, with the id ``"<file#>_<page index>"``.
    Files without metadata, or whose creation/modification date cannot be
    converted by ``convert_date_to_iso8601``, are skipped entirely (the
    original skipped each of their pages one by one, with the same result).

    Args:
        root_dir: Directory walked recursively for ``*.pdf`` files.
        client: Object exposing ``upload_documents(documents=...)``; the
            notebook passes the SearchClient for the target index.
        batch_size: Maximum number of documents sent per upload call.
            Defaults to 1000, the documented per-request limit of the
            service (see references).

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Collect the PDF paths first so progress is reported against the number
    # of PDFs rather than against every file in the tree.
    pdf_paths = []
    for dirpath, _, files in os.walk(root_dir):
        for filename in fnmatch.filter(files, '*.pdf'):
            pdf_paths.append(os.path.join(dirpath, filename))
    total_files = len(pdf_paths)

    data_list = []  # Documents accumulated for upload

    for count, pdf_file in enumerate(pdf_paths, start=1):
        filename = os.path.basename(pdf_file)
        print(f"Processing {filename} {count}/{total_files}")

        with open(pdf_file, 'rb') as fileobj:
            pdf = PyPDF2.PdfReader(fileobj)
            info = pdf.metadata
            if info is None:
                # No document information available: nothing to read from
                print(f"Skipping {filename}: no document metadata")
                continue

            # The dates are per-file, so convert them once instead of per page
            created_date = convert_date_to_iso8601(info.creation_date)
            if not created_date:
                continue
            mod_date = convert_date_to_iso8601(info.modification_date)
            if not mod_date:
                continue

            author = info.author
            num_pages = len(pdf.pages)

            for i, page in enumerate(pdf.pages):
                data_list.append({
                    # Unique per page: file counter plus zero-based page index
                    "id": str(count) + "_" + str(i),
                    "filename": filename,
                    "author": author,
                    "page_number": str(i+1),
                    "total_pages": str(num_pages),
                    "content": page.extract_text(),
                    "created_date": created_date,
                    "last_modified_date": mod_date,
                })

    if not data_list:
        # Avoid sending an empty batch to the service
        print("No documents to upload.")
    else:
        for start in range(0, len(data_list), batch_size):
            client.upload_documents(documents=data_list[start:start + batch_size])
    print("Done!")  # Signal completion


### Configuración de "Cliente" para la búsqueda

In [None]:
# Build the client factory once, then derive both clients from it:
# search_client works on documents, admin_client manages the index itself.
base_client = CreateClient(
    endpoint=search_endpoint,
    key=search_service_key,
    index_name=search_index_name,
)
admin_client = base_client.create_admin_client()
search_client = base_client.create_search_client()

### Asignación de esquema de búsqueda y carga de contenido

In [None]:
# Create the index first, then fill it with the PDF pages found in data_directory
schema = create_schema_from_json_and_upload(
    index_name=search_index_name,
    admin_client=admin_client,
)
convert_pdfs_and_upload_to_index(root_dir=data_directory, client=search_client)
print("Upload complete")

### Búsqueda simple de texto

In [None]:
# Plain keyword query; only the filename and content fields are requested
results = search_client.search(
    search_text="cual utiliza tecnologia javascript y python?",
    query_type='simple',
    query_language='es-es',
    select='filename,content',
    include_total_count=True,
)

# Print each hit's filename next to its relevance score
for result in results:
    print(f"{result['filename']} -> {result['@search.score']}")

### Búsqueda semántica de texto

In [None]:
# Semantic query. The configuration name must match the one the index was
# created with, so reuse search_semantic_config_name instead of a literal.
results = search_client.search(
    query_type='semantic',
    query_language='es-es',
    semantic_configuration_name=search_semantic_config_name,
    search_text="cual utiliza tecnologia javascript y python?",
    select='filename, content',
    query_caption='extractive',
)

for result in results:
    print(result["@search.reranker_score"])
    print(result["filename"])

    # Show the first caption, preferring its highlighted form when present
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

### Eliminación de índice de búsqueda

In [None]:
# Clean up Azure resources: ask the admin client to delete the index named by
# search_index_name, then confirm on stdout.
admin_client.delete_index(search_index_name)
print("Index deleted")