# Ejemplo de código de creación de índice con Azure Cognitive Search
Este código muestra cómo usar Azure Cognitive Search y el SDK de Azure para Python para crear un índice.

## 1.- Setup inicial

### 1.1- Instalar librerías

In [1]:
#! pip install azure-core
#! pip install openai
#! pip install azure-search-documents --pre
#! pip install PyPDF2
#! pip install tenacity
#! pip install openai[datalib]
#! pip install python-dotenv

### 1.2.- Cargar librerías

In [2]:
import json
import openai
import os
import sys
import time
import PyPDF2
import fnmatch
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ScoringProfile,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

### 1.3.- Cargar variables de entorno

In [3]:
# Load secrets and config from .env file
load_dotenv()


def _mask_secret(value):
    """Return a redacted form of a secret that is safe to print.

    Long values show only their first and last five characters. Unset values
    and values too short to redact partially are never echoed.
    """
    if not value:
        return "<not set>"
    if len(value) <= 10:
        # Head + tail would reveal the entire secret, so show nothing of it.
        return "..."
    return value[:5] + '...' + value[-5:]


# OpenAI API
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL")
print("OpenAI API key: {}".format(_mask_secret(openai.api_key)))
print("OpenAI API base: {}".format(openai.api_base))
print("OpenAI API version: {}".format(openai.api_version))
print("OpenAI API type: {}".format(openai.api_type))

# Azure Search API
search_service_name = os.getenv("SEARCH_SERVICE_NAME")
search_service_key = os.getenv("SEARCH_SERVICE_KEY")
search_index_name = os.getenv("SEARCH_INDEX_NAME")
search_endpoint = "https://{}.search.windows.net/".format(search_service_name)
search_vector_config_name = os.getenv("SEARCH_VECTOR_CONFIG_NAME")
search_semantic_config_name = os.getenv("SEARCH_SEMANTIC_CONFIG_NAME")
print("Azure Search service name: {}".format(search_service_name))
print("Azure Search service key: {}".format(_mask_secret(search_service_key)))
print("Azure Search index name: {}".format(search_index_name))
print("Azure Search endpoint: {}".format(search_endpoint))
print("Azure Search vector config name: {}".format(search_vector_config_name))
print("Azure Search semantic config name: {}".format(search_semantic_config_name))

# Other variables
data_directory = os.getenv("FILEPATH_TO_DATA")
index_schema = os.getenv("FILEPATH_TO_INDEX_SCHEMA")
print("Data directory: {}".format(data_directory))
print("Index schema: {}".format(index_schema))

# Report unset variables here instead of failing later with an opaque error
# (os.getenv returns None for them, which e.g. yields a bogus endpoint URL).
_missing_settings = [
    name
    for name, value in (
        ("OPENAI_API_BASE", openai.api_base),
        ("OPENAI_API_KEY", openai.api_key),
        ("OPENAI_EMBEDDING_MODEL", embedding_model),
        ("SEARCH_SERVICE_NAME", search_service_name),
        ("SEARCH_SERVICE_KEY", search_service_key),
        ("SEARCH_INDEX_NAME", search_index_name),
        ("SEARCH_VECTOR_CONFIG_NAME", search_vector_config_name),
        ("SEARCH_SEMANTIC_CONFIG_NAME", search_semantic_config_name),
        ("FILEPATH_TO_DATA", data_directory),
    )
    if not value
]
if _missing_settings:
    print("WARNING: missing environment variables: {}".format(", ".join(_missing_settings)))

OpenAI API key: 1c7f2...3613f
OpenAI API base: https://wsl-openai-canada.openai.azure.com/
OpenAI API version: 2023-03-15-preview
OpenAI API type: azure
Azure Search service name: wsl-cog-search-test-2
Azure Search service key: 9GUM9...z1hrb
Azure Search index name: cogsrch-index
Azure Search endpoint: https://wsl-cog-search-test-2.search.windows.net/
Azure Search vector config name: cogsrch-vector-config
Azure Search semantic config name: cogsrch-semantic-config
Data directory: ./data
Index schema: None


### 1.4.- Clase para creación de Clientes para Azure Search

In [4]:
# Instantiate a client
class CreateClient:
    """Factory for Azure Search clients that share one endpoint and API key.

    The constructor stores ``endpoint``, ``index_name`` and ``key`` as
    attributes and wraps the key in an ``AzureKeyCredential`` kept in
    ``credentials``.
    """

    def __init__(self, endpoint, key, index_name):
        self.endpoint, self.index_name, self.key = endpoint, index_name, key
        self.credentials = AzureKeyCredential(key)

    def create_search_client(self):
        """Return a ``SearchClient`` for this index; used to upload docs to it."""
        client_options = {
            "endpoint": self.endpoint,
            "index_name": self.index_name,
            "credential": self.credentials,
        }
        return SearchClient(**client_options)

    def create_admin_client(self):
        """Return a ``SearchIndexClient``; used to create, manage and delete an index."""
        return SearchIndexClient(
            endpoint=self.endpoint,
            credential=self.credentials,
        )

### 1.5.- Función para creación de índices

In [5]:
# Create Search Index
def create_schema_from_json_and_upload(index_name, admin_client):
    """Build the search index definition and create it on the service.

    Despite the name, the schema is defined inline below; no JSON file is
    read. The vector and semantic configuration names are taken from the
    module-level ``search_vector_config_name`` and
    ``search_semantic_config_name`` variables.

    Args:
        index_name: Name of the index to create.
        admin_client: Object exposing ``create_index`` (a
            ``SearchIndexClient`` in this notebook).

    Returns:
        The value returned by ``admin_client.create_index``.

    Raises:
        Exception: If ``create_index`` raises (the original error is chained
            as ``__cause__``) or returns a falsy result.
    """
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    scoring_profiles = []

    # NOTE(review): SearchableField is meant for string fields; the non-string
    # `type` values passed for the date and page fields below may not take
    # effect in the installed SDK. The uploader sends those values as strings,
    # so confirm the effective types before changing either side.
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        SearchableField(name="filename", type=SearchFieldDataType.String),
        SearchableField(name="author", type=SearchFieldDataType.String),
        SearchableField(name="created_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="last_modified_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="page_number", type=SearchFieldDataType.Int32),
        SearchableField(name="total_pages", type=SearchFieldDataType.Int32),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchField(name="embeddings", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_configuration=search_vector_config_name),
    ]

    vector_search = VectorSearch(
        algorithm_configurations=[
            HnswVectorSearchAlgorithmConfiguration(
                name=search_vector_config_name,
                kind="hnsw",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ]
    )

    semantic_config = SemanticConfiguration(
        name=search_semantic_config_name,
        prioritized_fields=PrioritizedFields(
            title_field=SemanticField(field_name="filename"),
            prioritized_content_fields=[SemanticField(field_name="content")]
        )
    )

    # Create the semantic settings with the configuration
    semantic_settings = SemanticSettings(configurations=[semantic_config])

    index = SearchIndex(
        name=index_name,
        fields=fields,
        scoring_profiles=scoring_profiles,
        semantic_settings=semantic_settings,
        vector_search=vector_search,
        cors_options=cors_options,
    )

    # Only the service call is guarded, and the cause is chained so the real
    # failure (auth, conflict, bad schema, ...) stays visible in the traceback.
    try:
        upload_schema = admin_client.create_index(index)
    except Exception as err:
        raise Exception("Unexpected Error. Schema upload failed.") from err

    if not upload_schema:
        raise Exception("Schema upload failed.")

    print("Schema uploaded successfully.")
    return upload_schema

### 1.6.- Función para creación de embeddings (vectores)

In [6]:
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=1, max=20))
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Calls ``openai.Embedding.create`` with the module-level
    ``embedding_model`` as the engine and returns the ``embedding`` entry of
    the first item in the response's ``data`` list. The call is retried with
    randomized exponential waits (min=1, max=20) and stops after 5 attempts.
    """
    first_item = openai.Embedding.create(engine=embedding_model, input=text)["data"][0]
    return first_item['embedding']


### 1.7.- Función para conversión de fecha

In [7]:
def convert_date_to_iso8601(date):
    """Format a date/datetime as ``YYYY-MM-DDTHH:MM:SSZ``.

    Timezone-aware datetimes are converted to UTC first, so the trailing
    ``Z`` is truthful. Naive values are formatted unchanged (they are treated
    as already being in UTC).

    Args:
        date: A ``datetime`` (or any object with a compatible ``strftime``).

    Returns:
        The formatted string, or ``None`` if ``date`` cannot be formatted
        (for example when it is ``None``); a message is printed in that case.
    """
    from datetime import datetime, timezone

    try:
        if isinstance(date, datetime) and date.utcoffset() is not None:
            # Without this, 12:00+02:00 would be emitted as 12:00Z.
            date = date.astimezone(timezone.utc)
        return date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except (AttributeError, TypeError, ValueError, OverflowError):
        print("Error converting date to ISO8601 format")
        return None

### 1.8.- Función para conversión de PDF

In [8]:
def convert_pdfs_and_upload_to_index(root_dir, client, batch_size=1000):
    """Extract every page of each PDF under ``root_dir`` and upload them.

    Each page becomes one document holding the page text, its embedding
    (from ``generate_embeddings``) and the PDF's metadata. Files whose
    creation or modification date cannot be converted by
    ``convert_date_to_iso8601`` are skipped entirely, and pages whose
    embedding cannot be generated are skipped individually.

    Args:
        root_dir: Directory searched recursively for ``*.pdf`` files.
        client: Object exposing ``upload_documents(documents=...)`` (a
            ``SearchClient`` in this notebook). Not used when there is
            nothing to upload.
        batch_size: Maximum number of documents sent per upload call. The
            default follows the documented Azure Cognitive Search limit of
            1000 documents per indexing request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Collect the PDF paths first so the progress total counts PDFs only,
    # not every file found under root_dir.
    pdf_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, files in os.walk(root_dir)
        for filename in fnmatch.filter(files, '*.pdf')
    ]
    total_files = len(pdf_paths)
    data_list = []

    for count, pdf_file in enumerate(pdf_paths, start=1):
        filename = os.path.basename(pdf_file)
        print(f"Processing {filename} {count}/{total_files}")

        with open(pdf_file, 'rb') as fileobj:
            pdf = PyPDF2.PdfReader(fileobj)
            info = pdf.metadata  # may be None when the PDF carries no metadata
            author = info.author if info is not None else None

            # Dates are per-file, so convert them once and before any
            # embedding call: a file without usable dates is skipped without
            # spending embedding requests on its pages.
            created_date = convert_date_to_iso8601(
                info.creation_date if info is not None else None
            )
            if not created_date:
                continue

            last_modified_date = convert_date_to_iso8601(
                info.modification_date if info is not None else None
            )
            if not last_modified_date:
                continue

            num_pages = len(pdf.pages)

            # loop through each page of the PDF
            for i, page in enumerate(pdf.pages):
                text = page.extract_text()

                try:
                    embeddings = generate_embeddings(text)
                except Exception:
                    print(f"Error generating embeddings for {pdf_file}")
                    continue

                data_list.append({
                    # unique id for each page: <file counter>_<page index>
                    "id": str(count) + "_" + str(i),
                    "filename": filename,
                    "author": author,
                    "page_number": str(i+1),
                    "total_pages": str(num_pages),
                    "content": text,
                    "embeddings": embeddings,
                    "created_date": created_date,
                    "last_modified_date": last_modified_date,
                })

    if not data_list:
        # Nothing to index: avoid sending an empty request to the service.
        print("No documents to upload.")
        return

    for start in range(0, len(data_list), batch_size):
        client.upload_documents(documents=data_list[start:start + batch_size])
    print("Done!")

## 2.- Crear índice

### 2.1.- Creación de instancias de clientes de Azure Cognitive Search

In [9]:
# Build the client factory once and derive both clients from it:
# admin_client manages the index itself, search_client uploads documents.
base_client = CreateClient(
    endpoint=search_endpoint,
    key=search_service_key,
    index_name=search_index_name,
)
admin_client = base_client.create_admin_client()
search_client = base_client.create_search_client()

### 2.2.- Creación de índice y subida de datos

In [11]:
# Create the index first, then extract, embed and upload the PDF pages.
schema = create_schema_from_json_and_upload(
    index_name=search_index_name,
    admin_client=admin_client,
)
convert_pdfs_and_upload_to_index(
    root_dir=data_directory,
    client=search_client,
)
print("Upload complete")

Schema uploaded successfully.
Processing DE03587-23.pdf 1/10
Processing DE03590-23.pdf 2/10
Processing DE03591-23.pdf 3/10
Processing DE03592-23.pdf 4/10
Processing DE03596-23.pdf 5/10
Processing DE03597-23.pdf 6/10
Processing DE03598-23.pdf 7/10
Processing DE03599-23.pdf 8/10
Processing DE03610-23.pdf 9/10
Processing DE03612-23.pdf 10/10
Done!
Upload complete


## 3.- Limpieza

In [10]:
# Clean up Azure resources: delete the index named by search_index_name
# through the admin client, then confirm on stdout.
admin_client.delete_index(search_index_name)
print("Index deleted")

Index deleted
