In this notebook, we create the search index that will store the embeddings for all of our documents. This notebook only sets up the index; no data is loaded into it yet.

Make sure you have the `azure-search-documents` package installed!

In [None]:
# --upgrade installs the package when it is missing and upgrades it otherwise,
# so a separate plain install of the same package is not needed.
%pip install --upgrade azure-search-documents

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
from pathlib import Path

# Get root directory path
# Path().absolute() is the kernel's current working directory; the .env file
# is looked up one level above it.
root_dir = Path().absolute().parent
env_path = root_dir / '.env'

# Load .env from root and report whether anything was actually read, so a
# missing/empty file is visible before the required variables are accessed.
dotenv_loaded = load_dotenv(dotenv_path=env_path)
if dotenv_loaded:
    print(f"Loaded .env from {env_path}")
else:
    print(f"WARNING: no .env values found at {env_path}; using variables already set in the environment")


def _mask_secret(secret):
    """Return a display-safe version of ``secret`` for notebook output.

    Args:
        secret: The secret string, or None when the variable is not set.

    Returns:
        "<not set>" for None/empty values, only asterisks for values of eight
        characters or fewer (so the mask never reveals the whole secret), and
        otherwise the first and last four characters around five asterisks.
    """
    if not secret:
        return "<not set>"
    if len(secret) <= 8:
        return '*' * len(secret)
    return secret[:4] + '*' * 5 + secret[-4:]


# Access variables
# Azure Storage settings (read with os.getenv, so they may be None when unset)
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
storage_account_key = os.getenv("STORAGE_ACCOUNT_KEY")  # Add your storage account key here
container_name = "source"

# Azure AI Search settings (os.environ[...] raises KeyError when a variable is missing)
ai_search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
ai_search_key = os.environ["AZURE_SEARCH_KEY"]
ai_search_admin_key = os.environ["AZURE_SEARCH_ADMIN_KEY"]
ai_search_index = "rdc-contracts-v1"

print(f"storage_account_name: {  storage_account_name }")
print(f"storage acct Key: {_mask_secret(storage_account_key)}")
print(f"container_name: {container_name}")
print(f"ai_search_endpoint: {ai_search_endpoint}")
print(f"ai_search_key: {_mask_secret(ai_search_key)}")
print(f"ai_search_index: {ai_search_index}")


Loaded .env from c:\Users\rickcau\source\repos\vendor-contracts-gen-ai\.env
storage_account_name: stgclarivatecw
storage acct Key: ayze*****NQ==
container_name: source
ai_search_endpoint: https://rdc-ai-search.search.windows.net
ai_search_key: n8p2*****IUzQ
ai_search_index: rdc-contacts-v1


# Important Note
You will likely want to create the index with scoring profiles, which is done in the V2 index creation code that follows this section. If you run this code and want to use scoring profiles, you will need to use the V2 version.

Logic to create the index...  This is V1 with no scoring profile.  In order to perform Hybrid searches we really need a scoring profile, so the V2 version after this is the code that should be used to create the index.

In [None]:
import logging
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient 
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)
from azure.core.exceptions import ServiceRequestError, ResourceExistsError, ResourceNotFoundError

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_search_index(
    endpoint: str,
    admin_key: str,
    index_name: str
) -> "SearchIndex | None":
    """
    Creates an Azure Cognitive Search index (V1, no scoring profile) with text fields,
    a vector field and a semantic configuration.

    If an index named ``index_name`` already exists it is left untouched and
    nothing is created or updated.

    Args:
        endpoint (str): Azure Cognitive Search endpoint URL
        admin_key (str): Admin API key for Azure Cognitive Search
        index_name (str): Name of the index to create

    Returns:
        The index object returned by ``create_or_update_index`` when a new
        index is created, or None when the index already exists.

    Raises:
        ValueError: If endpoint, admin_key or index_name is empty.
        Exception: Any error raised while creating the client, checking for the
            index or creating it is logged and re-raised unchanged.
    """
    # Input validation
    if not all([endpoint, admin_key, index_name]):
        raise ValueError("Endpoint, admin key, and index name are required")

    # Initialize the search index client
    try:
        logger.info(f"Initializing SearchIndexClient with endpoint: {endpoint}")
        search_index_client = SearchIndexClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(admin_key)
        )
    except Exception as e:
        logger.error(f"Failed to initialize SearchIndexClient: {str(e)}")
        raise

    # Check if index exists; an existing index is never modified by this function
    try:
        search_index_client.get_index(index_name)
        logger.info(f"Index '{index_name}' already exists")
        return None
    except ResourceNotFoundError:
        logger.info(f"Creating new index '{index_name}'...")
    except Exception as e:
        logger.error(f"Error checking index existence: {str(e)}")
        raise

    try:
        # Define the fields for the index.
        # Text fields that must be full-text searchable use SearchableField:
        # SimpleField always produces a non-searchable field, so passing
        # searchable=True to it has no effect. The semantic configuration below
        # refers to vendorName, contractTitle, clientName and content.
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
            SimpleField(name="contractId", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="vendorName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="clientName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="contractTitle", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="effectiveDate", type=SearchFieldDataType.DateTimeOffset, filterable=True, facetable=True),
            SimpleField(name="endDate", type=SearchFieldDataType.DateTimeOffset, filterable=True, facetable=True),
            SimpleField(name="signingDate", type=SearchFieldDataType.DateTimeOffset, filterable=True),
            SearchableField(name="status", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="compensation", type=SearchFieldDataType.Double, filterable=True),
            SimpleField(name="parentContractId", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="amendmentNumber", type=SearchFieldDataType.String),
            SimpleField(name="creationdate", type=SearchFieldDataType.DateTimeOffset, filterable=True),
            SearchableField(name="sourceFileName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="content", type=SearchFieldDataType.String),
            # Vector field: the SDK keyword for the embedding size is
            # vector_search_dimensions (there is no "dimensions" keyword), and
            # the field is marked searchable so it can be used in vector queries.
            SearchField(
                name="searchVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,
                vector_search_profile_name="myHnswProfile"
            )
        ]

        # Define vector search settings; the profile name must match the one
        # referenced by the vector field above.
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters={
                        "m": 4,
                        "efConstruction": 400,
                        "efSearch": 500,
                        "metric": "cosine"
                    }
                )
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw"
                )
            ]
        )

        # Define semantic settings
        semantic_config = SemanticConfiguration(
            name="default",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="vendorName"),
                keywords_fields=[SemanticField(field_name="contractTitle"),
                                SemanticField(field_name="clientName")],
                content_fields=[SemanticField(field_name="content")]
            )
        )

        # Create semantic search configuration
        semantic_search = SemanticSearch(
            configurations=[semantic_config]
        )

        # Create the index with the defined fields and vector search settings
        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search
        )

        logger.info("Attempting to create or update index...")
        result = search_index_client.create_or_update_index(index)
        logger.info(f"Successfully created/updated index '{index_name}'")
        return result

    except ServiceRequestError as e:
        logger.error(f"Service request error while creating index: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error while creating index: {str(e)}")
        raise

if __name__ == "__main__":
    # Run the V1 index creation with the endpoint, admin key and index name
    # defined in the configuration cell. Any failure is logged here and not
    # re-raised, so the cell itself completes.
    try:
        create_search_index(endpoint=ai_search_endpoint, admin_key=ai_search_admin_key, index_name=ai_search_index)
    except Exception as exc:
        logger.error(f"Failed to create index: {exc}")

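Version 2 of the index creation code. This version is intended to add the scoring profile for plain-text searches. Note: the scoring-profile definitions in the code below are currently disabled, so the index is created without a scoring profile.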

In [6]:
import logging

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ServiceRequestError, ResourceExistsError, ResourceNotFoundError
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SearchIndex,
    ScoringProfile,
    TextWeights,
    FreshnessScoringFunction,
    MagnitudeScoringFunction,
    ScoringFunctionAggregation,
    FreshnessScoringParameters,
    MagnitudeScoringParameters,
    ScoringFunctionInterpolation
)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_search_index(
    endpoint: str,
    admin_key: str,
    index_name: str
) -> "SearchIndex | None":
    """
    Creates an Azure Cognitive Search index (V2) with searchable text fields,
    a vector field and a semantic configuration.

    Scoring profiles are not created by this function at present: the drafted
    definitions are disabled (see the note in the body) and no scoring profile
    is passed to the index.

    If an index named ``index_name`` already exists it is left untouched and
    nothing is created or updated.

    Args:
        endpoint (str): Azure Cognitive Search endpoint URL
        admin_key (str): Admin API key for Azure Cognitive Search
        index_name (str): Name of the index to create

    Returns:
        The index object returned by ``create_or_update_index`` when a new
        index is created, or None when the index already exists.

    Raises:
        ValueError: If endpoint, admin_key or index_name is empty.
        Exception: Any error raised while creating the client, checking for the
            index or creating it is logged and re-raised unchanged.
    """
    # Input validation
    if not all([endpoint, admin_key, index_name]):
        raise ValueError("Endpoint, admin key, and index name are required")

    # Initialize the search index client
    try:
        logger.info(f"Initializing SearchIndexClient with endpoint: {endpoint}")
        search_index_client = SearchIndexClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(admin_key)
        )
    except Exception as e:
        logger.error(f"Failed to initialize SearchIndexClient: {str(e)}")
        raise

    # Check if index exists; an existing index is never modified by this function
    try:
        search_index_client.get_index(index_name)
        logger.info(f"Index '{index_name}' already exists")
        return None
    except ResourceNotFoundError:
        logger.info(f"Creating new index '{index_name}'...")
    except Exception as e:
        logger.error(f"Error checking index existence: {str(e)}")
        raise

    try:
        # Define the fields for the index
        fields = [
            SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
            SimpleField(name="contractId", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="vendorName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="clientName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="contractTitle", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="effectiveDate", type=SearchFieldDataType.DateTimeOffset, filterable=True, facetable=True),
            SimpleField(name="endDate", type=SearchFieldDataType.DateTimeOffset, filterable=True, facetable=True),
            SimpleField(name="signingDate", type=SearchFieldDataType.DateTimeOffset, filterable=True),
            SearchableField(name="status", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="compensation", type=SearchFieldDataType.Double, filterable=True),
            SimpleField(name="parentContractId", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="amendmentNumber", type=SearchFieldDataType.String),
            SimpleField(name="creationdate", type=SearchFieldDataType.DateTimeOffset, filterable=True),
            SearchableField(name="sourceFileName", type=SearchFieldDataType.String, filterable=True),
            SearchableField(name="content", type=SearchFieldDataType.String),
            # Vector field, marked searchable so it can be used in vector queries.
            # NOTE(review): the V1 cell names its vector field "searchVector";
            # confirm which name the data-loading code expects.
            SearchField(
                name="vendorNameVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,
                vector_search_profile_name="myHnswProfile"
            )
        ]

        # Define vector search settings; the profile name must match the one
        # referenced by the vector field above.
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters={
                        "m": 4,
                        "efConstruction": 400,
                        "efSearch": 500,
                        "metric": "cosine"
                    }
                )
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw"
                )
            ]
        )

        # Scoring profiles: currently disabled. Three profiles were drafted
        # for this index but are not created:
        #   - "hybridScoring":  text weights contractTitle=5.0, vendorName=3.0,
        #                       clientName=2.0, content=1.0
        #   - "recencyScoring": text weights contractTitle=3.0, content=1.0, plus a
        #                       freshness function on "creationdate"
        #                       (boost 2.0, boosting duration P365D, linear)
        #   - "valueScoring":   text weights contractTitle=3.0, content=1.0, plus a
        #                       magnitude function on "compensation"
        #                       (boost 2.0, range 1000-100000, linear)
        # All three used ScoringFunctionAggregation.SUM.
        # TODO(review): confirm whether these should be enabled; if so, build the
        # ScoringProfile objects and pass them to SearchIndex(scoring_profiles=[...]).

        # Define semantic settings
        semantic_config = SemanticConfiguration(
            name="default",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="vendorName"),
                keywords_fields=[SemanticField(field_name="contractTitle"),
                                SemanticField(field_name="clientName")],
                content_fields=[SemanticField(field_name="content")]
            )
        )

        # Create semantic search configuration
        semantic_search = SemanticSearch(
            configurations=[semantic_config]
        )

        # Create the index with the defined fields and vector search settings
        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search
        )

        logger.info("Attempting to create or update index...")
        result = search_index_client.create_or_update_index(index)
        logger.info(f"Successfully created/updated index '{index_name}'")
        return result

    except ServiceRequestError as e:
        logger.error(f"Service request error while creating index: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error while creating index: {str(e)}")
        raise

if __name__ == "__main__":
    # Run the V2 index creation with the endpoint, admin key and index name
    # defined in the configuration cell. Any failure is logged here and not
    # re-raised, so the cell itself completes.
    try:
        create_search_index(endpoint=ai_search_endpoint, admin_key=ai_search_admin_key, index_name=ai_search_index)
    except Exception as exc:
        logger.error(f"Failed to create index: {exc}")

INFO:__main__:Initializing SearchIndexClient with endpoint: https://rdc-ai-search.search.windows.net
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://rdc-ai-search.search.windows.net/indexes('rdc-contacts-v1')?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': '020955dc-b83e-11ef-9157-1091d1f8d990'
    'User-Agent': 'azsdk-python-search-documents/11.5.2 Python/3.12.8 (Windows-11-10.0.26100-SP0)'
No body was attached to the request


INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 404
Response headers:
    'Cache-Control': 'no-cache,no-store'
    'Pragma': 'no-cache'
    'Content-Length': '116'
    'Content-Type': 'application/json; charset=utf-8'
    'Content-Language': 'REDACTED'
    'Expires': '-1'
    'Server': 'Microsoft-IIS/10.0'
    'request-id': '020955dc-b83e-11ef-9157-1091d1f8d990'
    'elapsed-time': 'REDACTED'
    'Strict-Transport-Security': 'REDACTED'
    'Date': 'Thu, 12 Dec 2024 04:03:14 GMT'
INFO:__main__:Creating new index 'rdc-contacts-v1'...
INFO:__main__:Attempting to create or update index...
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://rdc-ai-search.search.windows.net/indexes('rdc-contacts-v1')?api-version=REDACTED'
Request method: 'PUT'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '3102'
    'api-key': 'REDACTED'
    'Prefer': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-cli

In [5]:
# Import the models module under a name instead of star-importing it, so the
# notebook namespace is not filled with every SDK model class.
import azure.search.documents.indexes.models as search_models

# Store the list of attributes and classes of the models module in a variable.
# dir() without an argument would list every name in the notebook's namespace,
# not just the contents of azure.search.documents.indexes.models.
models_list = dir(search_models)

# Check if 'SemanticRanker' is in the list
if 'SemanticRanker' in models_list:
    print("SemanticRanker is available.")
else:
    print("SemanticRanker is not found.")

# Optionally, print the entire list for reference
print("All models in azure.search.documents.indexes.models:")
print(models_list)

SemanticRanker is not found.
All models in azure.search.documents.indexes.models:
