# Submit domain knowledge to Azure AI Search

In [1]:
import os

# The index name is pinned here instead of being read from
# os.getenv("AZURE_AI_SEARCH_INDEX"): that variable was "domain_knowledge",
# the index with the old chunks.
INDEX_NAME = "domain_knowledge_v2"

# Number of documents sent to the search index per upload request
UPLOAD_BATCH_SIZE = 1_000

# Location of the input data; the documents live in a sub-folder of the root
ROOT_FOLDER_NAME = "/workspace/data/business_objects/"
DOCUMENTS_FOLDER = os.path.join(ROOT_FOLDER_NAME, "documents")

In [2]:
import os
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from internal_shared.models.ai import EMBEDDING_3_LARGE

# Azure OpenAI client used to create the embeddings; every connection setting
# comes from the shared EMBEDDING_3_LARGE model configuration.
client = AzureOpenAI(
    api_key=EMBEDDING_3_LARGE.api_key,
    api_version=EMBEDDING_3_LARGE.api_version,
    azure_endpoint=EMBEDDING_3_LARGE.endpoint,
)

# Azure AI Search client bound to INDEX_NAME.
# The settings are read with os.environ[...] rather than os.getenv(...) so that
# a missing variable raises a KeyError naming it right here, instead of handing
# None to the SDK constructors.
search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"],
    index_name=INDEX_NAME,
    credential=AzureKeyCredential(os.environ["AZURE_AI_SEARCH_API_KEY"]),
)

Submit business logic

In [3]:
import os
import json
from typing import Any, Dict
import uuid
import time


def embed_texts(texts):
    """Create one embedding vector per input text.

    Args:
        texts: The batch of texts to embed in a single request.

    Returns:
        A list of embedding vectors where position k belongs to ``texts[k]``.
        An empty list is returned for an empty batch, without calling the
        embedding service.
    """
    if not texts:
        # Nothing to embed; skip the request entirely
        return []

    embedding_results = client.embeddings.create(
        input=texts, model=EMBEDDING_3_LARGE.model_name
    )
    # Every result carries the index of the input it was computed for; order by
    # it so the output lines up with `texts` regardless of response order.
    ordered_results = sorted(embedding_results.data, key=lambda result: result.index)
    return [result.embedding for result in ordered_results]


def create_search_document(content, embedding, metadata: Dict[str, Any]):
    """Assemble the record that is uploaded to the search index for one text.

    Args:
        content: The text of the document.
        embedding: The embedding vector created for ``content``.
        metadata: Descriptive fields of the document; missing keys fall back to
            the defaults listed below, keys not listed below are ignored.

    Returns:
        A dict with a freshly generated random ``id`` (uuid4, so every call
        yields a different id), the content, the embedding and the metadata
        fields.
    """
    record = {
        "id": str(uuid.uuid4()),
        "content": content,
        "embedding": embedding,
    }

    # Metadata fields copied into the record and the value used when a field
    # is absent. Built per call so the list default is never shared.
    fallbacks = {
        "name": "",
        "summary": "",
        "type": "",
        "namespace": "",
        "assembly": "",
        "type_references": [],
        "filename": "",
        "chunk_id": 0,
        "total_chunks": 1,
    }
    for field, fallback in fallbacks.items():
        record[field] = metadata.get(field, fallback)

    return record


# Collect the text and the metadata of every document in DOCUMENTS_FOLDER.
# contents[k] and metadata_list[k] always describe the same document.
documents = []
contents = []
metadata_list = []
# os.listdir returns entries in arbitrary order; sort for reproducible runs
for filename in sorted(os.listdir(DOCUMENTS_FOLDER)):
    if not filename.endswith(".txt"):
        continue
    interface_name = filename[:-4]

    # Read the text content. The encoding is explicit because the default of
    # open() depends on the platform locale.
    with open(os.path.join(DOCUMENTS_FOLDER, filename), "r", encoding="utf-8") as f:
        content = f.read()

    # Read the metadata stored next to the text as <name>.metadata.json
    metadata_path = os.path.join(DOCUMENTS_FOLDER, f"{interface_name}.metadata.json")
    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # Append only after both files were read so the two lists stay in step
    contents.append(content)
    metadata_list.append(metadata)

# Create embeddings in batches and build one search document per text
batch_size = 250  # we can access around 120k tokens per minute
embedding_batches = (len(contents) + batch_size - 1) // batch_size

for batch_index, i in enumerate(range(0, len(contents), batch_size)):
    batch_contents = contents[i : i + batch_size]
    batch_metadata = metadata_list[i : i + batch_size]
    batch_embeddings = embed_texts(batch_contents)

    # Guard against silently pairing a text with the wrong embedding/metadata
    if not (len(batch_contents) == len(batch_metadata) == len(batch_embeddings)):
        raise ValueError(
            f"Batch {batch_index + 1}: got {len(batch_contents)} texts, "
            f"{len(batch_metadata)} metadata entries and "
            f"{len(batch_embeddings)} embeddings"
        )

    for content, embedding, metadata in zip(
        batch_contents, batch_embeddings, batch_metadata
    ):
        documents.append(create_search_document(content, embedding, metadata))

    # Wait before the next batch to avoid rate limiting; no wait after the
    # last one. The batch number is the position of the batch, i.e.
    # i // batch_size + 1 (not i // embedding_batches + 1).
    if batch_index + 1 < embedding_batches:
        time.sleep(30)

print(f"Embedded {len(documents)} documents")

Embedded 587 documents


Upload the documents to Azure AI Search. The service accepts up to 1000 documents per upload request, so the documents are sent in batches of at most that size.

In [4]:
# Upload the documents in batches of UPLOAD_BATCH_SIZE and keep a tally over
# all batches so the final message reflects the whole run.
upload_batches = (len(documents) + UPLOAD_BATCH_SIZE - 1) // UPLOAD_BATCH_SIZE
uploaded_count = 0
failed_count = 0

for i in range(0, len(documents), UPLOAD_BATCH_SIZE):
    batch = documents[i : i + UPLOAD_BATCH_SIZE]
    results = search_client.upload_documents(documents=batch)

    failed_results = [result for result in results if not result.succeeded]
    uploaded_count += len(results) - len(failed_results)
    failed_count += len(failed_results)

    if not failed_results:
        print(f"Uploaded batch {i // UPLOAD_BATCH_SIZE + 1} successfully.")
    else:
        # report every document of this batch that was rejected
        for result in failed_results:
            print(
                f"Failed to upload document with ID {result.key}. Error: {result.error_message}"
            )

    # don't hit rate limits; no need to wait after the last batch
    if (i // UPLOAD_BATCH_SIZE + 1) < upload_batches:
        time.sleep(30)

# Summarise over all batches (not only the last one) and do not claim success
# when some documents were rejected.
if failed_count == 0:
    print(f"All {uploaded_count} documents uploaded.")
else:
    print(
        f"Uploaded {uploaded_count} of {len(documents)} documents; {failed_count} failed."
    )

Uploaded batch 1 successfully.
All 587 documents uploaded.
