# Submit domain knowledge to Azure AI

In [None]:
import os
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from internal_shared.models.ai import EMBEDDING_3_LARGE

# Azure OpenAI client used to create embeddings; its connection settings come
# from the shared EMBEDDING_3_LARGE model configuration.
client = AzureOpenAI(
    api_key=EMBEDDING_3_LARGE.api_key,
    api_version=EMBEDDING_3_LARGE.api_version,
    azure_endpoint=EMBEDDING_3_LARGE.endpoint,
)

# Azure AI Search client for the target index. os.environ[...] is used instead
# of os.getenv(...) so that a missing setting fails right here with a KeyError
# naming the variable, rather than handing None to the SDK.
search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"],
    index_name=os.environ["AZURE_AI_SEARCH_INDEX"],
    credential=AzureKeyCredential(os.environ["AZURE_AI_SEARCH_API_KEY"]),
)

Submit business logic

In [None]:
import os
import json
from typing import Any, Dict
import uuid
import time


def embed_texts(texts):
    """Return the embedding vectors for a batch of texts.

    The whole batch is sent in a single embeddings request using the model
    named by ``EMBEDDING_3_LARGE.model_name``. The returned list holds one
    vector per entry of the response's ``data``, in response order.
    """
    response = client.embeddings.create(
        model=EMBEDDING_3_LARGE.model_name,
        input=texts,
    )
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors


def create_search_document(content, embedding, metadata: Dict[str, Any]):
    """Build the search-index document for one piece of content.

    Args:
        content: Text stored under the ``content`` field.
        embedding: Embedding vector stored under the ``embedding`` field.
        metadata: Optional descriptive fields; any field that is missing
            falls back to the default listed below.

    Returns:
        A dict with a freshly generated UUID string as ``id``, followed by
        the content, the embedding and the metadata fields.
    """
    # (field, fallback) pairs in the order they appear in the document.
    # The tuple is rebuilt on every call, so the list fallback is never shared.
    metadata_defaults = (
        ("name", ""),
        ("summary", ""),
        ("type", ""),
        ("namespace", ""),
        ("assembly", ""),
        ("type_references", []),
        ("filename", ""),
        ("chunk_id", 0),
        ("total_chunks", 1),
        ("expected_embedding_size", 0),
    )

    document = {
        "id": str(uuid.uuid4()),
        "content": content,
        "embedding": embedding,
    }
    for field, fallback in metadata_defaults:
        document[field] = metadata.get(field, fallback)

    return document


# Gather the text and metadata of every document. contents and metadata_list
# are filled in lockstep, so entry k of one belongs to entry k of the other.
documents = []
contents = []
metadata_list = []
for filename in os.listdir("documents"):
    # Only the .txt files drive the loop; the matching .metadata.json file is
    # looked up by the shared base name.
    if not filename.endswith(".txt"):
        continue
    interface_name = filename[:-4]

    # Text content
    with open(f"documents/{interface_name}.txt", "r") as f:
        content = f.read()
    contents.append(content)

    # Metadata
    with open(f"documents/{interface_name}.metadata.json", "r") as f:
        metadata = json.load(f)
    metadata_list.append(metadata)

# Create embeddings in batches
batch_size = 1000  # we can access around 120k tokens per minute
# Ceiling division: total number of embedding requests that will be made.
embedding_batches = (len(contents) + batch_size - 1) // batch_size

for i in range(0, len(contents), batch_size):
    batch_contents = contents[i : i + batch_size]
    batch_embeddings = embed_texts(batch_contents)

    # contents and metadata_list are index-aligned, so position j of this
    # batch belongs to metadata_list[i + j].
    for j in range(len(batch_contents)):
        document = create_search_document(
            batch_contents[j], batch_embeddings[j], metadata_list[i + j]
        )
        documents.append(document)

    # Before the next iteration, wait to avoid rate limiting. i advances in
    # steps of batch_size, so the 1-based batch number is i // batch_size + 1;
    # the wait is skipped only after the final batch.
    if (i // batch_size + 1) < embedding_batches:
        time.sleep(30)

print(f"Embedded {len(documents)} documents")

In [None]:
# Upload documents to Azure AI Search; supports uploading in batches up to 1000 documents
UPLOAD_BATCH_SIZE = 1000
upload_batches = (len(documents) + UPLOAD_BATCH_SIZE - 1) // UPLOAD_BATCH_SIZE

# Running totals across all batches, so the final summary reflects the whole
# upload rather than only the last batch.
uploaded_count = 0
failed_count = 0

for i in range(0, len(documents), UPLOAD_BATCH_SIZE):
    batch = documents[i : i + UPLOAD_BATCH_SIZE]
    results = search_client.upload_documents(documents=batch)

    batch_failures = [result for result in results if not result.succeeded]
    uploaded_count += len(results) - len(batch_failures)
    failed_count += len(batch_failures)

    if not batch_failures:
        print(f"Uploaded batch {i // UPLOAD_BATCH_SIZE + 1} successfully.")
    else:
        # Report every document of this batch that was rejected.
        for result in batch_failures:
            print(
                f"Failed to upload document with ID {result.key}. Error: {result.error_message}"
            )

    # don't hit rate limits; no wait is needed after the final batch
    if (i // UPLOAD_BATCH_SIZE + 1) < upload_batches:
        time.sleep(30)

if failed_count:
    print(
        f"Uploaded {uploaded_count} of {len(documents)} documents; {failed_count} failed."
    )
else:
    print(f"All {uploaded_count} documents uploaded.")

In [None]:
# Remove the documents in `documents` from the index again. Deletion is sent in
# batches of UPLOAD_BATCH_SIZE, the same per-request size the upload uses, and
# the batch number is derived from this loop's own index rather than from a
# variable left over from another cell.
for delete_start in range(0, len(documents), UPLOAD_BATCH_SIZE):
    delete_batch = documents[delete_start : delete_start + UPLOAD_BATCH_SIZE]
    delete_results = search_client.delete_documents(documents=delete_batch)
    if all(result.succeeded for result in delete_results):
        print(f"Deleted batch {delete_start // UPLOAD_BATCH_SIZE + 1} successfully.")
    else:
        # Report every document of this batch that could not be deleted.
        for result in delete_results:
            if not result.succeeded:
                print(f"Failed to delete document with ID {result.key}. Error: {result.error_message}")

Submit formula functions

In [1]:
import os
import json
import uuid
from typing import List, Dict, Any
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from internal_shared.models.documents import DevExpressFunction
from internal_shared.models.ai import EMBEDDING_3_LARGE

# Azure OpenAI client used to embed the function descriptions; its connection
# settings come from the shared EMBEDDING_3_LARGE model configuration.
_embed_client = AzureOpenAI(
    api_key=EMBEDDING_3_LARGE.api_key,
    api_version=EMBEDDING_3_LARGE.api_version,
    azure_endpoint=EMBEDDING_3_LARGE.endpoint,
)

# Azure AI Search client bound to the function index. os.environ[...] is used
# instead of os.getenv(...) so that a missing setting fails right here with a
# KeyError naming the variable, rather than handing None to the SDK.
_search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"],
    index_name="mergedfunctionindex_v2",
    credential=AzureKeyCredential(os.environ["AZURE_AI_SEARCH_API_KEY"]),
)

def load_json(file_path: str) -> List[DevExpressFunction]:
    """Read ``file_path`` as JSON and build a ``DevExpressFunction`` per item.

    The parsed JSON value is iterated, and each item is expanded into keyword
    arguments for ``DevExpressFunction``.
    """
    with open(file_path, "r") as handle:
        raw_items = json.load(handle)

    functions = []
    for raw_item in raw_items:
        functions.append(DevExpressFunction(**raw_item))
    return functions

def convert_to_search_document(embedding: List[float], data: DevExpressFunction) -> Dict[str, Any]:
    """Build the search-index document for one function.

    Args:
        embedding: Embedding vector stored under ``embedding``.
        data: Function whose description becomes ``content`` and whose name,
            example, category, source and keywords are copied over.

    Returns:
        A dict with a freshly generated UUID string as ``id``; a falsy
        ``keywords`` value is stored as an empty list.
    """
    search_document: Dict[str, Any] = {"id": str(uuid.uuid4())}
    search_document["content"] = data.description
    search_document["embedding"] = embedding
    # These attributes are copied under the same field name.
    for field in ("name", "example", "category", "source"):
        search_document[field] = getattr(data, field)
    search_document["keywords"] = data.keywords or []
    return search_document

def embed_all_text(text: List[str]) -> List[List[float]]:
    """Return the embedding vectors for a batch of texts.

    The whole batch is sent in a single embeddings request using the model
    named by ``EMBEDDING_3_LARGE.model_name``. The returned list holds one
    vector per entry of the response's ``data``, in response order.
    """
    response = _embed_client.embeddings.create(
        model=EMBEDDING_3_LARGE.model_name,
        input=text,
    )
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors

Read the custom and native functions and collect their descriptions for embedding

In [2]:
# Custom functions come first, followed by the native ones, in one list.
documents = load_json("/workspace/data/functions/data.custom.json")
native_documents = load_json("/workspace/data/functions/data.json")
documents += native_documents

# One description per function, index-aligned with `documents`.
descriptions = [function.description for function in documents]

len(descriptions)

208

Embed the function descriptions and convert the functions to search documents in batches

In [3]:
import time

search_documents = []
embedding_batch_size = 1000
# Ceiling division: total number of embedding requests that will be made.
total_embedding_batches = -(-len(descriptions) // embedding_batch_size)

for i in range(0, len(descriptions), embedding_batch_size):
    batch_descriptions = descriptions[i : i + embedding_batch_size]
    batch_embeddings = embed_all_text(batch_descriptions)

    # Position `offset` of this batch pairs with documents[i + offset].
    for offset in range(len(batch_descriptions)):
        search_documents.append(
            convert_to_search_document(batch_embeddings[offset], documents[i + offset])
        )

    # Wait before the next request to avoid rate limiting; skipped after the
    # final batch.
    batch_number = i // embedding_batch_size + 1
    if batch_number < total_embedding_batches:
        time.sleep(30)

len(search_documents)

208

Upload the created search documents in batches

In [4]:
import time

upload_batch_size = 1000
# Ceiling division: total number of upload requests that will be made.
total_upload_batches = (
    len(search_documents) + upload_batch_size - 1
) // upload_batch_size

# Running totals across all batches, so the final summary reflects the whole
# upload rather than only the last batch.
uploaded_count = 0
failed_count = 0

for i in range(0, len(search_documents), upload_batch_size):
    batch = search_documents[i : i + upload_batch_size]
    results = _search_client.upload_documents(documents=batch)

    batch_failures = [result for result in results if not result.succeeded]
    uploaded_count += len(results) - len(batch_failures)
    failed_count += len(batch_failures)

    if not batch_failures:
        print(f"Uploaded batch {i // upload_batch_size + 1} successfully.")
    else:
        # Report every document of this batch that was rejected.
        for result in batch_failures:
            print(
                f"Failed to upload document with ID {result.key}. Error: {result.error_message}"
            )

    # ensure we don't hit the rate limit; no wait is needed after the final batch
    if (i // upload_batch_size + 1) < total_upload_batches:
        time.sleep(30)

if failed_count:
    print(
        f"Uploaded {uploaded_count} of {len(search_documents)} documents; {failed_count} failed."
    )
else:
    print(f"All {uploaded_count} documents uploaded.")

Uploaded batch 1 successfully.
All 208 documents uploaded.
