In [1]:
import os
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from internal_shared.ai_models.available_models import EMBEDDING_3_LARGE

def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing
            setting is reported by name instead of surfacing later as an
            opaque error inside an SDK constructor.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set before running this notebook."
        )
    return value


# Azure OpenAI client, configured from the shared embedding model definition.
client = AzureOpenAI(
    api_key=EMBEDDING_3_LARGE.api_key,
    api_version=EMBEDDING_3_LARGE.api_version,
    azure_endpoint=EMBEDDING_3_LARGE.endpoint,
)

# Azure AI Search client; endpoint, index name and API key come from the environment.
search_client = SearchClient(
    endpoint=_require_env("AZURE_AI_SEARCH_ENDPOINT"),
    index_name=_require_env("AZURE_AI_SEARCH_INDEX"),
    credential=AzureKeyCredential(_require_env("AZURE_AI_SEARCH_API_KEY")),
)

In [2]:
import os
import json
from typing import Any, Dict
import uuid
import time

def embed_texts(texts):
    """Create one embedding vector per input text.

    Args:
        texts: Batch of texts, passed as ``input`` to the embeddings endpoint
            of the module-level ``client``.

    Returns:
        A list holding the ``embedding`` of every result, ordered to match
        ``texts``. An empty batch returns ``[]`` without calling the service.
    """
    if not texts:
        # Nothing to embed; avoid sending a request with an empty input.
        return []

    embedding_results = client.embeddings.create(
        input=texts, model=EMBEDDING_3_LARGE.model_name
    )
    # Each result reports the position of the input it belongs to via `index`;
    # ordering by it makes the text/embedding alignment explicit instead of
    # relying on the order of the response.
    ordered_results = sorted(embedding_results.data, key=lambda result: result.index)
    return [result.embedding for result in ordered_results]

def create_search_document(content, embedding, metadata: Dict[str, Any]):
    """Build the dictionary that represents one document in the search index.

    Args:
        content: Text of the document (or chunk).
        embedding: Embedding vector stored alongside the content.
        metadata: Mapping the optional fields are read from; a missing key
            falls back to the default listed below.

    Returns:
        A dict with a freshly generated UUID4 string under ``"id"``, the
        content and embedding, followed by the metadata-backed fields.
    """
    # (field name, fallback) pairs, in the order they appear in the document.
    # Built per call so the list fallback is never shared between documents.
    optional_fields = (
        ("name", ""),
        ("summary", ""),
        ("type", ""),
        ("namespace", ""),
        ("assembly", ""),
        ("type_references", []),
        ("filename", ""),
        ("chunk_id", 0),
        ("total_chunks", 1),
        ("expected_embedding_size", 0),
    )

    record = {"id": str(uuid.uuid4()), "content": content, "embedding": embedding}
    for field_name, fallback in optional_fields:
        record[field_name] = metadata.get(field_name, fallback)
    return record

# Collect the text and metadata of every document found on disk.
DOCUMENTS_DIR = "documents"
# NOTE(review): assumes the files were written as UTF-8 — confirm against the
# export step. Without an explicit encoding, open() uses the platform default.
DOCUMENTS_ENCODING = "utf-8"

documents = []
contents = []
metadata_list = []
# os.listdir returns entries in arbitrary order; sort so that repeated runs
# process (and later batch) the documents in the same sequence.
for filename in sorted(os.listdir(DOCUMENTS_DIR)):
    if not filename.endswith(".txt"):
        continue
    interface_name = filename[:-4]

    # Read the text content
    text_path = os.path.join(DOCUMENTS_DIR, f"{interface_name}.txt")
    with open(text_path, "r", encoding=DOCUMENTS_ENCODING) as f:
        content = f.read()

    # Read the metadata
    metadata_path = os.path.join(DOCUMENTS_DIR, f"{interface_name}.metadata.json")
    with open(metadata_path, "r", encoding=DOCUMENTS_ENCODING) as f:
        metadata = json.load(f)

    # Append only after both files were read, so the two lists stay aligned
    # index-for-index even if one of the reads fails.
    contents.append(content)
    metadata_list.append(metadata)

# Create embeddings in batches and turn each text into a search document.
batch_size = 1000  # we can access around 120k tokens per minute
BATCH_PAUSE_SECONDS = 30  # wait between requests to avoid rate limiting

if len(contents) != len(metadata_list):
    raise ValueError(
        f"Got {len(contents)} texts but {len(metadata_list)} metadata entries."
    )

for i in range(0, len(contents), batch_size):
    if i > 0:
        # Pause only between batches; there is nothing to wait for after the
        # last one (or when everything fits into a single batch).
        time.sleep(BATCH_PAUSE_SECONDS)

    batch_contents = contents[i:i + batch_size]
    batch_metadata = metadata_list[i:i + batch_size]
    batch_embeddings = embed_texts(batch_contents)
    if len(batch_embeddings) != len(batch_contents):
        # zip() would silently drop the unmatched texts, so check explicitly.
        raise ValueError(
            f"Expected {len(batch_contents)} embeddings, got {len(batch_embeddings)}."
        )

    for text, vector, meta in zip(batch_contents, batch_embeddings, batch_metadata):
        documents.append(create_search_document(text, vector, meta))

print(f"Embedded {len(documents)} documents")

Embedded 580 documents


In [3]:
# Upload documents to Azure AI Search; supports uploading in batches up to 1000 documents
UPLOAD_BATCH_SIZE = 1000

# Totals across all batches, so the final summary reflects the whole run and
# not just the last batch.
uploaded_count = 0
failed_count = 0

for i in range(0, len(documents), UPLOAD_BATCH_SIZE):
    batch = documents[i : i + UPLOAD_BATCH_SIZE]
    results = search_client.upload_documents(documents=batch)
    failed_results = [result for result in results if not result.succeeded]
    uploaded_count += len(results) - len(failed_results)
    failed_count += len(failed_results)

    if not failed_results:
        print(f"Uploaded batch {i // UPLOAD_BATCH_SIZE + 1} successfully.")
    else:
        # report every document of this batch that was rejected
        for result in failed_results:
            print(f"Failed to upload document with ID {result.key}. Error: {result.error_message}")

# Only claim full success when no document failed; this also works when
# `documents` is empty and the loop above never ran.
if failed_count == 0:
    print(f"All {uploaded_count} documents uploaded.")
else:
    print(
        f"Uploaded {uploaded_count} of {uploaded_count + failed_count} documents; "
        f"{failed_count} failed."
    )

Uploaded batch 1 successfully.
All 580 documents uploaded.


In [None]:
# Remove the previously built documents from the index again.
# The request is split using UPLOAD_BATCH_SIZE (defined in the upload cell), and
# the batch number is computed here instead of reading a loop index left over
# from another cell.
for delete_start in range(0, len(documents), UPLOAD_BATCH_SIZE):
    delete_batch = documents[delete_start : delete_start + UPLOAD_BATCH_SIZE]
    delete_results = search_client.delete_documents(delete_batch)
    if all(result.succeeded for result in delete_results):
        print(f"Deleted batch {delete_start // UPLOAD_BATCH_SIZE + 1} successfully.")
    else:
        # report every document of this batch that could not be deleted
        for result in delete_results:
            if not result.succeeded:
                print(f"Failed to delete document with ID {result.key}. Error: {result.error_message}")