In [None]:
pip install openai azure-core azure-search-documents azure-storage-blob

In [None]:
import io
import uuid
import json
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.storage.blob import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SearchField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    SemanticSearch,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField
)
 

In [None]:
# ─── AI SEARCH CONFIG ─────────────────────────────────────────────────────────
# Each setting is defined exactly once so the key used to build `credential`
# can never drift from the key used by the document client.
# NOTE: keep real endpoints/keys out of version control; fill them in locally
# or load them from the environment / a secrets manager before running.
SEARCH_ENDPOINT    = ""
SEARCH_API_KEY     = ""
SEARCH_INDEX_NAME  = "jsonidx"
index_name         = SEARCH_INDEX_NAME   # alias read by the index-definition cell
credential         = AzureKeyCredential(SEARCH_API_KEY)
vector_dims        = 1536                # expected embedding length (the index cell declares the same size)
algo_name          = "hnsw-cosine"
profile_name       = "openai-ada-profile"

# ─── AZURE OPENAI CONFIG ──────────────────────────────────────────────────────
AZURE_OPENAI_ENDPOINT  = ""
AZURE_OPENAI_KEY       = ""
AZURE_OPENAI_API_VER   = "2024-10-21"
EMBED_DEPLOYMENT       = "emb"

# ─── BLOB STORAGE ─────────────────────────────────────────────────────────────
ADLS_CONNECTION_STRING = ""  # blob endpoint
ADLS_CONTAINER_NAME    = "bronze"
ADLS_DIR_PREFIX        = "pdf/"

# ─── INGESTION ────────────────────────────────────────────────────────────────
# main() skips any page whose extracted text is shorter than this many
# characters. 1 keeps every non-empty page; raise it to drop near-empty pages.
TEXT_MIN_CHARS         = 1

In [None]:
# ─── AI SEARCH INDEX DEFINITION ───────────────────────────────────────────────
# Creates the index named `index_name`, or updates it if it already exists.
# Each document holds: a key, the source file name, a page number, the page
# text, and the embedding of that text.
client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=credential)

fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # NOTE(review): file_name is a SimpleField yet is used below as the
    # semantic title field — confirm the service accepts a non-searchable
    # field there.
    SimpleField(name="file_name", type=SearchFieldDataType.String),
    SimpleField(
        name="page_number",
        type=SearchFieldDataType.Int32,
        sortable=True,
        filterable=True,
        facetable=False,
    ),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Taken from the config cell instead of a second hard-coded 1536, so
        # the index and the configured embedding size cannot silently diverge.
        vector_search_dimensions=vector_dims,
        vector_search_profile_name="my-vector-config",
    ),
]

# Vector search: a single profile backed by a single HNSW algorithm config.
# NOTE(review): the config cell also defines `profile_name` / `algo_name`, but
# the literal names below are what the index is actually created with.
vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="my-algorithms-config",
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
)

# Semantic configuration: file name as the title, page text as the content.
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="file_name"),
        content_fields=[SemanticField(field_name="content")],
    ),
)

# Register the semantic configuration and make it the default one.
semantic_search = SemanticSearch(
    default_configuration_name="semantic-config",
    configurations=[semantic_config],
)

# Assemble the index definition and send it to the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

client.create_or_update_index(index)

In [None]:
# ─── 2. CLIENTS ───────────────────────────────────────────────────────────────
# Azure OpenAI client; create_embedding() sends its requests through it.
openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VER,
)

# Document-level client bound to the index that receives the page documents.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(SEARCH_API_KEY),
)

# Storage account client and the container the JSON blobs are read from.
blob_service = BlobServiceClient.from_connection_string(ADLS_CONNECTION_STRING)
container = blob_service.get_container_client(ADLS_CONTAINER_NAME)
 
def create_embedding(text: str, model: str = EMBED_DEPLOYMENT) -> list[float]:
    """Return the embedding vector for `text`.

    Sends `text` as a one-item batch to the embeddings endpoint of the
    module-level `openai_client`, passing `model` as the model/deployment
    name, and returns the embedding of the first (and only) result.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_result = response.data[0]
    return first_result.embedding
 
def page_records_from_bytes(json_bytes: bytes):
    """
    Yield (page_no, text) tuples, one per page that contains any text.

    `json_bytes` is a UTF-8 encoded JSON document (a leading BOM is tolerated)
    shaped like:
        {"pages": [{"page_number": 1,
                    "lines": [...],
                    "tables": [{"rows": [{"col": "val", ...}, ...]}]}]}

    For each page, `text` contains, joined by newlines:
      • the non-blank entries of `lines[]`, each stripped
      • every dict table row rendered as "col1: val1 | col2: val2"

    Nothing is yielded when the top level is not an object or `pages` is not a
    list. Missing or null `lines` / `tables` / `rows` are treated as empty,
    page or table entries that are not objects are skipped, and table rows
    that are not objects are ignored. A missing or null `page_number`
    becomes 0.

    Raises:
        UnicodeDecodeError: if `json_bytes` is not valid UTF-8.
        json.JSONDecodeError: if the decoded text is not valid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading BOM,
    # which json.loads would otherwise reject.
    data = json.loads(json_bytes.decode("utf-8-sig"))
    if not isinstance(data, dict):
        return                                              # nothing to emit

    pages = data.get("pages")
    if not isinstance(pages, list):
        return                                              # nothing to emit

    for page in pages:
        if not isinstance(page, dict):
            continue

        page_no = page.get("page_number", 0)
        if page_no is None:
            page_no = 0

        text_blocks = []

        # ─── 1. Extract lines ────────────────────────────────────────────────
        # `or []` also covers an explicit JSON null, not just a missing key.
        for line in page.get("lines") or []:
            line_txt = str(line).strip()
            if line_txt:
                text_blocks.append(line_txt)

        # ─── 2. Extract tables ──────────────────────────────────────────────
        for tbl in page.get("tables") or []:
            if not isinstance(tbl, dict):
                continue
            for row in tbl.get("rows") or []:
                if not isinstance(row, dict):
                    continue
                # "key1: val1 | key2: val2" …
                row_txt = " | ".join(f"{k}: {v}" for k, v in row.items()).strip()
                if row_txt:
                    text_blocks.append(row_txt)

        # ─── 3. Final block for this page ───────────────────────────────────
        if text_blocks:
            yield page_no, "\n".join(text_blocks)
 
# ─── MAIN ─────────────────────────────────────────────────────────────────────

def main():
    """Embed and index every JSON blob found under ADLS_DIR_PREFIX.

    For each `.json` blob in `container`, the file is downloaded and split into
    pages with page_records_from_bytes(). Every page whose text has at least
    TEXT_MIN_CHARS characters is embedded with create_embedding() and uploaded
    to `search_client` as one document. TEXT_MIN_CHARS is read from module
    scope and must be defined before calling.

    Document keys are derived deterministically from the blob name, the page
    number and the page's position in the file, so running the cell again
    overwrites the same documents instead of adding duplicates.

    A blob that is not valid UTF-8 JSON is reported and skipped; the remaining
    blobs are still processed. Errors from the embedding or upload calls are
    not caught. Progress and a final summary are printed; nothing is returned.
    """
    uploaded = skipped = 0

    print("➜ Scanning ADLS container for JSON files …")

    for blob in container.list_blobs(name_starts_with=ADLS_DIR_PREFIX):
        if not blob.name.lower().endswith(".json"):
            continue

        print(f"\n➜ Processing {blob.name}")
        json_bytes = container.download_blob(blob.name).readall()

        try:
            # Parse the whole file up front so that only parsing problems are
            # caught here, not errors raised while embedding or uploading.
            pages = list(page_records_from_bytes(json_bytes))
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"  ✗ Skipping unreadable JSON: {exc}")
            continue

        for seq, (page_no, text) in enumerate(pages):
            if len(text) < TEXT_MIN_CHARS:
                skipped += 1
                continue

            # uuid5 is a pure function of its input, so the same page always
            # maps to the same key. `seq` keeps keys distinct even when
            # several pages report the same page number.
            doc_id = uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"{blob.name}#page={page_no}#seq={seq}",
            )

            doc = {
                "id":             str(doc_id),
                "file_name":      blob.name,
                "page_number":    page_no,
                "content":        text,
                "content_vector": create_embedding(text),
            }

            result = search_client.upload_documents([doc])[0]
            status = "✓" if result.succeeded else "✗"
            print(f"  Page {page_no:>3}: {status}")
            uploaded += int(result.succeeded)

    print(f"\nDone. {uploaded} pages indexed, {skipped} pages skipped.")
 
# Start the ingestion only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()