In [1]:
pip install -q openai azure-core azure-search-documents azure-storage-blob tika

In [None]:
# Execute the shared logging notebook/script inside this kernel.
# NOTE(review): presumably this is what defines log_activity(), which main()
# calls at the end of a run — confirm against Documentum/logging_utility.
%run Documentum/logging_utility

In [2]:
import io
import json
import textwrap
import uuid
from datetime import datetime, timedelta
from urllib.parse import quote

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)
from azure.storage.blob import (
    BlobSasPermissions,
    BlobServiceClient,
    generate_blob_sas,
)
from openai import AzureOpenAI
from tika import parser

In [None]:
# Connection settings for the search service that hosts the index.
# Left blank here; fill in before running the index-definition cell.
service_endpoint = ""
index_name = ""
key = ""
credential = AzureKeyCredential(key)

# Vector-search settings.
vector_dims = 1536
# NOTE(review): the index-definition cell spells out its own profile/algorithm
# names as literals rather than reading these two values — confirm which
# names are the intended ones.
algo_name = "hnsw-cosine"
profile_name = "openai-ada-profile"

In [4]:
# Define the search index (fields, vector search, semantic ranking) and
# create it, or update it in place if it already exists.
client = SearchIndexClient(service_endpoint, credential)

# Names that tie the vector field to its profile and the profile to its
# algorithm. Each must match exactly in both places, so keep one source each.
# NOTE(review): the config cell also defines `profile_name` / `algo_name` with
# different values; they are deliberately NOT used here because switching
# would rename the profile on an existing index — confirm before unifying.
vector_profile_name = "my-vector-config"
vector_algo_name = "my-algorithms-config"

fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SimpleField(name="file_name", type=SearchFieldDataType.String),
    SimpleField(
        name="page_number",
        type=SearchFieldDataType.Int32,
        sortable=True,
        filterable=True,
        facetable=False,
    ),
    # Full-text searchable chunk text.
    SearchableField(name="content", type=SearchFieldDataType.String),
    # Embedding of `content`; dimension count comes from the config cell so
    # the index and the embedding model are changed in one place.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=vector_dims,
        vector_search_profile_name=vector_profile_name,
    ),
    SimpleField(
        name="storage_url",
        type=SearchFieldDataType.String,
        filterable=False,
        facetable=False,
        sortable=False,
    ),
]

vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name=vector_profile_name,
            algorithm_configuration_name=vector_algo_name,
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name=vector_algo_name)],
)

# Semantic configuration: file name acts as the title, chunk text as content.
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="file_name"),
        content_fields=[SemanticField(field_name="content")],
    ),
)

# Register the semantic configuration and make it the default.
semantic_search = SemanticSearch(
    default_configuration_name="semantic-config",
    configurations=[semantic_config],
)

# Assemble the full index definition.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

client.create_or_update_index(index)

In [None]:
# ─── 1. CONFIG ────────────────────────────────────────────────────────────────
# Azure OpenAI — used to create the chunk embeddings.
AZURE_OPENAI_ENDPOINT = ""
AZURE_OPENAI_KEY = ""
AZURE_OPENAI_API_VER = "2024-10-21"
EMBED_DEPLOYMENT = ""

# Azure AI Search — index that receives the chunk documents.
SEARCH_ENDPOINT = ""
SEARCH_API_KEY = ""
SEARCH_INDEX_NAME = "jsonidx"

# Blob storage that holds the source documents.
ADLS_CONNECTION_STRING = ""  # blob endpoint
ADLS_CONTAINER_NAME = "raw"
ADLS_DIR_PREFIX = "doc/"
ADLS_STORAGE_ACCOUNT = "czvgngendif000000dsta001"
ADLS_STORAGE_KEY = ""

# Chunking.
TEXT_MIN_CHARS = 5    # chunks shorter than this many characters are skipped
CHUNK_SIZE = 2_000    # characters per chunk (original estimate: ≈ 700–800 tokens)

# ─── 2. CLIENTS ───────────────────────────────────────────────────────────────
openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VER,
)

search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(SEARCH_API_KEY),
)

# Container client is derived from the service client; both are kept as
# module-level names.
blob_service = BlobServiceClient.from_connection_string(ADLS_CONNECTION_STRING)
container = blob_service.get_container_client(ADLS_CONTAINER_NAME)
 
# ─── 3. HELPERS ───────────────────────────────────────────────────────────────
# Note: the first call into tika's `parser` downloads the Apache Tika JAR.
 
def create_embedding(text: str, model: str = EMBED_DEPLOYMENT) -> list[float]:
    """Embed one string through the module-level Azure OpenAI client.

    Args:
        text: The string to embed; it is sent as a single-item input list.
        model: Value passed as ``model`` to the embeddings call. Defaults to
            ``EMBED_DEPLOYMENT`` as it was when this function was defined.

    Returns:
        The ``embedding`` of the first (only) item in the response data.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding
 
def doc_chunks_from_bytes(doc_bytes: bytes, *, size: int = CHUNK_SIZE):
    """Split the text Tika extracts from a document into fixed-width slices.

    The extracted text has every run of whitespace collapsed to one space
    before slicing, so slices are cut purely by character count.

    Args:
        doc_bytes: Raw document bytes handed to ``parser.from_buffer``.
        size: Maximum number of characters per slice (keyword-only).

    Yields:
        ``(chunk_no, text)`` pairs. ``chunk_no`` is the 1-based position of
        the slice; a slice that is empty after stripping is not yielded and
        its number is not reused.
    """
    parsed = parser.from_buffer(doc_bytes)
    # "content" may be missing or None; treat both as empty text.
    extracted = parsed.get("content", "") or ""
    normalized = " ".join(extracted.split())

    offsets = range(0, len(normalized), size)
    for chunk_no, offset in enumerate(offsets, start=1):
        piece = normalized[offset:offset + size].strip()
        if not piece:
            continue
        yield chunk_no, piece
 
def make_sas_url(blob_name: str) -> str:
    """Build a read-only SAS URL, valid for 7 days, for a blob in the container.

    Args:
        blob_name: Blob path inside ``ADLS_CONTAINER_NAME``,
            e.g. ``"doc/report.doc"``.

    Returns:
        ``https://<account>.blob.core.usgovcloudapi.net/<container>/<blob>?<sas>``
        where the blob path is percent-encoded (``/`` is kept as-is), so names
        containing spaces, ``#``, ``?`` or ``%`` still produce a usable link.
    """
    # The raw blob name goes to the SAS generator; only the URL path below is
    # percent-encoded.
    sas_token = generate_blob_sas(
        account_name=ADLS_STORAGE_ACCOUNT,
        container_name=ADLS_CONTAINER_NAME,
        blob_name=blob_name,
        account_key=ADLS_STORAGE_KEY,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.utcnow() + timedelta(days=7),
    )
    encoded_name = quote(blob_name, safe="/")
    return (
        f"https://{ADLS_STORAGE_ACCOUNT}.blob.core.usgovcloudapi.net/"
        f"{ADLS_CONTAINER_NAME}/{encoded_name}?{sas_token}"
    )
 
# ─── 4. MAIN ──────────────────────────────────────────────────────────────────

def main():
    """Index every ``.doc`` blob under ``ADLS_DIR_PREFIX`` into the search index.

    For each matching blob: download it, split its extracted text into chunks,
    embed every chunk that has at least ``TEXT_MIN_CHARS`` characters, and
    upload one search document per chunk. Progress and totals are printed, and
    the run is finally reported through ``log_activity`` (not defined in this
    section — presumably provided by the ``%run`` logging utility).
    """
    uploaded = skipped = 0
    total_doc = 0          # .doc blobs downloaded
    total_doc_parsed = 0   # .doc blobs with at least one chunk indexed
    doc_size = 0           # total bytes downloaded
    # Naive UTC timestamps are kept as-is: what log_activity expects is not
    # visible here.
    start = datetime.utcnow()

    print("➜ Scanning ADLS container for .doc files …")
    for blob in container.list_blobs(name_starts_with=ADLS_DIR_PREFIX):
        if not blob.name.lower().endswith(".doc"):      # legacy Office format
            continue
        print(f"\n➜ Processing {blob.name}")
        doc_bytes = container.download_blob(blob.name).readall()
        total_doc += 1
        doc_size += len(doc_bytes)

        # The link depends only on the blob, so build it once per blob rather
        # than once per chunk.
        storage_url = make_sas_url(blob.name)
        page_parsed = 0
        for chunk_no, text in doc_chunks_from_bytes(doc_bytes):
            if len(text) < TEXT_MIN_CHARS:
                skipped += 1
                continue
            record = {
                "id"            : str(uuid.uuid4()),
                "file_name"     : blob.name,
                "page_number"   : chunk_no,             # “page” == chunk index
                "content"       : text,
                "content_vector": create_embedding(text),
                "storage_url"   : storage_url,
            }
            result = search_client.upload_documents([record])[0]
            if result.succeeded:
                page_parsed += 1
            else:
                # Surface rejected chunks instead of dropping them silently.
                print(
                    f"  ✗ chunk {chunk_no} of {blob.name} was not indexed: "
                    f"{result.error_message}"
                )

        uploaded += page_parsed
        if page_parsed > 0:
            total_doc_parsed += 1
        print(f"Total page parsed {page_parsed}")

    print(f"\nDone. {uploaded} chunks indexed, {skipped} chunks skipped.")
    print(
        f"Total .doc files processed: {total_doc_parsed}\n"
        f" Total .doc files in Raw: {total_doc}"
    )
    end = datetime.utcnow()
    pid = str(uuid.uuid4())
    stage = "Bronze to Silver"
    status = "Convert Doc to Ai"
    log_activity(pid, stage, status, start, end, doc_size)
    
 
# Start the ingestion run when this code executes as the top-level program
# (which includes running the cell in a notebook kernel), but not on import.
if __name__ == "__main__":
    main()

 