In [38]:
pip install -q openai azure-core azure-search-documents azure-storage-blob pymupdf

In [None]:
# Run the Documentum/logging_utility notebook/script in this kernel's namespace.
# NOTE(review): presumably this is what defines `log_activity`, which main()
# calls at the end of the run — confirm.
%run Documentum/logging_utility

In [None]:
import io
import uuid
import fitz
from datetime import datetime, timedelta
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.storage.blob import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SearchField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    SemanticSearch,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField
)
from azure.storage.blob import (
    BlobServiceClient,
    BlobSasPermissions,
    generate_blob_sas,
)

In [None]:
# Azure AI Search connection used when creating/updating the index definition.
service_endpoint = ""
key = ""
credential = AzureKeyCredential(key)
index_name = "pdfidx"

# Vector-search settings.
# NOTE(review): algo_name and profile_name are not referenced by the code
# visible in this notebook (the index definition uses its own literal names);
# confirm whether they are still needed.
vector_dims = 1536
algo_name = "hnsw-cosine"
profile_name = "openai-ada-profile"

In [None]:
# Create (or update) the search index: one document per PDF page, with a
# keyword-searchable text field, an embedding vector field, and a semantic
# ranking configuration.
client = SearchIndexClient(service_endpoint, credential)

# Names that link the vector field -> profile -> algorithm. Declared once so
# the field definition and the VectorSearch section cannot drift apart.
# NOTE(review): these intentionally keep the literal values this cell has
# always used; they differ from profile_name / algo_name in the config cell,
# and renaming them may not be accepted for an index that already exists —
# confirm before unifying.
vector_profile_name = "my-vector-config"
vector_algorithm_name = "my-algorithms-config"

# Define the index fields
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Source blob name; also used as the semantic title field below.
    SimpleField(name="file_name", type=SearchFieldDataType.String),
    SimpleField(
        name="page_number",
        type=SearchFieldDataType.Int32,
        sortable=True,
        filterable=True,
        facetable=False,
    ),
    # Page text, full-text searchable.
    SearchableField(name="content", type=SearchFieldDataType.String),
    # Embedding of `content`. The dimension comes from the config cell so the
    # index always matches the configured embedding size.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=vector_dims,
        vector_search_profile_name=vector_profile_name,
    ),
    # Link back to the source blob; stored only, not filterable/sortable.
    SimpleField(
        name="storage_url",
        type=SearchFieldDataType.String,
        filterable=False,
        facetable=False,
        sortable=False,
    ),
]

vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name=vector_profile_name,
            algorithm_configuration_name=vector_algorithm_name,
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name=vector_algorithm_name)],
)

# Define semantic configuration
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="file_name"),
        content_fields=[SemanticField(field_name="content")],
    ),
)

# Add semantic search to the index
semantic_search = SemanticSearch(
    default_configuration_name="semantic-config",
    configurations=[semantic_config],
)

# Update the index definition
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

# Last expression of the cell: the returned index definition is displayed.
client.create_or_update_index(index)

In [None]:
# ─── 1. CONFIG ────────────────────────────────────────────────────────────────
# Azure OpenAI: endpoint, key, API version and embedding deployment name.
AZURE_OPENAI_ENDPOINT = ""
AZURE_OPENAI_KEY = ""
AZURE_OPENAI_API_VER = "2024-10-21"
EMBED_DEPLOYMENT = "emb"

# Azure AI Search: the index that receives the page documents.
SEARCH_ENDPOINT = ""
SEARCH_API_KEY = ""
SEARCH_INDEX_NAME = "pdfidx"

# Storage: container and blob-name prefix that are scanned for PDFs.
ADLS_CONNECTION_STRING = ""  # blob endpoint
ADLS_CONTAINER_NAME = "raw"
ADLS_DIR_PREFIX = "pdf/"
# Account name and key used to sign the per-blob SAS URLs.
ADLS_STORAGE_ACCOUNT = ""
ADLS_STORAGE_KEY = ""

# Pages whose extracted text is shorter than this many characters are skipped.
TEXT_MIN_CHARS = 5
 
# ─── 2. CLIENTS ───────────────────────────────────────────────────────────────
# Azure OpenAI client; used to compute the page embeddings.
openai_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VER,
)

# Search client bound to the target index; used to upload page documents.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(SEARCH_API_KEY),
)

# Blob service and the container that holds the source PDFs.
blob_service = BlobServiceClient.from_connection_string(ADLS_CONNECTION_STRING)
container = blob_service.get_container_client(ADLS_CONTAINER_NAME)
 
# ─── 3. HELPERS ───────────────────────────────────────────────────────────────
def create_embedding(text: str, model: str = EMBED_DEPLOYMENT) -> list[float]:
    """Embed a single string through the Azure OpenAI embeddings API.

    Args:
        text: The text to embed; it is sent as a one-element input list.
        model: Value passed as `model` to the API; defaults to EMBED_DEPLOYMENT.

    Returns:
        The `embedding` of the first (only) item in the response data.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding
 
def pdf_pages_from_bytes(pdf_bytes: bytes):
    """Iterate over the pages of an in-memory PDF that contain text.

    Args:
        pdf_bytes: The raw bytes of a PDF file.

    Yields:
        (page_number, text) pairs. page_number is 1-based; text is the page's
        plain text with surrounding whitespace stripped. Pages whose stripped
        text is empty are not yielded.
    """
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page_number, page in enumerate(pdf, start=1):
            page_text = page.get_text("text").strip()
            if not page_text:
                continue
            yield page_number, page_text
def make_sas_url(file_name: str, expiry_days: int = 7) -> str:
    """Build a read-only, time-limited SAS URL for a blob in ADLS_CONTAINER_NAME.

    Args:
        file_name: Blob name (path inside the container), unencoded.
        expiry_days: Days from now until the SAS token expires. Defaults to 7.

    Returns:
        The blob URL with the SAS token appended as its query string.
    """
    # Function-scope imports: only this helper needs these names.
    from datetime import timezone
    from urllib.parse import quote

    sas = generate_blob_sas(
        account_name=ADLS_STORAGE_ACCOUNT,
        container_name=ADLS_CONTAINER_NAME,
        blob_name=file_name,  # the raw (unencoded) name is what gets signed
        account_key=ADLS_STORAGE_KEY,
        permission=BlobSasPermissions(read=True),
        # Timezone-aware UTC; datetime.utcnow() is deprecated since Python 3.12.
        expiry=datetime.now(timezone.utc) + timedelta(days=expiry_days),
    )
    # Percent-encode the path part; otherwise blob names containing spaces,
    # '#', '?' or '%' produce a broken or truncated link. '/' is kept so
    # virtual-directory separators survive.
    encoded_name = quote(file_name, safe="/")
    # NOTE(review): the host suffix below is hard-coded to the
    # usgovcloudapi.net blob endpoint — confirm this matches the storage
    # account's cloud.
    return f"https://{ADLS_STORAGE_ACCOUNT}.blob.core.usgovcloudapi.net/{ADLS_CONTAINER_NAME}/{encoded_name}?{sas}"
 
# ─── 4. MAIN ──────────────────────────────────────────────────────────────────
def main():
    """Index every PDF under ADLS_DIR_PREFIX into the search index, one document per page.

    For each blob whose name ends in ".pdf" (case-insensitive) the file is
    downloaded and split into pages; every page with at least TEXT_MIN_CHARS
    characters of text is embedded and uploaded to the search index. A summary
    is printed and the run is recorded through log_activity.

    Document ids are derived deterministically from container, blob name and
    page number, so re-running the notebook overwrites the same documents
    instead of adding a duplicate set on every run.
    """
    uploaded = skipped = failed = 0
    total_pdf = 0          # PDFs found under the prefix
    total_pdf_parsed = 0   # PDFs with at least one page indexed
    pdf_size = 0           # total bytes downloaded
    # NOTE(review): naive UTC timestamps are kept because they are handed to
    # log_activity, whose expectations are not visible here.
    start = datetime.utcnow()
    print("➜ Scanning ADLS container for PDFs …")

    for blob in container.list_blobs(name_starts_with=ADLS_DIR_PREFIX):
        if not blob.name.lower().endswith(".pdf"):
            continue
        # Count only after the extension filter so non-PDF entries under the
        # prefix are not reported as PDFs.
        total_pdf += 1

        print(f"\n➜ Processing {blob.name}")
        pdf_bytes = container.download_blob(blob.name).readall()
        pdf_size += len(pdf_bytes)

        page_parsed = 0
        storage_url = None  # built lazily, once per PDF instead of once per page
        for page_no, text in pdf_pages_from_bytes(pdf_bytes):
            if len(text) < TEXT_MIN_CHARS:
                skipped += 1
                continue

            if storage_url is None:
                storage_url = make_sas_url(blob.name)

            # Stable key: the same page of the same blob always maps to the
            # same id, so uploading again replaces the existing document.
            doc_id = uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"{ADLS_CONTAINER_NAME}/{blob.name}#page={page_no}",
            )
            doc = {
                "id":             str(doc_id),
                "file_name":      blob.name,
                "page_number":    page_no,
                "content":        text,
                "content_vector": create_embedding(text),
                "storage_url":    storage_url,
            }
            result = search_client.upload_documents([doc])[0]
            if result.succeeded:
                page_parsed += 1
                uploaded += 1
            else:
                # Surface rejected documents instead of dropping them silently.
                failed += 1
                print(f"✗ Page {page_no} of {blob.name} was not indexed: {result.error_message}")

        if page_parsed > 0:
            total_pdf_parsed += 1
        print(f"Total page parsed {page_parsed}")

    end = datetime.utcnow()
    pid = str(uuid.uuid4())  # random id for this run's log entry
    stage = "Bronze to Silver"
    status = "Convert PDF to AI"
    log_activity(pid, stage, status, start, end, pdf_size)
    print(f"\nDone. {uploaded} pages indexed, {skipped} pages skipped.")
    if failed:
        print(f"{failed} pages failed to upload.")
    print(f"Total PDF processed: {total_pdf_parsed}\n Total PDF in Raw: {total_pdf}" )
 
# Entry point: run the ingestion only when this code executes as the
# top-level module (__name__ == "__main__").
if __name__ == "__main__":
    main()