In this example we are going to take a look at a concept called semantic chunking and how this can be acheived with Azure AI Document Intelligence which would be using with a RAG workflow.

In [5]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from langchain.text_splitter import MarkdownHeaderTextSplitter
from azure.storage.blob import BlobSasPermissions
from azure.storage.blob import generate_blob_sas
from datetime import datetime, timedelta, UTC  # Added UTC

import os
from dotenv import load_dotenv
from pathlib import Path

# Get root directory path
root_dir = Path().absolute().parent
env_path = root_dir / '.env'

# Load .env from root
load_dotenv(dotenv_path=env_path)
print(f"Loaded .env from {env_path}")

# Azure AI Document Intelligence setup
file_path = "path/to/your/document.pdf"
endpoint = os.getenv('FORM_RECOGNIZER_ENDPOINT')
key = os.getenv('FORM_RECOGNIZER_KEY')

# Azure Storage settings
storage_account_name = os.getenv('STORAGE_ACCOUNT_NAME')
storage_account_key = os.getenv('STORAGE_ACCOUNT_KEY')
container_name = "source"
input_filename = "VendorAgreement-Fabrikam-5004432.pdf"

# Azure Blob Storage setup (assuming the document is already in a blob with a public URL)
document_url = "https://your_storage_account.blob.core.windows.net/your_container/your_document.pdf"

# Initialize the Document Intelligence client
doc_intelligence_client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))



def generate_sas_url(blob_name: str) -> str:
    """
    Generate a full URL with SAS token for a specific blob
    """
    # Define the permissions for the SAS token
    sas_permissions = BlobSasPermissions(read=True)
    
    # Set token expiry time using timezone-aware datetime
    expiry_time = datetime.now(UTC) + timedelta(hours=1)
    
    # Generate the SAS token
    sas_token = generate_blob_sas(
        account_name=storage_account_name,
        account_key=storage_account_key,
        container_name=container_name,
        blob_name=blob_name,
        permission=sas_permissions,
        expiry=expiry_time
    )
    
    # Construct the full URL including the SAS token
    blob_url = f"https://{storage_account_name}.blob.core.windows.net/{container_name}/{blob_name}?{sas_token}"
    
    return blob_url

document_url = generate_sas_url(input_filename)

# Begin analysis with the document URL
poller = doc_intelligence_client.begin_analyze_document(
    model_id="prebuilt-layout",
    analyze_request={"urlSource": document_url}
)

# Get the result of the analysis
result = poller.result()

# Extract the content as markdown
markdown_content = result.content
print(f"\nMarkdown Content:\n{markdown_content}\n")

# Define headers for splitting
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Initialize the MarkdownHeaderTextSplitter
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the document into chunks based on markdown headers
splits = text_splitter.split_text(markdown_content)

# Print the splits
for split in splits:
    print(f"Header: {split.metadata.get('header', 'No Header')}")
    print(f"Content: {split.page_content[:100]}...")  # Print first 100 
    print("---")

Loaded .env from c:\Users\rickcau\source\repos\vendor-contracts-gen-ai\.env

Markdown Content:
Vendor Contractor Agreement
Contract ID: 5004432
This Vendor Contractor Agreement ("Agreement") is entered into as of 12/7/2024 by and between:
. [Contoso Elite] (the "Company"), with its principal office located at [Address], and
· [Fabrikam Services] (the "Vendor"), with its principal office located at [Address].
1. Services
The Vendor agrees to provide the following services:
· 200 Hours of Developer Support for GenAl Contracts Project
The services must be completed by [Feb 20, 2024], as outlined in Exhibit A (Scope of Work).
2. Compensation
The Company will pay the Vendor [$20,000 US] upon completion of services. Payments will be made within [10] business days after receiving an invoice from the Vendor.
3. Term
This Agreement begins on [12/14/2024] and ends on [Feb 20, 2024].
4. Confidentiality
The Vendor agrees to maintain the confidentiality of any proprietary information provided by th