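This notebook reads contract PDFs from an Azure Blob Storage container, extracts their text with Azure Document Intelligence, uses an Azure OpenAI model to pull structured contract fields out of that text, and uploads the result (together with an embedding of the text) to an Azure AI Search index.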

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
from pathlib import Path

# Get root directory path (the parent of the directory the notebook runs from)
root_dir = Path().absolute().parent
env_path = root_dir / '.env'

# Load .env from root; load_dotenv returns False when no variables were loaded,
# so only claim success when something was actually read.
if load_dotenv(dotenv_path=env_path):
    print(f"Loaded .env from {env_path}")
else:
    print(f"Warning: no variables loaded from {env_path}")


def _mask_secret(value, visible=4):
    """Return `value` with everything but its first/last `visible` characters hidden.

    Unset values are reported as "<not set>" instead of raising, and values too
    short to mask safely are hidden completely so they are never echoed in full.
    """
    if not value:
        return "<not set>"
    if len(value) <= visible * 2:
        return "*" * 5
    return value[:visible] + "*" * 5 + value[-visible:]


# Access variables
# Azure Storage settings (optional at this point, hence os.getenv)
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
storage_account_key = os.getenv("STORAGE_ACCOUNT_KEY")  # Add your storage account key here
container_name = "source"

# Azure AI Search settings (required: a missing variable raises KeyError here)
ai_search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
ai_search_key = os.environ["AZURE_SEARCH_KEY"]
ai_search_admin_key = os.environ["AZURE_SEARCH_ADMIN_KEY"]
ai_search_index = "rdc-contacts-v1"

# Echo the configuration, masking secrets so keys never appear in saved output
print(f"storage_account_name: {storage_account_name}")
print(f"storage acct Key: {_mask_secret(storage_account_key)}")
print(f"container_name: {container_name}")
print(f"ai_search_endpoint: {ai_search_endpoint}")
print(f"ai_search_key: {_mask_secret(ai_search_key)}")
print(f"ai_search_index: {ai_search_index}")

Let's load some very important data from the .env and create a couple of references to our LLMs for use later.

In [None]:
from azure.core.credentials import AzureKeyCredential
from langchain_openai import AzureChatOpenAI
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob import generate_blob_sas
from azure.storage.blob import BlobSasPermissions
from azure.search.documents import SearchClient  
from openai import AzureOpenAI 
from datetime import datetime, timedelta, UTC  # Added UTC
import json
import hashlib

# Azure Document Intelligence settings
endpoint = os.getenv('FORM_RECOGNIZER_ENDPOINT')
key = os.getenv('FORM_RECOGNIZER_KEY')

# Azure Storage settings
storage_account_connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
storage_account_name = os.getenv('STORAGE_ACCOUNT_NAME')
storage_account_key = os.getenv('STORAGE_ACCOUNT_KEY')
container_name = "source"

# Azure OpenAI
# The key used to be read from two different variables (AZURE_OPENAI_API_KEY for
# the chat models, AZURE_OPENAI_KEY for the embeddings client). Each client keeps
# its original variable as first choice but falls back to the other one, so a
# .env that defines only one of them configures both clients.
aoai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
aoai_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_OPENAI_KEY")
aoai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Raw OpenAI client, used for embeddings
aoai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_key=os.getenv("AZURE_OPENAI_KEY") or aoai_key,
    api_version="2023-05-15",
)

# Settings shared by both chat-model handles
_primary_llm_settings = dict(
    azure_deployment=aoai_deployment,
    api_version="2024-05-01-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

# Plain chat model
primary_llm = AzureChatOpenAI(**_primary_llm_settings)

# Same model, but constrained to return a JSON object
primary_llm_json = AzureChatOpenAI(
    **_primary_llm_settings,
    model_kwargs={"response_format": {"type": "json_object"}},
)

# Index writes require the admin key rather than the query key
search_client = SearchClient(ai_search_endpoint, ai_search_index, AzureKeyCredential(ai_search_admin_key))

Let's create the prompt that the LLM will use to extract the field data for the index. It will be used later.

In [None]:
contract_extraction_prompt = """You are an AI assistant. Your job is to read the input contract, 
and output certain info in valid JSON format. Here is what you should be extracting:
1. id - unique identifier for the document
2. contractId - unique identifier for the contract
3. vendorName - name of the vendor/supplier/contractor
4. clientName - name of the client/customer
5. contractTitle - title of the contract document
6. effectiveDate - when the contract becomes effective
7. endDate - when the contract expires
8. signingDate - when the contract was signed
9. status - current status of the contract (e.g., Active, Expired, Pending)
10. compensation - monetary value of the contract
11. terminationTerms - terms of termination
12. paymentTerms - terms of payment
13. currency - currency used in the contract
14. parentContractId - ID of the parent contract (if this is an amendment)
15. amendmentNumber - amendment number (if applicable)
16. sourceFileName - name of the source file

#Examples#
User: MASTER SERVICES AGREEMENT
This Master Services Agreement (the "Agreement") is made effective as of January 15, 2024 (the "Effective Date"), by and between:
TechCorp Solutions Inc. ("Vendor")
123 Tech Lane
Silicon Valley, CA 94025
and
Global Enterprise Ltd. ("Client")
456 Business Park
New York, NY 10001
Contract ID: MSA-2024-001
1. SERVICES
The Vendor agrees to provide software development services as outlined in Exhibit A.
2. TERM
This Agreement shall commence on the Effective Date and continue for a period of 24 months, ending on January 15, 2026.
3. COMPENSATION
Client agrees to pay Vendor a total of $250,000 USD for the services rendered.
4. PAYMENT TERMS
Payment shall be made in monthly installments of $10,416.67, due within 30 days of invoice date.
Signed and executed on: January 10, 2024
Status: Active
File: MSA_TechCorp_Global_2024.pdf

Assistant: {
'id': 'DOC-20240115-001',
'contractId': 'MSA-2024-001',
'vendorName': 'TechCorp Solutions Inc.',
'clientName': 'Global Enterprise Ltd.',
'contractTitle': 'Master Services Agreement',
'effectiveDate': '2024-01-15',
'endDate': '2026-01-15',
'signingDate': '2024-01-10',
'status': 'Active',
'compensation': 250000,
'terminationTerms': 'This Agreement shall commence on the Effective Date and continue for a period of 24 months, ending on January 15, 2026',
'paymentTerms': 'Payment shall be made in monthly installments of $10,416.67, due within 30 days of invoice date',
'currency': 'USD',
'parentContractId': null,
'amendmentNumber': null,
'sourceFileName': 'MSA_TechCorp_Global_2024.pdf'
}
"""



Let's create the functions needed to extract the fields and to handle a few other important tasks. Creating or updating an index requires the admin key.

In [None]:

def llm_extraction(full_text):
    """Ask the JSON-mode chat model to extract contract fields from `full_text`.

    The extraction prompt is sent as the system message and the document text
    as the user message; the reply content is parsed with `json.loads` and the
    parsed object is returned.
    """
    conversation = [
        {"role": "system", "content": contract_extraction_prompt},
        {"role": "user", "content": full_text},
    ]
    llm_reply = primary_llm_json.invoke(conversation)
    return json.loads(llm_reply.content)

def generate_sas_url(blob_name: str) -> str:
    """
    Generate a full URL with SAS token for a specific blob.

    Args:
        blob_name: Name of the blob inside `container_name`.

    Returns:
        An https URL for the blob carrying a read-only SAS token that expires
        one hour after this call.
    """
    from urllib.parse import quote

    # Define the permissions for the SAS token
    sas_permissions = BlobSasPermissions(read=True)

    # Set token expiry time using timezone-aware datetime
    expiry_time = datetime.now(UTC) + timedelta(hours=1)

    # Generate the SAS token (signed over the raw, unescaped blob name)
    sas_token = generate_blob_sas(
        account_name=storage_account_name,
        account_key=storage_account_key,
        container_name=container_name,
        blob_name=blob_name,
        permission=sas_permissions,
        expiry=expiry_time
    )

    # Percent-encode the blob name for the URL path: names containing spaces,
    # '#', '?' or '%' would otherwise yield a malformed or truncated URL.
    # '/' stays unescaped so virtual folder paths are preserved.
    encoded_blob_name = quote(blob_name, safe="/")

    # Construct the full URL including the SAS token
    blob_url = f"https://{storage_account_name}.blob.core.windows.net/{container_name}/{encoded_blob_name}?{sas_token}"

    return blob_url

def read_pdf(input_file: str) -> str:
    """
    Run Azure Document Intelligence over a blob and return the analysis content.

    `input_file` is the blob name; it is turned into a SAS URL and submitted to
    the "prebuilt-read" model. Any exception is logged and then re-raised.
    """
    try:
        # The service fetches the document itself, so it needs an authenticated URL
        sas_url = generate_sas_url(input_file)

        print("Starting document analysis...")

        client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))

        # Submit the request and block until the long-running operation finishes
        analysis = client.begin_analyze_document(
            model_id="prebuilt-read",
            analyze_request={"urlSource": sas_url},
        ).result()

        print("Successfully analyzed document")
        return analysis.content

    except Exception as e:
        print(f"Error analyzing document: {str(e)}")
        raise

def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding of `text` from the Azure OpenAI embeddings endpoint.

    `model` is passed through as the `model` argument of the embeddings call
    (the original comment describes it as the deployment name).
    """
    response = aoai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# I don't think I will use this function
def get_creation_date(pdf_file):
    """Read the '/CreationDate' metadata entry of a local PDF file.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        The creation timestamp formatted as "%Y-%m-%dT%H:%M:%S.000000Z", or
        None when the PDF has no metadata or no creation date.

    Raises:
        ValueError: If the date does not start with 14 digits (YYYYMMDDHHMMSS).
    """
    # PyPDF2 is not imported anywhere else in the notebook; importing it here
    # keeps this (currently unused) helper from failing with a NameError.
    import PyPDF2

    print(f"Attempting to open {pdf_file}")
    with open(pdf_file, 'rb') as f:
        pdf = PyPDF2.PdfReader(f)
        info = pdf.metadata
        print(info)
        raw_date = info.get('/CreationDate') if info else None

    if not raw_date:
        return None

    # Strip the 'D:' prefix only if it exists, then keep the first 14
    # characters (YYYYMMDDHHMMSS).
    pdf_date_str = raw_date[2:] if raw_date.startswith('D:') else raw_date
    pdf_date_str = pdf_date_str[:14]

    pdf_datetime = datetime.strptime(pdf_date_str, '%Y%m%d%H%M%S')

    # NOTE(review): anything after the 14 date/time characters (e.g. a timezone
    # offset) is discarded, yet the result is labelled 'Z' — confirm that the
    # timestamps are really meant to be treated as UTC.
    return pdf_datetime.strftime("%Y-%m-%dT%H:%M:%S.000000Z")

def generate_document_id(blob_name):
    """Derive a deterministic document ID: the MD5 hex digest of the blob name."""
    # Only the blob name feeds the hash; document content is not involved.
    name_bytes = str(blob_name).encode()
    digest = hashlib.md5(name_bytes)
    return digest.hexdigest()

def list_blobs_in_folder(container_client, folder_name):
    """Return a list of the container's blobs whose name starts with `folder_name`."""
    matching_blobs = []
    for candidate in container_client.list_blobs():
        if candidate.name.startswith(folder_name):
            matching_blobs.append(candidate)
    return matching_blobs

def move_blob(source_container_client, destination_container_client, source_blob_name, destination_blob_name,
              poll_interval=1.0, timeout=300.0):
    """Move a blob by copying it to the destination and then deleting the source.

    The source is deleted only after the copy is reported as successful, so a
    failed or still-running copy can never cause the only copy to be removed.

    Args:
        source_container_client: Container client holding the source blob.
        destination_container_client: Container client receiving the copy.
        source_blob_name: Name of the blob to move.
        destination_blob_name: Name of the blob to create.
        poll_interval: Seconds to wait between copy-status checks.
        timeout: Maximum number of seconds to wait for the copy to finish.

    Raises:
        ValueError: If source and destination are the same blob.
        RuntimeError: If the copy ends in any state other than success.
        TimeoutError: If the copy does not finish within `timeout` seconds.
    """
    import time

    source_blob = source_container_client.get_blob_client(source_blob_name)
    destination_blob = destination_container_client.get_blob_client(destination_blob_name)

    # Copying a blob onto itself and then deleting the "source" would destroy
    # the only copy, so refuse instead.
    if source_blob.url == destination_blob.url:
        raise ValueError(
            f"Refusing to move blob '{source_blob_name}' onto itself; the blob was left in place"
        )

    # start_copy_from_url only starts the copy; it may still be pending when it returns
    copy_properties = destination_blob.start_copy_from_url(source_blob.url)
    copy_status = (copy_properties or {}).get("copy_status")

    deadline = time.monotonic() + timeout
    while copy_status != "success":
        if copy_status in ("failed", "aborted"):
            raise RuntimeError(
                f"Copy of '{source_blob_name}' to '{destination_blob_name}' ended with status '{copy_status}'"
            )
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Copy of '{source_blob_name}' to '{destination_blob_name}' did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)
        copy_status = destination_blob.get_blob_properties().copy.status

    source_blob.delete_blob()

# (key in the LLM output / index document, label used in the progress log)
_INDEXED_CONTRACT_FIELDS = (
    ("contractId", "contract ID"),
    ("vendorName", "vendor name"),
    ("clientName", "client name"),
    ("contractTitle", "contract title"),
    ("effectiveDate", "effective date"),
    ("endDate", "end date"),
    ("signingDate", "signing date"),
    ("compensation", "compensation"),
    ("parentContractId", "parent contract ID"),
    ("amendmentNumber", "amendment number"),
)


def _processed_blob_name(blob_name):
    """Return the name a blob gets once it is filed under 'processed/'."""
    moved_name = blob_name.replace("source/", "processed/")
    if moved_name == blob_name:
        # No 'source/' segment in the name (e.g. a blob at the container root):
        # prefix it, otherwise the "move" would target the blob itself.
        moved_name = "processed/" + blob_name
    return moved_name


def populate_index():
    """Index every unprocessed blob of the container in Azure AI Search.

    For each blob: extract the text, ask the LLM for the contract fields, embed
    the text, upload one document to the search index and finally move the blob
    under the 'processed/' prefix. A failure on one blob is logged and does not
    stop the remaining blobs from being processed.

    Raises:
        ValueError: If the storage connection string is not configured.
    """
    print("Populating index...")
    if not storage_account_connection_string:
        raise ValueError("STORAGE_ACCOUNT_CONNECTION_STRING is not set")
    blob_service_client = BlobServiceClient.from_connection_string(storage_account_connection_string)
    # The connection string embeds the account key, so it is deliberately not printed.
    container_client = blob_service_client.get_container_client(container_name)
    print(f"Container Name: {container_name}")

    blobs = list(container_client.list_blobs())
    print(f"Found {len(blobs)} blobs in the container")
    # Processed files are moved within the same container, so skip them here;
    # otherwise they would be indexed again on every run.
    stage_blobs = [blob for blob in blobs if not blob.name.startswith("processed/")]
    print(f"Found {len(stage_blobs)} blobs awaiting processing")

    for blob in stage_blobs:
        print(f"Processing {blob.name}")

        try:
            full_text = read_pdf(blob.name)
            extraction_json = llm_extraction(full_text)

            document = {"id": generate_document_id(blob.name)}
            for field_name, label in _INDEXED_CONTRACT_FIELDS:
                # .get(): the model may omit keys that do not apply
                # (e.g. parentContractId); a missing key is indexed as null.
                document[field_name] = extraction_json.get(field_name)
                print(f"Extracted {label}: {document[field_name]}")

            source_file_name = os.path.basename(blob.name)
            print(f"Extracted source file name: {source_file_name}")

            # NOTE(review): status, terminationTerms, paymentTerms, currency and
            # the source file name are requested from the LLM but were never
            # part of the uploaded document — confirm against the index schema
            # before adding them.
            document["content"] = full_text
            document["searchVector"] = generate_embeddings(full_text)

            # upload_documents reports per-document failures in its result, so
            # check it before moving the file out of the processing queue.
            upload_results = search_client.upload_documents(documents=[document])
            failed_uploads = [result for result in upload_results if not result.succeeded]
            if failed_uploads:
                raise RuntimeError(
                    f"Index upload failed: {failed_uploads[0].error_message}"
                )

            # Move the processed file to the 'processed' folder
            destination_blob_name = _processed_blob_name(blob.name)
            move_blob(container_client, container_client, blob.name, destination_blob_name)

            print(f"Successfully processed and moved {blob.name}")

        except Exception as e:
            # Best effort: report the failure and continue with the next blob
            print(f"Error processing {blob.name}: {str(e)}")

# Example usage
if __name__ == "__main__":
    try:
        populate_index()
    except Exception as e:
        print(f"Failed to populate the index: {str(e)}")
    else:
        # populate_index() is synchronous: by the time it returns, the run is
        # over, so report completion rather than "has started".
        print("\nPopulating Index has finished.")