Now we will add the LLM into the mix and extract some fields from the contents of the file.

In [1]:
import os, getpass
from langchain_openai import AzureChatOpenAI

from dotenv import load_dotenv
import os, getpass
from pathlib import Path

import json

def _set_env(var: str) -> None:
    """Make sure the environment variable ``var`` has a value.

    When ``os.environ`` holds no non-empty value for ``var``, the value is
    read with ``getpass.getpass`` (prompt ``"<var>: "``) and stored in
    ``os.environ``. An existing non-empty value is left untouched.
    """
    already_set = bool(os.environ.get(var))
    if already_set:
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

# Root directory is taken to be the parent of the current working directory.
root_dir = Path().absolute().parent
env_path = root_dir / '.env'

# Load .env from root
load_dotenv(dotenv_path=env_path)
print(f"Loaded .env from {env_path}")

# Access variables (os.getenv returns None for anything that is not set)
api_key = os.getenv('AZURE_OPENAI_API_KEY')
debug = os.getenv('DEBUG')
more_research = os.getenv('MORE_RESEARCH')

# Print a masked preview of the key as a sanity check.
# - A missing key must not crash the cell: slicing None raises TypeError.
# - A key of 8 characters or fewer would be fully revealed by the head/tail
#   slices, so it is masked completely instead.
if not api_key:
    print("API Key: <not set> (AZURE_OPENAI_API_KEY is missing or empty)")
elif len(api_key) <= 8:
    print(f"API Key: {'*' * len(api_key)}")
else:
    print(f"API Key: {api_key[:4] + '*' * 28 + api_key[-4:]}")

Loaded .env from c:\Users\rickcau\source\repos\vendor-contracts-gen-ai\.env
API Key: 8PVz****************************Isv1


Next, let's load some very important endpoint data and create a couple of LLM references for use later.

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob import generate_blob_sas
from azure.storage.blob import BlobSasPermissions
from datetime import datetime, timedelta, UTC  # Added UTC

# --- Azure Document Intelligence ---
# Endpoint and key come from the FORM_RECOGNIZER_* environment variables.
endpoint = os.getenv('FORM_RECOGNIZER_ENDPOINT')
key = os.getenv('FORM_RECOGNIZER_KEY')

# --- Azure Storage ---
# Account name/key come from the environment; the container name is fixed.
storage_account_name = os.getenv('STORAGE_ACCOUNT_NAME')
storage_account_key = os.getenv('STORAGE_ACCOUNT_KEY')
container_name = "source"

# --- Azure OpenAI ---
aoai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
aoai_key = os.getenv("AZURE_OPENAI_API_KEY")
aoai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Both chat clients share every constructor argument except ``model_kwargs``,
# so the common settings are spelled out once.
_shared_llm_settings = dict(
    azure_deployment=aoai_deployment,
    api_version="2024-05-01-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

# General-purpose client.
primary_llm = AzureChatOpenAI(**_shared_llm_settings)

# Same configuration, plus a JSON-object response format.
primary_llm_json = AzureChatOpenAI(
    **_shared_llm_settings,
    model_kwargs={"response_format": {"type": "json_object"}},
)

Let's create a prompt that can be used by the LLM to extract data from the contents provided by Document Intelligence.

In [4]:
contract_extraction_prompt = """You are an AI assistant. Your job is to read the input contract, 
and output certain info in valid JSON format. Here is what you should be extracting:
1. id - unique identifier for the document
2. contractId - unique identifier for the contract
3. vendorName - name of the vendor/supplier/contractor
4. clientName - name of the client/customer
5. contractTitle - title of the contract document
6. effectiveDate - when the contract becomes effective
7. endDate - when the contract expires
8. signingDate - when the contract was signed
9. status - current status of the contract (e.g., Active, Expired, Pending)
10. compensation - monetary value of the contract
11. paymentTerms - terms of payment
12. currency - currency used in the contract
13. parentContractId - ID of the parent contract (if this is an amendment)
14. amendmentNumber - amendment number (if applicable)
15. date - current date
16. sourceFileName - name of the source file

#Examples#
User: MASTER SERVICES AGREEMENT
This Master Services Agreement (the "Agreement") is made effective as of January 15, 2024 (the "Effective Date"), by and between:
TechCorp Solutions Inc. ("Vendor")
123 Tech Lane
Silicon Valley, CA 94025
and
Global Enterprise Ltd. ("Client")
456 Business Park
New York, NY 10001
Contract ID: MSA-2024-001
1. SERVICES
The Vendor agrees to provide software development services as outlined in Exhibit A.
2. TERM
This Agreement shall commence on the Effective Date and continue for a period of 24 months, ending on January 15, 2026.
3. COMPENSATION
Client agrees to pay Vendor a total of $250,000 USD for the services rendered.
4. PAYMENT TERMS
Payment shall be made in monthly installments of $10,416.67, due within 30 days of invoice date.
Signed and executed on: January 10, 2024
Status: Active
File: MSA_TechCorp_Global_2024.pdf

Assistant: {
'id': 'DOC-20240115-001',
'contractId': 'MSA-2024-001',
'vendorName': 'TechCorp Solutions Inc.',
'clientName': 'Global Enterprise Ltd.',
'contractTitle': 'Master Services Agreement',
'effectiveDate': '2024-01-15',
'endDate': '2026-01-15',
'signingDate': '2024-01-10',
'status': 'Active',
'compensation': 250000,
'paymentTerms': 'Payment shall be made in monthly installments of $10,416.67, due within 30 days of invoice date',
'currency': 'USD',
'parentContractId': null,
'amendmentNumber': null,
'date': '2024-12-07',
'sourceFileName': 'MSA_TechCorp_Global_2024.pdf'
}
"""



Now, let's use the LLM to extract the data from the file into a JSON object, which could then be used to insert a row into SQL.

In [5]:

def llm_extraction(full_text):
    """Ask the JSON-mode LLM to extract contract fields from ``full_text``.

    ``contract_extraction_prompt`` is sent as the system message and
    ``full_text`` as the user message to ``primary_llm_json``; the reply's
    ``content`` is then decoded with ``json.loads``.

    Args:
        full_text: Contract text to place in the user message.

    Returns:
        The decoded JSON reply, exactly as ``json.loads`` produces it.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    conversation = [
        {"role": "system", "content": contract_extraction_prompt},
        {"role": "user", "content": full_text},
    ]
    reply = primary_llm_json.invoke(conversation)
    return json.loads(reply.content)

def generate_sas_url(blob_name: str) -> str:
    """Build a read-only, SAS-signed HTTPS URL for one blob.

    The SAS token is generated with ``generate_blob_sas`` from the
    module-level ``storage_account_name``, ``storage_account_key`` and
    ``container_name``. It grants read permission only and expires one hour
    after the call (timezone-aware UTC).

    Args:
        blob_name: Name of the blob inside ``container_name``. It may contain
            ``/`` (virtual folders) and characters that are not URL-safe.

    Returns:
        ``https://<account>.blob.core.windows.net/<container>/<blob>?<sas>``
        with the blob name percent-encoded.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from urllib.parse import quote

    # The token is signed over the raw (unencoded) blob name.
    sas_token = generate_blob_sas(
        account_name=storage_account_name,
        account_key=storage_account_key,
        container_name=container_name,
        blob_name=blob_name,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.now(UTC) + timedelta(hours=1),
    )

    # The URL path must be percent-encoded: a blob name with spaces, '#', '?'
    # or '%' would otherwise give a malformed or misrouted URL. '/' is kept so
    # virtual-folder separators survive. URL-safe names are left unchanged.
    encoded_blob_name = quote(blob_name, safe="/")

    return (
        f"https://{storage_account_name}.blob.core.windows.net/"
        f"{container_name}/{encoded_blob_name}?{sas_token}"
    )

def read_pdf(input_file: str) -> str:
    """Run Azure Document Intelligence on a blob and return its text content.

    Args:
        input_file: Blob name passed to ``generate_sas_url`` to obtain the
            document URL.

    Returns:
        The ``content`` attribute of the analysis result.

    Raises:
        Exception: Any error raised along the way is printed and re-raised
            unchanged.
    """
    try:
        # The document is handed to the service as a SAS-signed URL.
        sas_url = generate_sas_url(input_file)

        print("Starting document analysis...")

        # Key-based client built from the module-level endpoint and key.
        client = DocumentIntelligenceClient(endpoint, AzureKeyCredential(key))

        # Start the analysis with the "prebuilt-read" model, then wait on the
        # poller for the final result.
        analysis = client.begin_analyze_document(
            model_id="prebuilt-read",
            analyze_request={"urlSource": sas_url},
        ).result()

        print("Successfully analyzed document")
        return analysis.content

    except Exception as exc:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Error analyzing document: {exc!s}")
        raise

# Example usage
if __name__ == "__main__":
    # End-to-end demo: read one contract PDF from blob storage, print its
    # text, then print the fields the LLM extracted from it.
    # ``content`` and ``jsonresponse`` stay module-level names so they remain
    # available after this runs.
    try:
        content = read_pdf("VendorAgreement-Fabrikam-5004432.pdf")
        print("\nExtracted content:", content, sep="\n")

        jsonresponse = llm_extraction(content)
        print("\nExtracted JSON:", jsonresponse, sep="\n")
    except Exception as err:
        print(f"Failed to process document: {err!s}")

Starting document analysis...
Successfully analyzed document

Extracted content:
Vendor Contractor Agreement
Contract ID: 5004432
This Vendor Contractor Agreement ("Agreement") is entered into as of 12/7/2024 by and between:
. [Contoso Elite] (the "Company"), with its principal office located at [Address], and
· [Fabrikam Services] (the "Vendor"), with its principal office located at [Address].
1. Services
The Vendor agrees to provide the following services:
. 200 Hours of Developer Support for GenAl Contracts Project
The services must be completed by [Feb 20, 2024], as outlined in Exhibit A (Scope of Work).
2. Compensation
The Company will pay the Vendor [$20,000 US] upon completion of services. Payments will be made within [10] business days after receiving an invoice from the Vendor.
3. Term
This Agreement begins on [12/14/2024] and ends on [Feb 20, 2024].
4. Confidentiality
The Vendor agrees to maintain the confidentiality of any proprietary information provided by the Company and 