In [9]:
# openai_client.py

import time
import os
import json
import logging
from dotenv import load_dotenv
from azure.identity import ClientSecretCredential
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from openai import AzureOpenAI

# Load environment variables (python-dotenv) before any os.getenv() lookup
# below, so values supplied via a .env file are visible to them.
load_dotenv()

# Configure logging: one module-level logger, set to pass DEBUG and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Azure Credentials: client-secret credential built from three environment
# variables. os.getenv() yields None for any variable that is not set.
credential = ClientSecretCredential(
    tenant_id=os.getenv("AZURE_TENANT_ID"),
    client_id=os.getenv("AZURE_CLIENT_ID"),
    client_secret=os.getenv("AZURE_CLIENT_SECRET"),
)

# Globals
# TOKEN_USAGE: running token totals keyed by the `additional_message` label
# (updated in _call_openai).
TOKEN_USAGE = {}
# TOTAL_API_CALL: incremented in _call_openai after each completion request
# that returns without raising.
TOTAL_API_CALL = 0
# Default deployment name; read once, when this cell/module is executed.
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Rebuild model to fix potential Pydantic issues
AzureChatOpenAI.model_rebuild()

# Initialize Chat LLM client
def llm():
    """Build a LangChain ``AzureChatOpenAI`` chat client.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and passes it to the client as its API key.
    Deployment, API version and endpoint are read from the environment.

    Returns:
        AzureChatOpenAI: client configured with ``temperature=0``,
        ``max_tokens=None``, ``timeout=None`` and ``max_retries=2``.
    """
    token_scope = "https://cognitiveservices.azure.com/.default"
    bearer = credential.get_token(token_scope).token

    client_options = {
        "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "temperature": 0,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
        "openai_api_key": bearer,
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    }
    return AzureChatOpenAI(**client_options)

# Initialize base OpenAI client
def openai_llm():
    """Build a base ``AzureOpenAI`` SDK client.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and passes it to the client as its API key.
    Endpoint and API version are read from the environment.

    Returns:
        AzureOpenAI: a newly constructed client.
    """
    token_scope = "https://cognitiveservices.azure.com/.default"
    bearer = credential.get_token(token_scope).token

    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    version = os.getenv("AZURE_API_VERSION")
    return AzureOpenAI(azure_endpoint=endpoint, api_version=version, api_key=bearer)

# Initialize embeddings client
def embeddings():
    """Build a LangChain ``AzureOpenAIEmbeddings`` client.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and passes it to the client as its API key.
    The embedding deployment and endpoint come from their own
    ``EMBEDDING_*`` environment variables; the API version is shared with
    the chat clients.

    Returns:
        AzureOpenAIEmbeddings: a newly constructed client.
    """
    token_scope = "https://cognitiveservices.azure.com/.default"
    bearer = credential.get_token(token_scope).token

    client_options = {
        "model": os.getenv("EMBEDDING_AZURE_OPENAI_DEPLOYMENT"),
        "azure_endpoint": os.getenv("EMBEDDING_AZURE_OPENAI_ENDPOINT"),
        "openai_api_version": os.getenv("AZURE_API_VERSION"),
        "api_key": bearer,
    }
    return AzureOpenAIEmbeddings(**client_options)

# Token usage logger
def add_token_usage_logs(llm_output, message=""):
    """Log the token-usage counters of a response and return the total.

    Args:
        llm_output: Response object exposing ``to_dict()``. The resulting
            dict is expected to hold a ``"usage"`` mapping of counter name
            to value.
        message: Text placed in front of the counters in the log line.

    Returns:
        The ``"total_tokens"`` entry of the usage mapping, or ``0`` when the
        usage mapping or that entry is missing or null.
    """
    # Serialize once. `or {}` also covers a payload where "usage" is present
    # but null, which `.get("usage", {})` would pass through as None and then
    # fail on `.items()`.
    usage = llm_output.to_dict().get("usage") or {}

    token_usage_string = "".join(f"{key}: {value} | " for key, value in usage.items())
    logger.info(f"{message} {token_usage_string}")

    # `or 0` keeps the result numeric even if total_tokens is explicitly null,
    # so callers can add it to a running total.
    return usage.get("total_tokens") or 0

# Send OpenAI call with sys + user prompt
def openai_call(sys_prompt, prompt_struc, deployment_name=AZURE_OPENAI_DEPLOYMENT, additional_message=""):
    """Send a system + user prompt pair through ``_call_openai``.

    Args:
        sys_prompt: Content of the ``system`` message.
        prompt_struc: Content of the ``user`` message.
        deployment_name: Deployment passed as the model name; defaults to the
            value of ``AZURE_OPENAI_DEPLOYMENT`` captured at definition time.
        additional_message: Caller label forwarded to ``_call_openai``.

    Returns:
        Whatever ``_call_openai`` returns for the two-message conversation.
    """
    system_turn = {"role": "system", "content": sys_prompt}
    user_turn = {"role": "user", "content": prompt_struc}
    return _call_openai([system_turn, user_turn], deployment_name, additional_message)

# Send OpenAI call with custom message structure
def openai_call_only_message(messages, deployment_name=AZURE_OPENAI_DEPLOYMENT, additional_message=""):
    """Send a caller-built message list through ``_call_openai`` unchanged.

    Args:
        messages: Chat messages, forwarded as-is.
        deployment_name: Deployment passed as the model name; defaults to the
            value of ``AZURE_OPENAI_DEPLOYMENT`` captured at definition time.
        additional_message: Caller label forwarded to ``_call_openai``.

    Returns:
        Whatever ``_call_openai`` returns.
    """
    reply = _call_openai(messages, deployment_name, additional_message)
    return reply

# Retry wrapper for OpenAI calls
def _call_openai(messages, deployment_name, additional_message, max_retries=5, retry_delay=5):
    """Request a chat completion, retrying on errors and on empty replies.

    Every attempt, including the first, sleeps ``retry_delay`` seconds, builds
    a new client with ``openai_llm()`` and requests a completion at
    temperature 0. A non-empty reply is printed, its token usage is logged and
    added to ``TOKEN_USAGE[additional_message]``, and the text is returned.

    Args:
        messages: Chat messages to send.
        deployment_name: Deployment passed as ``model``.
        additional_message: Caller label; used in log lines and as the
            ``TOKEN_USAGE`` key.
        max_retries: Maximum number of attempts (default 5).
        retry_delay: Seconds to sleep before each attempt (default 5).

    Returns:
        The reply text, or ``None`` if no attempt produced non-empty content.
        Exceptions raised during an attempt are logged, not propagated.
    """
    global TOTAL_API_CALL
    for current_retry in range(max_retries):
        attempt = current_retry + 1
        try:
            time.sleep(retry_delay)
            logger.info(f"Calling OpenAI API with deployment: {deployment_name}, Retry: {attempt}")
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            start_time = time.perf_counter()
            response = openai_llm().chat.completions.create(
                model=deployment_name,
                messages=messages,
                temperature=0,
                # response_format removed for non-JSON prompt compatibility
            )
            TOTAL_API_CALL += 1
            duration = round(time.perf_counter() - start_time, 3)
            first_choice = response.choices[0]
            output = first_choice.message.content
            if not output:
                # Previously this path looped silently; record why the attempt
                # was discarded so repeated empty replies can be diagnosed.
                finish_reason = getattr(first_choice, "finish_reason", None)
                logger.warning(
                    f"OpenAI returned empty content | Attempt {attempt}/{max_retries} | "
                    f"Func: {additional_message} | finish_reason: {finish_reason}"
                )
                continue

            msg = f"OPENAI CALL MESSAGE: Retry {attempt} | Func: {additional_message} | Duration: {duration}s"
            total_tokens = add_token_usage_logs(response, message=msg)
            TOKEN_USAGE[additional_message] = TOKEN_USAGE.get(additional_message, 0) + total_tokens
            logger.info(f"Token Usage Summary: {TOKEN_USAGE}")
            print("\n🔹 OpenAI Response:\n", output)
            return output
        except Exception as e:
            # Best-effort retry is kept, but the traceback is now logged too.
            logger.exception(f"OpenAI Call Error: {e}")
    logger.warning("Maximum retries reached. No response returned.")
    return None

# Embedding generation with retry
def get_embedding(text, additional_message=""):
    """Embed ``text`` with the embeddings client, trying up to five times.

    Each attempt logs the call, sleeps 5 seconds, builds a client with
    ``embeddings()`` and calls ``embed_query``. Exceptions are logged and the
    next attempt is made.

    Args:
        text: Text to embed.
        additional_message: Caller label included in the log line.

    Returns:
        The result of ``embed_query`` from the first attempt that succeeds,
        or ``None`` when all attempts raise.
    """
    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            logger.info(f"Calling Embedding API | Retry: {attempt} | Caller: {additional_message}")
            time.sleep(5)
            vector = embeddings().embed_query(text)
        except Exception as e:
            logger.error(f"Embedding Error: {e}")
        else:
            return vector
    logger.warning("Failed to get embedding after retries.")
    return None


# 🔹 Sample Execution for Direct Prompt Testing
# Runs when this code is executed directly (script or notebook cell, where
# __name__ is "__main__"); skipped when the module is imported.
if __name__ == "__main__":
    system_prompt = "You are a helpful assistant that explains technical concepts in simple terms."
    user_prompt = "Explain the difference between AI, machine learning, and deep learning."

    # The return value is discarded here; _call_openai prints the reply itself.
    openai_call(
        sys_prompt=system_prompt,
        prompt_struc=user_prompt,
        additional_message="sample_direct_prompt"  # label used in log lines and as the TOKEN_USAGE key
    )



🔹 OpenAI Response:
 Sure! Let’s break it down step by step in simple terms:

### 1. **Artificial Intelligence (AI):**
   - **What it is:** AI is the big umbrella term. It refers to the idea of creating machines or software that can perform tasks that typically require human intelligence.
   - **Examples:** AI includes things like recognizing speech, playing chess, understanding language, or making decisions.
   - **Key point:** AI is the broadest concept, and it includes many different approaches and techniques to make machines "smart."

---

### 2. **Machine Learning (ML):**
   - **What it is:** Machine learning is a subset of AI. It’s a way to teach machines to learn from data instead of being explicitly programmed for every single task.
   - **How it works:** You give the machine a lot of data, and it finds patterns or relationships in that data. Then, it uses those patterns to make predictions or decisions.
   - **Examples:** 
     - A spam filter that learns to recognize spam ema

In [6]:
pip install langchain_openai

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain_openai
  Downloading langchain_openai-0.3.28-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.68 (from langchain_openai)
  Downloading langchain_core-0.3.69-py3-none-any.whl.metadata (5.8 kB)
Collecting openai<2.0.0,>=1.86.0 (from langchain_openai)
  Downloading openai-1.96.1-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.9.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting langsmith>=0.3.45 (from langchain-core<1.0.0,>=0.3.68->langchain_openai)
  Downloading langsmith-0.4.6-py3-none-any.whl.metadata (15 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.68->langchain_openai)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting PyYAML>=5.3 (from langchain-core<1.0.0,>=0.3.68->langchain_openai)
  Downloading PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64

In [7]:
import os
import logging
from typing import List, Dict
from pathlib import Path
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
import json

# ----------- Azure Setup -----------
# Endpoint and key for Azure Document Intelligence come from the environment.
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Fail fast when either setting is missing or empty.
if not (AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY):
    raise EnvironmentError(
        "Please set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and "
        "AZURE_DOCUMENT_INTELLIGENCE_KEY in your environment variables."
    )

# Module-level client, authenticated with the API key; used by process_file_new_ocr.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_DOCUMENT_INTELLIGENCE_KEY),
)

# ----------- OCR Function -----------
def process_file_new_ocr(file_path: str) -> List[Dict]:
    """
    Extracts text from a PDF using Azure Document Intelligence (prebuilt-read model).

    The file is read fully into memory and submitted through the module-level
    ``document_intelligence_client``; the call blocks until analysis finishes.

    Args:
        file_path (str): Path to the local PDF file.

    Returns:
        List[Dict]: One dict per analyzed page with keys ``"title"`` (the file
        name), ``"pagenum"`` (the page number reported by the service) and
        ``"content"`` (the page's line texts joined with single spaces; an
        empty string when the page has no lines).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Keep the caller's argument untouched; work with a separate Path object.
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    logging.info(f"Processing file: {source.name}")

    poller = document_intelligence_client.begin_analyze_document(
        model_id="prebuilt-read",
        body=source.read_bytes(),
    )
    result = poller.result()

    # `lines` can be None for a page with no recognized text (and `pages` is
    # guarded the same way); treat both as empty instead of raising TypeError.
    return [
        {
            "title": source.name,
            "pagenum": page.page_number,
            "content": " ".join(line.content for line in (page.lines or [])),
        }
        for page in (result.pages or [])
    ]

# ----------- Example Usage -----------
# Smoke test: OCR one local PDF and pretty-print the per-page results as JSON.
if __name__ == "__main__":
    test_file = "R&D Supplier Brief - Chocolate Coating for Murray Street.pdf"  # Replace with your actual file name

    try:
        output = process_file_new_ocr(test_file)
        print(json.dumps(output, indent=2))
    except Exception as e:
        # Any failure is reported as a one-line message instead of a traceback.
        print(f"Error occurred: {e}")


[
  {
    "title": "R&D Supplier Brief - Chocolate Coating for Murray Street.pdf",
    "pagenum": 1,
    "content": "Bulla\u00ae FAMILY DAIRY SINCE 1910 - Bulla Dairy Foods - R&D Supplier Brief Form 1. PROJECT INFORMATION Date 7-May-25 Technical Lead Michelle Gardiner Project Number 230513 Project Name Murray St Chunky Sticks Project Type New Product Development 2. FINISHED PRODUCT INFORMATION Brand Murray St Product/s Ice Cream Sticks and Sandwiches Competitor Products Connoisseur Classic Vanilla Ice Cream 4 Pack | 400mL Claims Material needs to be compliant for Codex Standard for Milk Chocolate. Project Objective To develop a milk chocolate coating for the purpose of enrobing ice cream sticks and sandwiches. The chocolate must have excellent melting and coating characteristics, be stable for frozen applications, and deliver a premium flavour and texture experience. It must be able to hold inclusions in the coating. When consumed, it must provide a 'snap' when bitten into. Target Shel

In [5]:
pip install azure-ai-documentintelligence


Defaulting to user installation because normal site-packages is not writeable
Collecting azure-ai-documentintelligence
  Downloading azure_ai_documentintelligence-1.0.2-py3-none-any.whl.metadata (53 kB)
Downloading azure_ai_documentintelligence-1.0.2-py3-none-any.whl (106 kB)
Installing collected packages: azure-ai-documentintelligence
Successfully installed azure-ai-documentintelligence-1.0.2
Note: you may need to restart the kernel to use updated packages.
