In [None]:
import os
import time
import json
import logging
from dotenv import load_dotenv
from typing import List, Dict
from pathlib import Path
from azure.identity import ClientSecretCredential
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from openai import AzureOpenAI

# ------------------ Load environment variables ------------------
# Must run before any os.getenv() call below so values from a .env file are visible.
load_dotenv()

# ------------------ Logging setup ------------------
# Module logger with DEBUG threshold.
# NOTE(review): no handler is attached in this block; records only become
# visible once some handler is configured on this logger or an ancestor.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# ------------------ Azure Credentials ------------------
# Service-principal credential built from environment variables; used by
# llm(), openai_llm() and embeddings() to request access tokens.
# NOTE(review): the three values are not validated here — os.getenv() yields
# None for any variable that is unset.
credential = ClientSecretCredential(
    tenant_id=os.getenv("AZURE_TENANT_ID"),
    client_id=os.getenv("AZURE_CLIENT_ID"),
    client_secret=os.getenv("AZURE_CLIENT_SECRET"),
)

# ------------------ Globals ------------------
# TOKEN_USAGE: running total of tokens, keyed by the `additional_message`
# label passed to the OpenAI call helpers.
TOKEN_USAGE = {}
# TOTAL_API_CALL: count of chat-completion requests that returned without raising.
TOTAL_API_CALL = 0
# Default chat deployment name; None when the variable is unset.
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# ------------------ Azure Document Intelligence Client ------------------
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Fail fast at import time: the OCR client cannot be built without both values.
if not AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT or not AZURE_DOCUMENT_INTELLIGENCE_KEY:
    raise EnvironmentError("Missing Azure Document Intelligence credentials")

# Key-authenticated client shared by process_file_new_ocr().
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_DOCUMENT_INTELLIGENCE_KEY)
)

# ------------------ OCR Function ------------------
def process_file_new_ocr(file_path: str) -> List[Dict]:
    """Run Azure Document Intelligence OCR on a file and return its text per page.

    The file is read fully into memory and analysed with the ``prebuilt-read``
    model through the module-level ``document_intelligence_client``.

    Args:
        file_path: Path of the document to analyse.

    Returns:
        One dict per analysed page with the keys ``"title"`` (the file name),
        ``"pagenum"`` (the page number reported by the service) and
        ``"content"`` (the page's line texts joined with single spaces, or an
        empty string when the page carries no lines).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    # NOTE(review): this goes to the root logger, not the module-level
    # `logger`. Left unchanged on purpose: a root-level logging call installs
    # a default handler when none exists, which other log output in this file
    # may be relying on — confirm before switching to `logger.info`.
    logging.info(f"Processing file: {source.name}")

    document_bytes = source.read_bytes()

    poller = document_intelligence_client.begin_analyze_document(
        model_id="prebuilt-read",
        body=document_bytes,
    )
    result = poller.result()

    return [
        {
            "title": source.name,
            "pagenum": page.page_number,
            # `lines` can be absent (None) for a page without recognised
            # text; treat that as an empty page instead of raising TypeError.
            "content": " ".join(line.content for line in (page.lines or [])),
        }
        for page in (result.pages or [])
    ]

# ------------------ Azure OpenAI LLM Clients ------------------
# NOTE(review): presumably forces the pydantic model behind AzureChatOpenAI to
# be (re)built before llm() first instantiates it — confirm whether the
# installed langchain_openai version still needs this.
AzureChatOpenAI.model_rebuild()

def llm():
    """Create a LangChain ``AzureChatOpenAI`` chat model.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and hands it to the client as its API key.
    Deployment, API version and endpoint come from ``AZURE_OPENAI_DEPLOYMENT``
    and the ``AZURE_API_VERSION`` / ``AZURE_OPENAI_ENDPOINT`` environment
    variables.

    Returns:
        A new ``AzureChatOpenAI`` instance with temperature 0 and 2 retries.
    """
    token = credential.get_token("https://cognitiveservices.azure.com/.default")
    settings = {
        "azure_deployment": AZURE_OPENAI_DEPLOYMENT,
        "api_version": os.getenv("AZURE_API_VERSION"),
        "temperature": 0,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
        "openai_api_key": token.token,
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    }
    return AzureChatOpenAI(**settings)

def openai_llm():
    """Create an ``AzureOpenAI`` SDK client.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and passes it to the client as ``api_key``.
    Endpoint and API version are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_API_VERSION`` environment variables.

    Returns:
        A new ``AzureOpenAI`` client.
    """
    token = credential.get_token("https://cognitiveservices.azure.com/.default")
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    version = os.getenv("AZURE_API_VERSION")
    return AzureOpenAI(azure_endpoint=endpoint, api_version=version, api_key=token.token)

def embeddings():
    """Create a LangChain ``AzureOpenAIEmbeddings`` client.

    Each call requests a token for the Cognitive Services scope from the
    module-level ``credential`` and passes it to the client as ``api_key``.
    Model and endpoint come from the ``EMBEDDING_AZURE_OPENAI_DEPLOYMENT`` and
    ``EMBEDDING_AZURE_OPENAI_ENDPOINT`` environment variables; the API version
    from ``AZURE_API_VERSION``.

    Returns:
        A new ``AzureOpenAIEmbeddings`` instance.
    """
    token = credential.get_token("https://cognitiveservices.azure.com/.default")
    settings = {
        "model": os.getenv("EMBEDDING_AZURE_OPENAI_DEPLOYMENT"),
        "azure_endpoint": os.getenv("EMBEDDING_AZURE_OPENAI_ENDPOINT"),
        "openai_api_version": os.getenv("AZURE_API_VERSION"),
        "api_key": token.token,
    }
    return AzureOpenAIEmbeddings(**settings)

# ------------------ OpenAI Call Utilities ------------------
def add_token_usage_logs(llm_output, message=""):
    """Log the token-usage figures of a completion and return its total tokens.

    Args:
        llm_output: Response object exposing ``to_dict()``; only the
            ``"usage"`` entry of the resulting dict is read.
        message: Text placed in front of the usage figures in the log line.

    Returns:
        The ``total_tokens`` value from the usage data, or 0 when the usage
        data or that field is missing or null.
    """
    # Serialise once, and treat an explicit `"usage": None` like a missing key
    # so the loop and the lookup below cannot fail on it.
    usage = llm_output.to_dict().get("usage") or {}
    token_usage_string = "".join(f"{key}: {value} | " for key, value in usage.items())
    logger.info(f"{message} {token_usage_string}")
    # Always hand back a number: callers add this to a running total.
    return usage.get("total_tokens") or 0

def openai_call(sys_prompt, prompt_struc, deployment_name=AZURE_OPENAI_DEPLOYMENT, additional_message=""):
    """Send a system prompt plus one user prompt to the chat-completions API.

    Args:
        sys_prompt: Content of the ``system`` message.
        prompt_struc: Content of the ``user`` message.
        deployment_name: Deployment to call; defaults to the value
            ``AZURE_OPENAI_DEPLOYMENT`` had when this function was defined.
        additional_message: Label forwarded to ``_call_openai``.

    Returns:
        Whatever ``_call_openai`` returns for the two-message conversation.
    """
    system_message = {"role": "system", "content": sys_prompt}
    user_message = {"role": "user", "content": prompt_struc}
    return _call_openai([system_message, user_message], deployment_name, additional_message)

def _call_openai(messages, deployment_name, additional_message):
    """Call the chat-completions API, retrying up to five times.

    Every attempt first sleeps 5 seconds, then builds a fresh client via
    ``openai_llm()`` and sends ``messages`` at temperature 0. An attempt
    counts as successful only when the first choice has non-empty content;
    token usage is then logged and added to ``TOKEN_USAGE`` under
    ``additional_message``, and the content is printed and returned. Any
    exception raised during an attempt is logged and the next attempt starts.

    Args:
        messages: Chat messages to send.
        deployment_name: Deployment passed as ``model``.
        additional_message: Label used in log lines and as ``TOKEN_USAGE`` key.

    Returns:
        The content of the first choice, or ``None`` when all attempts fail
        or yield empty content.
    """
    global TOTAL_API_CALL
    attempts_allowed = 5
    for attempt in range(1, attempts_allowed + 1):
        try:
            time.sleep(5)
            logger.info(f"Calling OpenAI API with deployment: {deployment_name}, Retry: {attempt}")
            started_at = time.time()
            client = openai_llm()
            response = client.chat.completions.create(
                model=deployment_name,
                messages=messages,
                temperature=0,
            )
            TOTAL_API_CALL += 1
            duration = round(time.time() - started_at, 3)
            output = response.choices[0].message.content
            if not output:
                # Empty content: fall through to the next attempt.
                continue
            msg = f"OPENAI CALL MESSAGE: Retry {attempt} | Func: {additional_message} | Duration: {duration}s"
            total_tokens = add_token_usage_logs(response, message=msg)
            tokens_so_far = TOKEN_USAGE.get(additional_message, 0)
            TOKEN_USAGE[additional_message] = tokens_so_far + total_tokens
            logger.info(f"Token Usage Summary: {TOKEN_USAGE}")
            print("\n🔹 OpenAI Response:\n", output)
            return output
        except Exception as e:
            logger.error(f"OpenAI Call Error: {e}")
    logger.warning("Maximum retries reached. No response returned.")
    return None

# ------------------ MAIN EXECUTION BLOCK ------------------
if __name__ == "__main__":
    # Script entry point: OCR one brief, ask the model for structured
    # attributes, and save the parsed JSON next to the script.
    test_file = "R&D Supplier Brief - Chocolate Coating for Murray Street.pdf"

    try:
        ocr_pages = process_file_new_ocr(test_file)
        # One text blob for the prompt: page contents separated by newlines.
        extracted_text = "\n".join([p["content"] for p in ocr_pages])

        system_prompt = """
You are a domain-aware assistant that extracts structured product development attributes from customer R&D or supplier brief documents.

---

### 🎯 Your Role:
You are interpreting a **customer brief** — a document that includes both direct requirements and contextual information. Your job is to:
1. Extract **explicit** attributes: those that are **directly requested by the customer** in the brief.
2. Extract **inferred** attributes: high-confidence values **logically implied** by the brief’s context, product type, or regulatory standards, with a supporting `"note"`.

⚠️ Just because a value is mentioned in the brief does **not** mean it is "explicit". Only customer **requested** attributes qualify as explicit.

---

### 📌 Mandatory Attributes to Always Return:
Regardless of how they appear in the brief, always include the following attributes in your output:
- `Product_Type`
- `Base_Type` (e.g., dark, milk, white)
- `Moulding_Type` (e.g., chips, blocks, liquid)
- `Components_Specifications` (e.g., emulsifiers, lecithin, PGPR)
- `Fat` (Total fat %, g/100g, etc.)
- `pH` (if mentioned in any context)

If these are **explicitly requested**, place them under `"explicit"`.

If they are **mentioned but not requested**, or not mentioned at all but can be deduced, place them under `"inferred"` with a justification in `"note"`.

---

### 🗂️ Output Categories:
Group extracted attributes under the following **8 standard categories**, each containing:
- `"explicit"`: only those clearly **requested by the customer**
- `"inferred"`: high-confidence deductions with `"note"`

1. allergen_items  
2. claims_certifications  
3. ingredients_composition  
4. legal_specifications  
5. nutritional_values  
6. packaging_information  
7. sales_commercial  
8. technical_specifications

---

### Also include **explicit attributes based on product-type logic**::

#### Chocolate Type-specific (Dark / Milk / White):
- Total fat (% or g/100g)
- Minimum dry cocoa solids (%)
- Dry fat-free cocoa solids (%)
- Milkfat (%) – for milk/white chocolate
- Dry milk solids (%)
- Fineness type (e.g., FP or micrometer)
- Norm linear viscosity (mPa.s)
- Casson viscosity (mPa.s)
- Yield value (Pa)

#### 🍬 Moulding/Shape Specifics:
- Length, Width, Height
- Vibration (drops)
- Primary Count or Count/Unit
- Sieve fraction (if relevant)

#### 🧪 Compound/Fillings:
- Check for “contains hydrogenated” or hydrogenated fats content

#### 🥜 If Nuts are Mentioned:
- % of nuts or quantity

#### 🌍 Export Targets (e.g., EU, US, China):
- Legal declaration required
- Country-specific regulatory compliance
- Typical cocoa content

---

### 🔍 Inference Guidelines:

Use domain knowledge to **infer high-confidence values**:

#### ✅ ingredients_composition
- PGPR or lecithin implies vegetable fats or emulsifiers
- Codex mention → infer standard ranges for cocoa, milk solids, and sugar
- If PGPR ≤ 0.5%, infer presence of vegetable fats (as PGPR works with them)

#### ✅ technical_specifications
- Enrobing or frozen products → infer “Freezer Stability”, “Snap Texture”
- Melting, flow, viscosity mentions → infer “Flowability” or “Processing Characteristics”
- Export shipping → infer “Shelf Stability”

#### ✅ claims_certifications
- RSPO, FSC, Rainforest → infer sustainable sourcing
- “Non-GMO”, “No artificial preservatives/flavors/sweeteners” → infer all “free-from” claims

#### ✅ legal_specifications
- Codex, EU, FDA, or US law references → infer “Legal Declaration Required”
- Export references → infer country-level compliance

#### ✅ nutritional_values
- “No sugar added” → infer “Low Sugar Claim”
- High fat/caloric composition → infer “High Energy Density”

#### ✅ packaging_information
- Box, pouch, bag, sachet → infer “Packaging Format” and durability
- FSC or eco-labels → infer “Sustainable Packaging Compliance”

#### ✅ sales_commercial
- Customer/market-specific formats → infer “Market Target”, “Sales Channel”
- Mention of MOQ, pricing, volume → infer commercial specs (e.g., “MOQ”, “Indicative Volumes”)

#### ✅ allergen_items
- If allergen list (e.g., milk, soy, nuts) is stated or avoided → infer inclusion/exclusion
- “May contain traces…” → infer “Cross-Contamination Risk”

---

### ⚠️ Output Rules:
- Only include `"explicit"` if clearly requested by the customer.
- Include the mandatory 6 attributes always in the output 
- Do **not fabricate** values without strong support.
- Return clean **valid JSON** only — no markdown, comments, or explanations.

---

### ✅ Output Format:
Return structured JSON in this format:

{
  "category_name": {
    "explicit": {
      "Attribute Name": {
        "value": "...",
        "source": "explicit"
      }
    },
    "inferred": {
      "Attribute Name": {
        "value": "...",
        "source": "inferred",
        "note": "brief justification"
      }
    }
  }
}
Each `category_name` must match one of the 8 categories above.
"""

        user_prompt = f"""Here is the extracted text from an R&D document:

--- START OF DOCUMENT ---
{extracted_text}

--- END OF DOCUMENT ---

Now extract and categorize the relevant structured attributes and their values into 8 categories, each with explicit and inferred sections, as per the guidelines. Provide only valid JSON output without commentary or markdown formatting.
"""

        result = openai_call(
            sys_prompt=system_prompt,
            prompt_struc=user_prompt,
            additional_message="extract_attributes_from_rd_brief"
        )

        if result:
            # Parse BEFORE opening the output file: opening with "w" truncates
            # it, so a reply that is not valid JSON would otherwise leave an
            # empty file behind (and wipe a good one from an earlier run).
            extracted_attributes = json.loads(result)
            with open("extracted_attributes00170112.json", "w") as f:
                json.dump(extracted_attributes, f, indent=2)
            print("\n✅ Extracted attributes saved to 'extracted_attributes00170112.json'")

    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        print(f"❌ Error occurred: {e}")


INFO:__main__:Calling OpenAI API with deployment: gpt-4o, Retry: 1
INFO:__main__:OPENAI CALL MESSAGE: Retry 1 | Func: extract_attributes_from_rd_brief | Duration: 10.237s completion_tokens: 910 | prompt_tokens: 2035 | total_tokens: 2945 | completion_tokens_details: {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0} | prompt_tokens_details: {'audio_tokens': 0, 'cached_tokens': 1024} | 
INFO:__main__:Token Usage Summary: {'extract_attributes_from_rd_brief': 2945}



🔹 OpenAI Response:
 {
  "allergen_items": {
    "explicit": {},
    "inferred": {
      "Milk": {
        "value": "Present",
        "source": "inferred",
        "note": "White chocolate typically contains milk solids."
      },
      "Soy": {
        "value": "May be present",
        "source": "inferred",
        "note": "Soy lecithin is commonly used as an emulsifier in chocolate products."
      }
    }
  },
  "claims_certifications": {
    "explicit": {},
    "inferred": {
      "Non-GMO": {
        "value": "Not specified",
        "source": "inferred",
        "note": "No explicit mention of GMO-free claims in the document."
      }
    }
  },
  "ingredients_composition": {
    "explicit": {},
    "inferred": {
      "White Chocolate Base": {
        "value": "White chocolate",
        "source": "inferred",
        "note": "Product descriptions consistently refer to white chocolate."
      },
      "Emulsifiers": {
        "value": "Likely includes lecithin",
        "source"

In [None]:
import json
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

# Top-level keys expected in the extracted-attributes JSON, in output order.
CATEGORIES = [
    "allergen_items",
    "claims_certifications",
    "ingredients_composition",
    "legal_specifications",
    "nutritional_values",
    "packaging_information",
    "sales_commercial",
    "technical_specifications",
]

def format_as_json_string(data: dict) -> str:
    """Render ``data`` as 2-space-indented JSON text, keeping non-ASCII characters.

    Any falsy input (empty dict, ``None``, ...) is rendered as ``"{}"``.
    """
    return json.dumps(data, indent=2, ensure_ascii=False) if data else "{}"

def json_to_structured_excel(json_file: str, output_excel: str):
    """Write one Excel row per category from an extracted-attributes JSON file.

    For every entry of ``CATEGORIES`` the sheet gets a row with a serial
    number, the category name, and the category's ``"explicit"`` and
    ``"inferred"`` sections rendered as JSON text. Categories that are missing
    from the file, or present with a null value, are rendered as ``"{}"``.
    After pandas writes the sheet it is reopened with openpyxl to widen the
    columns and wrap/top-align the data cells.

    Args:
        json_file: Path of the JSON file to read.
        output_excel: Path of the ``.xlsx`` file to create or overwrite.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for idx, category in enumerate(CATEGORIES, start=1):
        # `or {}` also covers a category that is present but null.
        section = data.get(category) or {}
        rows.append({
            "S. No.": idx,
            "Category_Name": category,
            "Explicit Attributes": format_as_json_string(section.get("explicit", {})),
            "Inferred Attributes": format_as_json_string(section.get("inferred", {})),
        })

    pd.DataFrame(rows).to_excel(output_excel, index=False)

    # Adjust formatting using openpyxl
    wb = load_workbook(output_excel)
    ws = wb.active

    wrap_top = Alignment(wrap_text=True, vertical="top")
    data_rows = range(2, ws.max_row + 1)  # Skip header row

    # Every column 40 wide; every data cell wrapped and top-aligned.
    for col in range(1, ws.max_column + 1):
        ws.column_dimensions[get_column_letter(col)].width = 40
        for row_idx in data_rows:
            ws.cell(row=row_idx, column=col).alignment = wrap_top

    # Fixed height of 409 for all data rows.
    # NOTE(review): presumably chosen as the tallest row Excel allows — confirm.
    for row_idx in data_rows:
        ws.row_dimensions[row_idx].height = 409

    wb.save(output_excel)
    print(f"✅ Excel saved to: {output_excel}")

# Example usage
# Runs as soon as this cell executes: reads "extracted_attributes00170112.json"
# from the working directory and (over)writes the .xlsx named below.
json_to_structured_excel("extracted_attributes00170112.json", "structured_attribute_output_00170112.xlsx")

✅ Excel saved to: structured_attribute_output_00170112.xlsx


In [None]:
# import json
# import pandas as pd

# # Define the 8 attribute categories
# CATEGORIES = [
#     "allergen_items", "claims_certifications", "ingredients_composition",
#     "legal_specifications", "nutritional_values", "packaging_information",
#     "sales_commercial", "technical_specifications"
# ]

# def format_value(val):
#     if isinstance(val, dict):
#         return "; ".join(f"{k}: {format_value(v)}" for k, v in val.items())
#     elif isinstance(val, list):
#         return ", ".join(str(v) for v in val)
#     return str(val)

# def convert_structured_attributes(json_file: str, output_excel: str):
#     with open(json_file, 'r') as f:
#         attributes_data = json.load(f)

#     # Prepare a structure to hold all values
#     attribute_lookup = {cat: {"explicit": {}, "inferred": {}} for cat in CATEGORIES}
#     all_attribute_names = set()

#     # Extract values and track attribute names
#     for category in CATEGORIES:
#         if category in attributes_data:
#             for source_type in ["explicit", "inferred"]:
#                 items = attributes_data[category].get(source_type, {})
#                 for attr_name, attr_obj in items.items():
#                     value = format_value(attr_obj.get("value", ""))
#                     attribute_lookup[category][source_type][attr_name] = value
#                     all_attribute_names.add(attr_name)

#     # Prepare rows
#     rows = []
#     for attr in sorted(all_attribute_names):
#         row = {"Attribute Name": attr}
#         for category in CATEGORIES:
#             row[f"{category} (explicit)"] = attribute_lookup[category]["explicit"].get(attr, "")
#             row[f"{category} (inferred)"] = attribute_lookup[category]["inferred"].get(attr, "")
#         rows.append(row)

#     # Build DataFrame
#     df = pd.DataFrame(rows)

#     # Ensure all expected columns exist
#     all_columns = ["Attribute Name"] + [f"{cat} (explicit)" for cat in CATEGORIES] + [f"{cat} (inferred)" for cat in CATEGORIES]
#     for col in all_columns:
#         if col not in df.columns:
#             df[col] = ""

#     # Reorder columns
#     df = df[all_columns]

#     # Export to Excel
#     df.to_excel(output_excel, index=False)
#     print(f"✅ Excel file saved to: {output_excel}")

# # Example usage
# convert_structured_attributes("extracted_attributes00170112.json", "categorized_attributes_00170112.xlsx")


✅ Excel file saved to: categorized_attributes_00170112.xlsx
