In [8]:

import json
from typing import List
import logging
from pydantic import BaseModel
import instructor
from openai import AzureOpenAI
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv; by default from a local .env file)
# before they are read with os.getenv() below.
load_dotenv()

# Set up logging: configure the root logger at INFO and create this module's logger.
# Because the root level is INFO, INFO records from libraries (e.g. httpx request
# lines) are emitted as well.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up AzureOpenAI client.
# Credentials and endpoint come from the environment; os.getenv() yields None when a
# variable is unset. The API version is pinned to a fixed string.
azure_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Patch the AzureOpenAI client with instructor so that
# client.chat.completions.create() can be called with a `response_model` argument
# (as generate_claims does).
client = instructor.patch(azure_client)


# Structured-output schema for a single synthetic claim. generate_claims passes
# List[WorkersCompClaim] as `response_model` to the patched client.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic includes a model's docstring in its JSON schema description,
# which would presumably change what is sent to the model; confirm before converting.
class WorkersCompClaim(BaseModel):
    # Free-text account of the incident (the prompt asks for it to read as if
    # written by the injured worker).
    incident_description: str
    # Reasoning toward the injury source. Declared before `injury_source`;
    # presumably so the model produces its reasoning before the final label --
    # keep this order unless that is confirmed not to matter.
    chain_of_thought: str
    # The determined injury source (the prompt asks for predefined AWCBC codes;
    # the value is an unconstrained string here, nothing validates it).
    injury_source: str


def generate_claims(
    injury_sources: List[str], num_entries: int
) -> List[WorkersCompClaim]:
    """Generate synthetic workers' compensation claims for each injury source.

    One chat-completion request is sent per injury source through the
    instructor-patched module-level ``client``, asking for ``num_entries``
    claims parsed into ``WorkersCompClaim`` objects. A source whose request
    fails is logged with its traceback and skipped, so one failure does not
    discard the claims already collected.

    Args:
        injury_sources: Injury source labels to generate claims for.
        num_entries: Number of claims to request per source. Values below 1
            return an empty list without sending any request.

    Returns:
        All successfully generated claims, in the order of ``injury_sources``.
        The number of claims the model actually returns per source is not
        verified against ``num_entries``.
    """
    # Asking the model for zero (or a negative number of) claims is meaningless
    # and would still cost one request per source, so skip the work entirely.
    if num_entries < 1:
        return []

    all_claims = []
    for source in injury_sources:
        prompt = f"""
        Generate {num_entries} workers compensation claims related to the injury source: "{source}".
        Each claim should include:
                1. An incident description written as if by the injured worker. Vary in detail, clarity, and style. Include complex scenarios with multiple objects/actions.
                2. Expert reasoning (20+ years experience) towards determining the InjurySource. Use nuanced, domain-specific knowledge and claim coding standards.
                3. The determined injury source, using predefined AWCBC codes.

        Guidelines:
        - Ensure incident descriptions genuinely resemble reports written by workers, not professionals.
        - Vary vocabulary, phrases, and linguistic patterns significantly.
        - Use indirect indicators that allow inference of the injury source.
        - Include diverse locations and situations.
        - Make incident descriptions lengthier and more challenging over time.
        - Ensure expert-level complexity in the reasoning.

        Do not use placeholder text or repeat the same claim multiple times.
        """

        try:
            claims = client.chat.completions.create(
                model="WHI",  # Replace with your actual deployment name
                response_model=List[WorkersCompClaim],
                messages=[{"role": "user", "content": prompt}],
            )
            all_claims.extend(claims)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so any failure for
            # one source (network, validation, ...) must not abort the others.
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("Error generating claims for %s: %s", source, e)

    return all_claims


# Injury source labels to generate claims for
injury_sources = [
    "Metal chips, particles",
    "Boxes, crates, cartons",
    "Knives",
    "Floors, walkways, ground surfaces",
    "Ladders, movable",
]
# Number of claims requested per injury source
num_entries = 1
claims = generate_claims(injury_sources, num_entries)

# Save the dataset
output_path = "synthetic_workers_comp_claims.json"
# pydantic v2 renamed .dict() to .model_dump() (and deprecates the old name);
# fall back to .dict() so the cell still runs on pydantic v1.
records = [
    claim.model_dump() if hasattr(claim, "model_dump") else claim.dict()
    for claim in claims
]
# Serialize completely *before* opening the file: opening with "w" truncates it,
# so a serialization error must not be able to wipe a previously saved dataset.
payload = json.dumps(records, indent=2)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(payload)

logger.info(
    "Generated %d WorkersCompClaims across %d injury sources.",
    len(claims),
    len(injury_sources),
)
logger.info("Dataset saved as '%s'", output_path)

INFO:httpx:HTTP Request: POST https://test-025.openai.azure.com//openai/deployments/WHI/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-025.openai.azure.com//openai/deployments/WHI/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-025.openai.azure.com//openai/deployments/WHI/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-025.openai.azure.com//openai/deployments/WHI/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://test-025.openai.azure.com//openai/deployments/WHI/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:__main__:Generated 5 WorkersCompClaims across 5 injury sources.
INFO:__main__:Dataset saved as 'synthetic_workers_comp_claims.json'
