In [None]:
import sys
from dotenv import load_dotenv

# Add the parent directory to the import path before the `karanta` imports
# below (presumably the notebook lives one level under the repo root — confirm).
sys.path.append("..")

from karanta.llm_clients.azure_client import AzureOPENAILLM
from karanta.data.process_pdf_utils import render_pdf_to_base64png
from karanta.constants import TARGET_IMAGE_DIM
from karanta.data.utils import (
    load_prompt_template,
    create_vision_message,
    text_order_response_format,
    text_present_response_format,
    text_absent_response_format,
)

# Load variables from a .env file into the process environment
# (presumably the credentials AzureOPENAILLM needs — confirm).
load_dotenv()

In [20]:
# Single PDF page used to try one prompt interactively before the batch run.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
local_pdf_path = "/Users/odunayoogundepo/Desktop/test_images_karanta/pdf/no_type/_OceanofPDF.com_Colloquial_Yoruba_The_Complete_Course_for_Beginners_-_Antonia_Yetunde_page_246.pdf"
# Page number handed to the renderer (presumably 1-based — confirm).
page = 1

# Render that page with the project helper; per its name the result is a
# base64-encoded PNG, sized according to TARGET_IMAGE_DIM.
image_base64 = render_pdf_to_base64png(local_pdf_path, page, TARGET_IMAGE_DIM)

In [21]:
# Load the "headers_footers_prompt" entry from the test-creation prompt config.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
prompt_template = load_prompt_template(
    "headers_footers_prompt",
    "/Users/odunayoogundepo/Desktop/ocr_training/karanta-ocr/configs/prompts/create_tests.yaml",
)

In [None]:
client = AzureOPENAILLM("gpt-4.1")
messages = create_vision_message(prompt_template, None, image_base64)

# Make API call
response = await client.completion(
    [messages],  # Azure client expects nested structure
    text_present_response_format(),
    temperature=0.1,
    max_tokens=6000,
)

In [None]:
# Show the raw result of the completion call above for manual inspection.
response

In [6]:
# Settings for the batch test-generation cells below.
# Subfolder of `test_path` to process; also part of the output file name.
document_type = "religion"
# Root folder containing one subfolder of PDFs per document type.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
test_path = "/Users/odunayoogundepo/Desktop/test_images_karanta/pdf"
# NOTE(review): not referenced by any cell visible in this notebook.
processed_file = "processed.txt"
# Folder holding the generated tests_{document_type}.jsonl files.
output_path = "/Users/odunayoogundepo/Desktop/test_images_karanta/unit_tests"

In [None]:
import os

import jsonlines

# Names of source files that already have generated tests in the output file,
# so the generation cell can skip them when a run is resumed.
processed_files = set()

tests_jsonl_path = f"{output_path}/tests_{document_type}.jsonl"

# On the first run for a document type the output file does not exist yet (it
# is only created when the generation cell opens it in append mode). Reading it
# unconditionally would raise FileNotFoundError, so start with an empty set.
if os.path.exists(tests_jsonl_path):
    with jsonlines.open(tests_jsonl_path, mode="r") as reader:
        for obj in reader:
            processed_files.add(obj["source_file"])

In [None]:
import jsonlines
import os
import random

test_type = [
    "text_present_prompt",
    "text_absent_prompt",
    "headers_footers_prompt",
    "text_order_prompt",
]

client = AzureOPENAILLM("gpt-4.1")


with jsonlines.open(f"{output_path}/tests_{document_type}.jsonl", mode="a") as writer:
    for file in os.listdir(os.path.join(test_path, document_type)):
        if file in processed_files:
            print(f"Skipping already processed file: {file}")
            continue

        try:
            local_pdf_path = os.path.join(test_path, document_type, file)
            image_base64 = render_pdf_to_base64png(local_pdf_path, 1, TARGET_IMAGE_DIM)

            # sample test type
            sampled_test_type = random.choice(test_type)

            prompt_template = load_prompt_template(
                sampled_test_type,
                "/Users/odunayoogundepo/Desktop/ocr_training/karanta-ocr/configs/prompts/create_tests.yaml",
            )

            messages = create_vision_message(prompt_template, None, image_base64)

            if sampled_test_type == "text_present_prompt":
                response_format = text_present_response_format()
            elif sampled_test_type == "text_absent_prompt":
                response_format = text_absent_response_format()
            elif sampled_test_type == "text_order_prompt":
                response_format = text_order_response_format()
            elif sampled_test_type == "headers_footers_prompt":
                response_format = text_present_response_format()

            # Make API call
            response = await client.completion(
                [messages],  # Azure client expects nested structure
                response_format,
                temperature=0.1,
                max_tokens=6000,
            )

            result = response[0].generation["tests"]

            for test_case in result:
                test_case["document_type"] = document_type
                test_case["source_file"] = file

                if sampled_test_type == "headers_footers_prompt":
                    test_case["test_type"] = "headers_footers_prompt"

                writer.write(test_case)

        except Exception as e:
            print(f"Error processing file {file}: {e}")
            continue