In [None]:
from openai import OpenAI

# Credentials come from the OPENAI_API_KEY environment variable (the client's
# default when no api_key argument is given), so the key is never stored in
# the notebook source or its saved outputs.
client = OpenAI()

# Smoke test: one minimal chat completion to confirm the key and model work.
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "say hi"}]
)

print(resp.choices[0].message.content)


Hi! How can I assist you today?


In [2]:

def check_limits():
    """Print the rate-limit headers returned for a minimal gpt-4o-mini request.

    Sends a one-token chat completion through the module-level ``client`` via
    ``with_raw_response`` so the HTTP headers are reachable, then prints five
    ``x-ratelimit-*`` values. Any ``Exception`` raised along the way is caught
    and reported as a single printed line instead of propagating.
    """
    # (printed label, response header name) pairs, in display order.
    report_rows = (
        ("1. Requests Limit (RPM): ", "x-ratelimit-limit-requests"),
        ("2. Tokens Limit (TPM):   ", "x-ratelimit-limit-tokens"),
        ("3. Remaining Requests:   ", "x-ratelimit-remaining-requests"),
        ("4. Remaining Tokens:     ", "x-ratelimit-remaining-tokens"),
        ("5. Time to Reset:        ", "x-ratelimit-reset-requests"),
    )

    try:
        raw = client.chat.completions.with_raw_response.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hi"}],
            max_tokens=1,  # keep the probe as cheap as possible
        )
        rate_headers = raw.headers

        print("--- RATE LIMITS FOR GPT-4o-MINI ---")
        for label, header_name in report_rows:
            print(f"{label}{rate_headers.get(header_name)}")
    except Exception as err:
        print(f"Error checking limits: {err}")


if __name__ == "__main__":
    check_limits()

--- RATE LIMITS FOR GPT-4o-MINI ---
1. Requests Limit (RPM): 30000
2. Tokens Limit (TPM):   150000000
3. Remaining Requests:   29999
4. Remaining Tokens:     149999997
5. Time to Reset:        2ms


In [3]:
!pip install pymupdf pillow tenacity

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-win_amd64.whl (18.4 MB)
   ---------------------------------------- 0.0/18.4 MB ? eta -:--:--
   ------------------- -------------------- 8.9/18.4 MB 49.0 MB/s eta 0:00:01
   ------------------------- -------------- 11.5/18.4 MB 29.5 MB/s eta 0:00:01
   -------------------------- ------------- 12.3/18.4 MB 21.5 MB/s eta 0:00:01
   --------------------------- ------------ 12.8/18.4 MB 17.0 MB/s eta 0:00:01
   ----------------------------- ---------- 13.6/18.4 MB 13.0 MB/s eta 0:00:01
   ------------------------------- -------- 14.4/18.4 MB 11.7 MB/s eta 0:00:01
   ---------------------------------- ----- 15.7/18.4 MB 10.8 MB/s eta 0:00:01
   ------------------------------------- -- 17.3/18.4 MB 10.3 MB/s eta 0:00:01
   ---------------------------------------- 18.4/18.4 MB 9.7 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupd

In [4]:
import os
import io
import json
import base64
import time
import fitz  # PyMuPDF
from PIL import Image
from dotenv import load_dotenv
from openai import OpenAI, RateLimitError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)

In [None]:
PDF_PATH = "coke.pdf"  # <--- Make sure this matches your file name

# Load OPENAI_API_KEY from a local .env file (if one exists) into the process
# environment and read it from there, so the key is never stored in the notebook.
# An unset variable yields "", which the API-key check that follows rejects.
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY", "")

In [6]:
# Number of PDF pages sent to the model in a single request.
BATCH_SIZE = 1

# Build the shared client only when a key was provided; otherwise stop here.
if API_KEY:
    client = OpenAI(api_key=API_KEY)
else:
    raise ValueError("No API Key found. Please check your .env file.")

In [7]:
# System message sent with every extraction request (see process_batch).
# It asks the model for a JSON object with a top-level "rules" list, which is
# the key process_batch reads from the parsed response.
SYSTEM_PROMPT = """
You are a Lead Forensic Compliance Auditor.
Task: Analyze the provided Brand Guideline pages and extract a "Digital Rulebook" in JSON.

INSTRUCTIONS:
1. VISUAL DECODING: For Example : Look at "Don't" grids. If a logo is rotated in a "Don't" image, create a rule: "max_rotation: 0".
2. PARAMETERS: Extract hard numbers (hex codes, pixels, angles).
3. FILTERING: Ignore Table of Contents or general marketing fluff.
4. FORMAT: Output valid JSON with this structure, an example:
{
  "rules": [
    {
      "category": "Geometry|Color|Typography|Imagery",
      "rule_text": "Logo must not be rotated.",
      "parameters": {"max_rotation": 0}
    }
  ]
}
"""

In [8]:
def encode_image(pil_image, max_side=768, quality=50):
    """Return a base64 string of a downscaled, low-quality JPEG copy of an image.

    The input image is never modified; all work happens on a copy. Shrinking
    and compressing the page keeps the request payload (and token cost) small.

    Args:
        pil_image: A PIL image (any object exposing ``copy``, ``thumbnail``,
            ``convert``, ``save`` and ``mode`` the way ``PIL.Image.Image`` does).
        max_side: Bounding-box size in pixels passed to ``thumbnail`` for both
            width and height. Defaults to 768.
        quality: JPEG quality passed to ``save``. Defaults to 50.

    Returns:
        str: The encoded JPEG bytes as base64 text.
    """
    # Modes the JPEG writer accepts directly. Anything else (RGBA, LA, P, ...)
    # makes ``save(format="JPEG")`` raise, so those are converted to RGB first.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    img_copy = pil_image.copy()
    # thumbnail() resizes the copy in place to fit inside the bounding box.
    img_copy.thumbnail((max_side, max_side))

    if img_copy.mode not in jpeg_modes:
        # Converting after the resize means less pixel data to convert.
        img_copy = img_copy.convert("RGB")

    buffered = io.BytesIO()
    img_copy.save(buffered, format="JPEG", quality=quality)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

In [9]:
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry=retry_if_exception_type(RateLimitError),
    # Re-raise the last RateLimitError itself rather than tenacity's RetryError
    # wrapper, so callers log the API's actual message once retries run out.
    reraise=True,
)
def extract_from_batch_with_backoff(messages):
    """Request a JSON-mode chat completion, retrying when rate-limited.

    Only ``RateLimitError`` triggers a retry (up to 6 attempts, with randomized
    exponential waits configured with min=1 and max=60 seconds). Any other
    exception propagates on the first failure.

    Args:
        messages: Chat messages list forwarded unchanged to the API.

    Returns:
        The completion object returned by ``client.chat.completions.create``.

    Raises:
        RateLimitError: If every attempt was rate-limited.
    """
    return client.chat.completions.create(
        model="gpt-4o-mini",  # <--- Using Mini to save cost/tokens
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=4096
    )

In [10]:
def process_batch(images, batch_index):
    """Send one batch of page images to the model and return the extracted rules.

    Args:
        images: Iterable of PIL images; each is passed through ``encode_image``
            and attached to the user message as a base64 JPEG data URL.
        batch_index: Zero-based batch number, used only in progress messages.

    Returns:
        list: The value of the reply's top-level ``"rules"`` key. An empty list
        is returned when the request fails, the reply cannot be parsed as JSON,
        or ``"rules"`` is missing or is not a list.
    """
    print(f"   ...Processing Batch {batch_index + 1}...")

    image_payload = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img)}"}
        }
        for img in images
    ]

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": [
            {"type": "text", "text": "Extract all compliance rules from these pages."}
        ] + image_payload}
    ]

    try:
        response = extract_from_batch_with_backoff(messages)
    except Exception as e:
        print(f"   [FAIL] Batch {batch_index + 1} error: {e}")
        # The API call itself failed (after any retries): back off before the
        # caller sends the next batch.
        print("   >>> Cooling down for 60s...")
        time.sleep(60)
        return []

    try:
        result = json.loads(response.choices[0].message.content)
    except Exception as e:
        # Empty/None message body or invalid (e.g. truncated) JSON. Waiting
        # would not change the outcome, so no cooldown here.
        print(f"   [FAIL] Batch {batch_index + 1} error: {e}")
        return []

    # Callers extend a list with the result, so never hand back a non-list
    # (e.g. {"rules": null} or a JSON value that is not an object).
    rules = result.get("rules", []) if isinstance(result, dict) else None
    if not isinstance(rules, list):
        print(f"   [FAIL] Batch {batch_index + 1} error: response has no 'rules' list")
        return []
    return rules

In [11]:
def main():
    """Render the PDF at PDF_PATH, extract rules batch by batch, and save them.

    Steps:
        1. Rasterize every page to a PIL image at 150 DPI.
        2. Send the pages to ``process_batch`` in groups of ``BATCH_SIZE``,
           pausing 5 seconds between requests.
        3. Write all collected rules to ``compliance_rules.json``.

    Prints a message and returns early if PDF_PATH does not exist. Batches for
    which ``process_batch`` returns an empty list simply contribute no rules.
    """
    if not os.path.exists(PDF_PATH):
        print(f"File not found: {PDF_PATH}")
        return

    print(f"1. Reading PDF ({PDF_PATH})...")

    all_pages = []
    # The context manager closes the PDF as soon as every page is rasterized;
    # the PIL images are built from the raw sample bytes and are used on their
    # own afterwards.
    with fitz.open(PDF_PATH) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=150)  # 150 DPI is readable but small
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            all_pages.append(img)
    total_pages = len(all_pages)

    print(f"   Converted {total_pages} pages.")

    all_rules = []

    print("2. Starting Extraction...")

    # Process BATCH_SIZE pages per request
    for i in range(0, total_pages, BATCH_SIZE):
        batch_images = all_pages[i : i + BATCH_SIZE]
        batch_idx = i // BATCH_SIZE

        batch_rules = process_batch(batch_images, batch_idx)
        all_rules.extend(batch_rules)

        print(f"   -> Batch {batch_idx + 1} done. Found {len(batch_rules)} rules.")

        # SAFETY SLEEP: 5 seconds between requests to refill tokens.
        # Skipped after the final batch, where there is no next request.
        if i + BATCH_SIZE < total_pages:
            time.sleep(5)

    # Save Final JSON
    with open("compliance_rules.json", "w") as f:
        json.dump({"rules": all_rules}, f, indent=2)
    print(f"\nSUCCESS! Saved {len(all_rules)} rules to 'compliance_rules.json'")

if __name__ == "__main__":
    main()

1. Reading PDF (coke.pdf)...
   Converted 37 pages.
2. Starting Extraction...
   ...Processing Batch 1...
   -> Batch 1 done. Found 3 rules.
   ...Processing Batch 2...
   -> Batch 2 done. Found 6 rules.
   ...Processing Batch 3...
   -> Batch 3 done. Found 6 rules.
   ...Processing Batch 4...
   -> Batch 4 done. Found 12 rules.
   ...Processing Batch 5...
   -> Batch 5 done. Found 4 rules.
   ...Processing Batch 6...
   -> Batch 6 done. Found 1 rules.
   ...Processing Batch 7...
   -> Batch 7 done. Found 7 rules.
   ...Processing Batch 8...
   -> Batch 8 done. Found 3 rules.
   ...Processing Batch 9...
   -> Batch 9 done. Found 3 rules.
   ...Processing Batch 10...
   -> Batch 10 done. Found 2 rules.
   ...Processing Batch 11...
   -> Batch 11 done. Found 6 rules.
   ...Processing Batch 12...
   -> Batch 12 done. Found 3 rules.
   ...Processing Batch 13...
   -> Batch 13 done. Found 1 rules.
   ...Processing Batch 14...
   -> Batch 14 done. Found 3 rules.
   ...Processing Batch 15...
