In [1]:
# !pip install pymupdf requests tqdm

import os
import fitz  # PyMuPDF
import base64
import json
import requests
import re
from tqdm.auto import tqdm


In [2]:
def pdf_page_to_base64(pdf_path, page_number, dpi=300):
    """Render one PDF page to a PNG image and return it as a base64 string.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Zero-based index of the page to render.
        dpi: Rendering resolution passed to ``get_pixmap`` (default 300).

    Returns:
        The PNG bytes of the rendered page, base64-encoded and decoded to a
        ``str`` (UTF-8).
    """
    # The context manager closes the document even if rendering raises, so a
    # long batch run does not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_number)
        pix = page.get_pixmap(dpi=dpi)
        png_bytes = pix.tobytes("png")
    return base64.b64encode(png_bytes).decode("utf-8")

In [3]:
def ocr_with_ollama(
    image_base64,
    url="http://localhost:11434/api/generate",
    model="scb10x/typhoon-ocr-7b:latest",
    prompt="อ่านข้อความราชการจากภาพนี้เป็นภาษาไทย",
    timeout=None,
):
    """Send one base64-encoded image to an Ollama server and return the text.

    The request is streamed: the server answers with one JSON object per line
    and the ``response`` fields of those objects are concatenated.

    Args:
        image_base64: Base64 string of the image to read.
        url: Endpoint of the Ollama ``generate`` API.
        model: Name of the model to run.
        prompt: Instruction sent together with the image.
        timeout: Passed to ``requests.post``; ``None`` (the default) waits
            indefinitely, which matches the previous behaviour.

    Returns:
        The concatenated model output with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a streamed line carries an ``error`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [image_base64],
    }

    chunks = []
    # Using the response as a context manager releases the connection even
    # when the stream is abandoned half-way because of an exception.
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        for line in response.iter_lines():
            if not line:
                continue  # skip blank keep-alive lines
            data = json.loads(line.decode("utf-8"))
            # A failure during generation arrives as a JSON line with an
            # "error" key; surface it instead of returning partial text.
            if "error" in data:
                raise RuntimeError(f"Ollama returned an error: {data['error']}")
            chunks.append(data.get("response", ""))

    return "".join(chunks).strip()


In [None]:
def extract_input(text):
    """Extract the numbered items (๑., ๒., ๓., ๔.…) from OCR text.

    Steps:
      1. Remove JSON wrapper residue (``natural_text:`` keys, braces, quotes).
      2. Drop everything before the first "๑.".
      3. Drop the closing formula starting at "จึงเรียนมาเพื่อ" /
         "จึงเสนอมาเพื่อ" and everything after it.
      4. Pick item ๑ (which must open with "ด้วย" or "ตามอ้างถึง") and items
         ๒, ๓, ๔ (each must start on its own line).

    Args:
        text: Raw OCR output. ``None`` or an empty string is accepted.

    Returns:
        The items joined by single spaces, or ``""`` when item ๑ or item ๒
        cannot be found.
    """
    if not text:
        return ""

    # Strip JSON noise left over from the OCR model's output format.
    text = re.sub(r'["\']?natural_text["\']?\s*:\s*', '', text)
    text = re.sub(r'[{}"]+', '', text)

    # Discard everything in front of the first "๑.".
    start = re.search(r"(๑\..+)", text, re.DOTALL)
    if not start:
        return ""
    text = start.group(1)

    # Cut at the closing formula. ``\b`` only matches when the character
    # before "จึง" is not a word character (e.g. a newline or a space).
    cutoff_match = re.search(r"(.*?)\b(จึง(เรียน|เสนอ)มาเพื่อ.*?)$", text, re.DOTALL)
    if cutoff_match:
        text = cutoff_match.group(1).strip()

    # Items ๒–๔ must begin at the start of a line, exactly like the
    # terminators used in the look-aheads. Without this anchor a "๒." inside
    # item ๑ (for example the year "๒๕๖๒.") was taken as the start of item ๒
    # and part of item ๑ was emitted twice.
    part1 = re.search(r"(๑\.\s*(?:ด้วย|ตามอ้างถึง).*?)(?=\n\s*๒\.|$)", text, re.DOTALL)
    part2 = re.search(r"(?:\A|\n)\s*(๒\..*?)(?=\n\s*๓\.|$)", text, re.DOTALL)
    part3 = re.search(r"(?:\A|\n)\s*(๓\..*?)(?=\n\s*๔\.|$)", text, re.DOTALL)
    part4 = re.search(r"(?:\A|\n)\s*(๔\..*)", text, re.DOTALL)

    # Items ๑ and ๒ are mandatory; checking the two matches directly (rather
    # than counting matches) prevents e.g. ๒ + ๓ without ๑ from passing.
    if not (part1 and part2):
        return ""

    parts = [m.group(1).strip() for m in (part1, part2, part3, part4) if m]
    return " ".join(parts)


In [None]:
def extract_output(text):
    """Extract item ๑ (and any text up to the first stop marker) from OCR text.

    The result starts at "๑." followed by "ด้วย" or "ตามอ้างถึง" and ends just
    before whichever of these comes first:

      * a line starting with "๒." followed by "เพื่อให้" or "ตามข้อ ๑";
      * the closing formula "จึงเรียนมาเพื่อ" / "จึงเสนอมาเพื่อ".

    If neither marker is present, everything from the start point to the end
    of the text is returned.

    Args:
        text: Raw OCR output. ``None`` or an empty string is accepted.

    Returns:
        The extracted text with surrounding whitespace removed, or ``""`` when
        the start marker is not found.
    """
    if not text:
        return ""

    # Remove noise such as the "natural_text": key and JSON punctuation.
    text = re.sub(r'["\']?natural_text["\']?\s*:\s*', '', text)
    text = re.sub(r'[{}"]+', '', text)

    # Begin at the "๑." that opens the actual body text.
    start_match = re.search(r"๑\.\s*(?:ด้วย|ตามอ้างถึง)", text)
    if not start_match:
        return ""
    text = text[start_match.start():]

    # Stop at the earliest marker. The closing-formula alternative was
    # previously missing, so the closing paragraph leaked into the result
    # whenever item ๒ did not open with one of the expected phrases.
    # ``\b`` mirrors the cut-off rule used by extract_input.
    stop_match = re.search(
        r"\n\s*๒\.\s*(?:เพื่อให้|ตามข้อ ๑)|\bจึง(?:เรียน|เสนอ)มาเพื่อ",
        text,
    )
    if stop_match:
        return text[:stop_match.start()].strip()

    return text.strip()


In [6]:
# Directory containing the source PDF files (relative to the working directory).
pdf_dir = "./GovernmentDocs"
# Directory where the JSON results are written.
output_dir = "./ocr_json"
# Create the output directory up front; no error if it already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Batch-convert every two-page PDF in `pdf_dir` into one JSON file in
# `output_dir`. Page 0 is treated as the outgoing letter (-> "output_text")
# and page 1 as the incoming letter (-> "input_text").
# `tqdm` comes from the import cell at the top of the notebook (tqdm.auto),
# so it is not re-imported here.

# Candidate inputs are the zero-padded names 000.pdf … 999.pdf; keep only the
# ones that actually exist.
file_list = [f"{i:03}.pdf" for i in range(1000)]
file_list = [f for f in file_list if os.path.exists(os.path.join(pdf_dir, f))]

for pdf_name in tqdm(file_list, desc="🔄 Processing PDF files", unit="file"):
    print(pdf_name)
    pdf_path = os.path.join(pdf_dir, pdf_name)
    # splitext swaps only the extension, unlike str.replace which would touch
    # every ".pdf" occurrence in the name.
    output_json = os.path.join(output_dir, os.path.splitext(pdf_name)[0] + ".json")

    try:
        # Page check: only documents with exactly two pages are processed.
        with fitz.open(pdf_path) as doc:
            if doc.page_count != 2:
                print(f"⚠️ Skip {pdf_name} (has {doc.page_count} pages)")
                continue

        image_output = pdf_page_to_base64(pdf_path, 0)
        print(f" ✅ The Outgoing letter of {pdf_name} has been converted to image")
        image_input = pdf_page_to_base64(pdf_path, 1)
        print(f" ✅ The Incoming letter of {pdf_name} has been converted to image")

        # Keep the raw OCR text and the filtered text under separate names so
        # the raw text stays available for inspection while debugging.
        raw_output_text = ocr_with_ollama(image_output)
        print(f" ✅ The Outgoing letter of {pdf_name} is done with text extraction")
        raw_input_text = ocr_with_ollama(image_input)
        print(f" ✅ The Incoming letter of {pdf_name} is done with text extraction")

        output_text = extract_output(raw_output_text)
        print(f" ✅ The Outgoing letter of {pdf_name} is done with text filtering")
        input_text = extract_input(raw_input_text)
        print(f" ✅ The Incoming letter of {pdf_name} is done with text filtering")

        # The extract_* helpers return "" when they cannot find the expected
        # structure; make that visible instead of silently saving blanks.
        if not input_text or not output_text:
            print(f"⚠️ {pdf_name}: text filtering returned an empty result; saving anyway")

        result = {
            "file": pdf_name,
            "input_text": input_text,
            "output_text": output_text
        }

        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"✅ Saved: {output_json}")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"❌ Error in {pdf_name}: {e}")


🔄 Processing PDF files:   0%|          | 0/42 [00:00<?, ?file/s]

000.pdf
 ✅ The Outgoing letter of 000.pdf has been converted to image
 ✅ The Incoming letter of 000.pdf has been converted to image
 ✅ The Outgoing letter of 000.pdf is done with text extraction
 ✅ The Incoming letter of 000.pdf is done with text extraction
 ✅ The Outgoing letter of 000.pdf is done with text filtering
 ✅ The Incoming letter of 000.pdf is done with text filtering
✅ Saved: ./ocr_json\000.json
001.pdf
 ✅ The Outgoing letter of 001.pdf has been converted to image
 ✅ The Incoming letter of 001.pdf has been converted to image
 ✅ The Outgoing letter of 001.pdf is done with text extraction
 ✅ The Incoming letter of 001.pdf is done with text extraction
 ✅ The Outgoing letter of 001.pdf is done with text filtering
 ✅ The Incoming letter of 001.pdf is done with text filtering
✅ Saved: ./ocr_json\001.json
002.pdf
 ✅ The Outgoing letter of 002.pdf has been converted to image
 ✅ The Incoming letter of 002.pdf has been converted to image
 ✅ The Outgoing letter of 002.pdf is done with

In [None]:
# pdf_path = "./GovernmentDocs/000.pdf"

# image1 = pdf_page_to_base64(pdf_path, 1)  # หนังสือรับ
# image0 = pdf_page_to_base64(pdf_path, 0)  # หนังสือส่ง

# print("📤 Sending to Typhoon OCR (หน้า 2)...")
# input_text = ocr_with_ollama(image1)

# print("📤 Sending to Typhoon OCR (หน้า 1)...")
# output_text = ocr_with_ollama(image0)

# print("🎯 Input:", input_text[:200])
# print("🎯 Output:", output_text[:200])
