In [1]:
# !pip install pymupdf requests tqdm

import os
import fitz  # PyMuPDF
import base64
import json
import requests
import re
from tqdm.auto import tqdm


In [2]:
def pdf_page_to_base64(pdf_path, page_number, dpi=300):
    """Render one page of a PDF to PNG and return it base64-encoded.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Index of the page to render, as accepted by
            ``Document.load_page`` (the batch loop passes 0 and 1).
        dpi: Rendering resolution. Defaults to 300, the value that was
            previously hard-coded.

    Returns:
        The PNG bytes of the rendered page, base64-encoded, as a ``str``.
    """
    # The context manager closes the document even if rendering raises, so
    # handles do not accumulate while looping over many PDFs.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_number)
        pix = page.get_pixmap(dpi=dpi)
        png_bytes = pix.tobytes("png")
    return base64.b64encode(png_bytes).decode("utf-8")

In [3]:
def ocr_with_ollama(
    image_base64,
    model="scb10x/typhoon-ocr-7b:latest",
    url="http://localhost:11434/api/generate",
    timeout=(10, 600),
):
    """Send one base64 image to an Ollama ``/api/generate`` endpoint and
    return the concatenated streamed text.

    Args:
        image_base64: Base64-encoded image passed in the ``images`` field.
        model: Model tag to run. Defaults to the previously hard-coded tag.
        url: Endpoint URL. Defaults to the previously hard-coded local URL.
        timeout: Passed straight to ``requests.post``. The default is a
            (connect, read) pair so a stalled server cannot block the batch
            forever.

    Returns:
        All ``response`` fragments of the stream joined together, with
        leading and trailing whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a streamed object carries an ``error`` field.
    """
    payload = {
        "model": model,
        "prompt": "‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏≤‡∏ä‡∏Å‡∏≤‡∏£‡∏à‡∏≤‡∏Å‡∏†‡∏≤‡∏û‡∏ô‡∏µ‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢",
        "images": [image_base64],
    }

    fragments = []
    # `with` releases the streamed connection on every exit path.
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # The body is newline-delimited JSON; blank keep-alive lines are skipped.
        for line in response.iter_lines():
            if not line:
                continue
            data = json.loads(line.decode("utf-8"))
            if "error" in data:
                # Surface the failure instead of returning partial or empty
                # text that would look like a successful OCR run.
                raise RuntimeError(f"Ollama returned an error: {data['error']}")
            fragments.append(data.get("response", ""))

    return "".join(fragments).strip()


In [None]:
def extract_input(text):
    """Extract the numbered body items from the OCR text of a letter.

    Steps:
      1. Unwrap a ``{"natural_text": ...}`` JSON envelope when one is present,
         so escaped newlines become real newlines; the item patterns below
         rely on real newlines to find item boundaries.
      2. Drop everything before the first item-1 marker.
      3. Drop the closing phrase (either of its two variants) and everything
         after it.
      4. Collect items 1 to 4 and join them with single spaces.

    Args:
        text: Raw text returned by the OCR model.

    Returns:
        The joined items, or ``""`` when the text is empty, has no item-1
        marker, or does not contain both item 1 (starting with one of the two
        accepted opening words) and item 2.
    """
    if not text:
        return ""

    # Decode the JSON envelope if there is one. Stripping it with regexes alone
    # leaves "\n" as two literal characters, which the `\n\s*` lookaheads below
    # never match, so item 1 would swallow all later items.
    open_idx, close_idx = text.find("{"), text.rfind("}")
    payload = None
    if 0 <= open_idx < close_idx:
        try:
            # strict=False tolerates raw control characters inside strings.
            payload = json.loads(text[open_idx:close_idx + 1], strict=False)
        except ValueError:
            payload = None
    if isinstance(payload, dict) and isinstance(payload.get("natural_text"), str):
        text = payload["natural_text"]
    else:
        # Not decodable as JSON: at least turn escaped newlines into real ones.
        text = text.replace("\\n", "\n")

    # Regex clean-up for responses that were not valid JSON; also strips any
    # remaining braces and double quotes.
    text = re.sub(r'["\']?natural_text["\']?\s*:\s*', '', text)
    text = re.sub(r'[{}"]+', '', text)

    # Discard everything before the first item-1 marker.
    start = re.search(r"(‡πë\..+)", text, re.DOTALL)
    if not start:
        return ""
    text = start.group(1)

    # Cut at the closing phrase. No `\b` here: a word-boundary assertion is
    # unreliable for Thai script, which has no spaces between words.
    closing = re.search(r"‡∏à‡∏∂‡∏á(?:‡πÄ‡∏£‡∏µ‡∏¢‡∏ô|‡πÄ‡∏™‡∏ô‡∏≠)‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠", text)
    if closing:
        text = text[:closing.start()].strip()

    # One pattern per item; each stops right before the next item's marker at
    # the start of a line, or at the end of the text.
    item_patterns = (
        r"(‡πë\.\s*(‡∏î‡πâ‡∏ß‡∏¢|‡∏ï‡∏≤‡∏°‡∏≠‡πâ‡∏≤‡∏á‡∏ñ‡∏∂‡∏á).*?)(?=\n\s*‡πí\.|$)",
        r"(‡πí\..*?)(?=\n\s*‡πì\.|$)",
        r"(‡πì\..*?)(?=\n\s*‡πî\.|$)",
        r"(‡πî\..*)",
    )
    matches = [re.search(pattern, text, re.DOTALL) for pattern in item_patterns]

    # Items 1 and 2 are both mandatory. Counting matches is not enough, since
    # items 2 and 3 alone would also add up to two.
    if not (matches[0] and matches[1]):
        return ""

    return " ".join(m.group(1).strip() for m in matches if m)


In [None]:
def extract_output(text):
    """Extract item 1 from the OCR text of a letter.

    Steps:
      1. Unwrap a ``{"natural_text": ...}`` JSON envelope when one is present,
         so escaped newlines become real newlines; the stop pattern below
         needs a real newline in front of the item-2 marker.
      2. Start at the first item-1 marker that is followed by one of the two
         accepted opening words.
      3. Stop at whichever comes first: an item-2 marker at the start of a
         line followed by one of its two accepted openings, or the closing
         phrase (either of its two variants).

    Args:
        text: Raw text returned by the OCR model.

    Returns:
        The stripped text of item 1, or ``""`` when the text is empty or no
        matching item-1 start is found.
    """
    if not text:
        return ""

    # Decode the JSON envelope if there is one. Stripping it with regexes alone
    # leaves "\n" as two literal characters, so the `\n\s*` stop pattern would
    # never fire and the whole letter would be returned.
    open_idx, close_idx = text.find("{"), text.rfind("}")
    payload = None
    if 0 <= open_idx < close_idx:
        try:
            # strict=False tolerates raw control characters inside strings.
            payload = json.loads(text[open_idx:close_idx + 1], strict=False)
        except ValueError:
            payload = None
    if isinstance(payload, dict) and isinstance(payload.get("natural_text"), str):
        text = payload["natural_text"]
    else:
        # Not decodable as JSON: at least turn escaped newlines into real ones.
        text = text.replace("\\n", "\n")

    # Remove noise such as a leftover "natural_text": key or JSON punctuation.
    text = re.sub(r'["\']?natural_text["\']?\s*:\s*', '', text)
    text = re.sub(r'[{}"]+', '', text)

    # Begin at the item-1 marker that introduces the actual body text.
    start_match = re.search(r"‡πë\.\s*(?:‡∏î‡πâ‡∏ß‡∏¢|‡∏ï‡∏≤‡∏°‡∏≠‡πâ‡∏≤‡∏á‡∏ñ‡∏∂‡∏á)", text)
    if not start_match:
        return ""
    text = text[start_match.start():]

    # Stop at item 2 or at the closing phrase, whichever appears first.
    # Previously only the item-2 alternative was implemented, so letters
    # without a matching item 2 kept the closing phrase and everything after.
    stop_match = re.search(
        r"\n\s*‡πí\.\s*(?:‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏´‡πâ|‡∏ï‡∏≤‡∏°‡∏Ç‡πâ‡∏≠ ‡πë)"
        r"|‡∏à‡∏∂‡∏á(?:‡πÄ‡∏£‡∏µ‡∏¢‡∏ô|‡πÄ‡∏™‡∏ô‡∏≠)‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠",
        text,
    )
    if stop_match:
        return text[:stop_match.start()].strip()

    return text.strip()


In [6]:
# Source PDFs are read from `pdf_dir`; one JSON result per PDF is written to
# `output_dir`.
pdf_dir, output_dir = "./GovernmentDocs", "./ocr_json"

# Create the destination folder up front; an existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# `tqdm` is the `tqdm.auto` import from the first cell, so no per-cell
# re-import is needed here.

# Inputs are expected to be named 000.pdf ... 999.pdf; keep those that exist.
file_list = [f"{i:03}.pdf" for i in range(1000)]
file_list = [f for f in file_list if os.path.exists(os.path.join(pdf_dir, f))]

for pdf_name in tqdm(file_list, desc="üîÑ Processing PDF files", unit="file"):
    print(pdf_name)
    pdf_path = os.path.join(pdf_dir, pdf_name)
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".pdf" occurring earlier in the name.
    output_json = os.path.join(output_dir, os.path.splitext(pdf_name)[0] + ".json")

    try:
        # Only two-page PDFs are processed: page 0 is handled as the outgoing
        # letter and page 1 as the incoming letter.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
        if page_count != 2:
            print(f"‚ö†Ô∏è Skip {pdf_name} (has {page_count} pages)")
            continue

        # Render both pages to base64 PNG images.
        image_output = pdf_page_to_base64(pdf_path, 0)
        print(f" ‚úÖ The Outgoing letter of {pdf_name} has been converted to image")
        image_input = pdf_page_to_base64(pdf_path, 1)
        print(f" ‚úÖ The Incoming letter of {pdf_name} has been converted to image")

        # OCR each page.
        output_raw = ocr_with_ollama(image_output)
        print(f" ‚úÖ The Outgoing letter of {pdf_name} is done with text extraction")
        input_raw = ocr_with_ollama(image_input)
        print(f" ‚úÖ The Incoming letter of {pdf_name} is done with text extraction")

        # Reduce the raw OCR text to the parts kept in the result file.
        output_text = extract_output(output_raw)
        print(f" ‚úÖ The Outgoing letter of {pdf_name} is done with text filtering")
        input_text = extract_input(input_raw)
        print(f" ‚úÖ The Incoming letter of {pdf_name} is done with text filtering")

        # Both extractors return "" when they cannot find the expected
        # structure; say so instead of saving a silently empty record.
        if not input_text or not output_text:
            print(f"‚ö†Ô∏è {pdf_name}: text filtering returned an empty result")

        result = {
            "file": pdf_name,
            "input_text": input_text,
            "output_text": output_text
        }

        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"‚úÖ Saved: {output_json}")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"‚ùå Error in {pdf_name}: {e}")


üîÑ Processing PDF files:   0%|          | 0/42 [00:00<?, ?file/s]

000.pdf
 ‚úÖ The Outgoing letter of 000.pdf has been converted to image
 ‚úÖ The Incoming letter of 000.pdf has been converted to image
 ‚úÖ The Outgoing letter of 000.pdf is done with text extraction
 ‚úÖ The Incoming letter of 000.pdf is done with text extraction
 ‚úÖ The Outgoing letter of 000.pdf is done with text filtering
 ‚úÖ The Incoming letter of 000.pdf is done with text filtering
‚úÖ Saved: ./ocr_json\000.json
001.pdf
 ‚úÖ The Outgoing letter of 001.pdf has been converted to image
 ‚úÖ The Incoming letter of 001.pdf has been converted to image
 ‚úÖ The Outgoing letter of 001.pdf is done with text extraction
 ‚úÖ The Incoming letter of 001.pdf is done with text extraction
 ‚úÖ The Outgoing letter of 001.pdf is done with text filtering
 ‚úÖ The Incoming letter of 001.pdf is done with text filtering
‚úÖ Saved: ./ocr_json\001.json
002.pdf
 ‚úÖ The Outgoing letter of 002.pdf has been converted to image
 ‚úÖ The Incoming letter of 002.pdf has been converted to image
 ‚úÖ The Outgo

In [None]:
# pdf_path = "./GovernmentDocs/000.pdf"

# image1 = pdf_page_to_base64(pdf_path, 1)  # incoming letter (page 2)
# image0 = pdf_page_to_base64(pdf_path, 0)  # outgoing letter (page 1)

# print("üì§ Sending to Typhoon OCR (‡∏´‡∏ô‡πâ‡∏≤ 2)...")
# input_text = ocr_with_ollama(image1)

# print("üì§ Sending to Typhoon OCR (‡∏´‡∏ô‡πâ‡∏≤ 1)...")
# output_text = ocr_with_ollama(image0)

# print("üéØ Input:", input_text[:200])
# print("üéØ Output:", output_text[:200])
