In [1]:
import html
import io
import os
import re
import time
from pathlib import Path

import jiwer
import pandas as pd
import requests
from IPython.display import display, Markdown, HTML
from pdf2image import convert_from_bytes
from PIL import Image
from tqdm.notebook import tqdm

# Input folders: the evaluation cells below glob each of these for "*.pdf" files.
PDF_LETTER_DIR = Path("input_pdfs-Letter")
PDF_BOARD_DIR = Path("input_pdfs-Board")
# Folder with the reference transcriptions; the cells look up "<pdf stem>.txt" in it.
GROUND_TRUTH_DIR = Path("ground_truth")
# Alternative ground-truth folders, currently disabled:
# GROUND_TRUTH_DIR = Path("ground_truth-Letter")
# GROUND_TRUTH_DIR = Path("ground_truth-Board")
# HTTP endpoint that run_typhoon_ocr() POSTs each rendered page image to.
TYPHOON_OCR_ENDPOINT = "http://typhoon-ocr:8000/process"

In [2]:
# Literal substitution table used by post_process_ocr(): every occurrence of a key
# (a known OCR misreading) is replaced with its value via str.replace, longest key first.
# Some keys end with a trailing space, so matching is whitespace-sensitive.
# The last ten entries map the ASCII digits "0".."9" to other digit glyphs
# (apparently Thai numerals), which rewrites every ASCII digit in the text.
# NOTE(review): a dict literal keeps only the LAST value for a repeated key. At least one
# key is repeated with conflicting values: "‡∏Å‡∏£‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£" appears in the first row and again
# further down with a different replacement, so the first entry is dead. Another key is
# listed twice with the same value. Confirm which replacement is intended and de-duplicate.
OCR_CORRECTION_MAP = {
    "‡∏ô‡∏®.‡∏™‡∏£‡∏ó.":"‡∏ô‡∏®.‡∏™‡∏ò‡∏ó.", "‡∏®‡∏ä‡∏ö":"‡∏®‡∏ã‡∏ö", "‡∏Å‡∏ß‡∏ñ.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏£‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏£‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ò‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏´‡∏Ñ.‡∏®‡∏ó‡∏ó.‡∏™‡∏™.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ó‡∏Ñ.‡∏®‡∏ó‡∏ó.‡∏™‡∏™.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏õ‡∏†.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏õ‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏Å.‡∏Å‡∏ò‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏´‡∏Å.‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏ß‡∏†.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏®‡∏ä.‡∏ó‡∏´‡∏≤‡∏£. ": "‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£ ", "‡∏Ñ‡∏∏‡∏ì‡∏ó.‡πñ‡πó": "‡∏Ñ‡∏Å‡∏ô‡∏ó.‡πñ‡πó", "‡∏™‡∏ô.‡∏û‡∏ô.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.": "‡∏™‡∏ô.‡∏ú‡∏ö.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.",
    "‡∏™‡∏ô.‡∏û‡∏ö.‡∏™‡∏õ‡∏ó.": "‡∏™‡∏ô.‡∏ú‡∏ö.‡∏™‡∏õ‡∏ó.", "‡∏Å‡∏ß‡∏ï.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ô‡∏ä‡∏ï.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ô‡∏Ç‡∏ï.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏ú‡∏≠.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ú‡∏≠.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏®‡∏ä‡∏¢.‡∏™‡∏õ‡∏ó. ": "‡∏®‡∏®‡∏¢.‡∏™‡∏õ‡∏ó. ", "‡∏£‡∏≠‡∏á ‡∏ú‡∏≠.‡∏Å‡∏û‡∏®.‡∏®‡∏ä‡∏¢.‡∏™‡∏õ‡∏ó.": "‡∏£‡∏≠‡∏á ‡∏ú‡∏≠.‡∏Å‡∏†‡∏®.‡∏®‡∏®‡∏¢.‡∏™‡∏õ‡∏ó.",
    "‡∏™‡∏ö.‡∏ö‡∏Å.‡∏ó‡∏ó. ": "‡∏™‡∏ô.‡∏ö‡∏Å.‡∏ö‡∏Å.‡∏ó‡∏ó. ", "‡∏¢‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ö‡∏Å.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ö‡∏Å.‡∏ó‡∏ó.", "‡∏™‡∏™‡∏Ñ.‡∏ö‡∏Å.‡∏ó‡∏ó.": "‡∏™‡∏•‡∏Å.‡∏ö‡∏Å.‡∏ó‡∏ó.",
    "‡∏™‡∏™‡∏†.‡∏ó‡∏´‡∏≤‡∏£": "‡∏™‡∏™‡∏Å.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ä‡∏ß.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Ç‡∏ß.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ô‡∏ó‡∏ü. ": "‡∏ô‡∏ó‡∏û.", "‡∏Å‡∏ß‡∏†.‡∏®‡∏ä.‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏®‡∏ä.‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏Å‡∏•.‡∏ô‡∏ä‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏Å‡∏•.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ô‡∏ä‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏®‡∏ä‡∏•.‡∏ô‡∏ä‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏®‡∏ã‡∏•.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏õ‡∏ä.‡∏®‡∏ä‡∏ö.‡∏™‡∏™‡∏ó.‡∏ó‡∏£. ": "‡∏Å‡∏õ‡∏ã.‡∏®‡∏ã‡∏ö.‡∏™‡∏™‡∏ó.‡∏ó‡∏£.","‡∏ñ‡∏ß‡∏Å.‡∏®‡∏ä‡∏•.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏•.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", 
    "‡∏Å‡∏®‡∏ä.‡∏™‡∏®‡∏ó.‡∏™‡∏õ‡∏ó.": "‡∏Å‡∏®‡∏©.‡∏™‡∏®‡∏ó.‡∏™‡∏õ‡∏ó.", "‡πÄ‡∏™‡∏£.‡∏™‡∏õ‡∏ó.": "‡πÄ‡∏™‡∏ò.‡∏™‡∏õ‡∏ó.", "‡∏ô‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏£‡∏ò.‡∏ä‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏£‡∏£.‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏™‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏™‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏™‡∏°.‡∏™‡∏ô.‡∏ó‡∏´‡∏≤‡∏£. ": "‡∏Å‡∏™‡∏ö.‡∏™‡∏ö.‡∏ó‡∏´‡∏≤‡∏£. ",
    "‡∏ô‡∏ä‡∏ï.‡∏®‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏ô‡∏Ç‡∏ï.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏ß‡∏à.‡∏®‡∏ä‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ß‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏ú‡∏≠.‡∏®‡∏ä.‡∏õ‡∏ó‡∏´‡∏≤‡∏£": "‡∏ú‡∏≠.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏®‡∏ï‡∏ñ. ": "‡∏®‡∏ï‡∏Å. ", "‡∏™‡∏Ñ‡∏ó.‡∏™‡∏õ‡∏ó.": "‡∏™‡∏®‡∏ó.‡∏™‡∏õ‡∏ó.", "‡∏Å‡∏£‡∏†.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏£‡∏£.‡∏£‡∏õ‡∏†.‡∏®‡∏ò. ": "‡∏£‡∏£.‡∏£‡∏õ‡∏†.‡∏®‡∏£‡∏†.", "‡∏ô‡∏ó‡∏ó.": "‡∏ô‡∏ó‡∏û.", "‡∏Å‡∏£‡∏°‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Ñ‡∏ä‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏ñ‡∏ô‡∏ô‡∏ú‡∏à‡∏á‡∏û‡∏´‡∏≤‡∏£": "‡∏Å‡∏ô‡∏ú.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£", "‡∏™‡∏ß‡∏ú.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£": "‡∏™‡∏ß‡∏ù.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏´‡∏®.‡∏®‡∏™‡∏†.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ù‡∏®.‡∏®‡∏™‡∏†.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏´‡∏°.‡∏ô‡∏Å.‡∏™‡∏õ‡∏ó.": "‡∏Å‡∏ó‡∏î.‡∏ö‡∏Å.‡∏™‡∏õ‡∏ó.", "‡πÄ‡∏•‡∏Ç‡∏≤.‡∏™‡∏õ‡∏ó.": "‡πÄ‡∏™‡∏ò.‡∏™‡∏õ‡∏ó.", "‡∏ú‡∏≠.‡∏ö‡∏ó‡∏ß.‡∏™‡∏õ‡∏ó.": "‡∏ú‡∏≠.‡∏ö‡∏ë‡∏ß.‡∏™‡∏õ‡∏ó.",
    "‡∏Å‡∏à‡∏Å.‡∏™‡∏ô‡∏™. ‡∏Å‡∏°.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏à‡∏Å.‡∏™‡∏ö‡∏™.‡∏Å‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏™‡∏°.‡∏™‡∏ô.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏™‡∏ö.‡∏™‡∏ö.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏û‡∏®.‡∏®‡∏™‡∏†.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ù‡∏®.‡∏®‡∏™‡∏†.‡∏¢‡∏Å.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏ñ‡∏ô‡∏ô‡∏ú.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ô‡∏ú.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£", "‡∏®‡∏ä‡∏ö.‡∏ó‡∏≠.": "‡∏®‡∏ã‡∏ö.‡∏ó‡∏≠.", "‡∏£‡∏£.‡∏£‡∏õ‡∏†.‡∏®‡∏ò.": "‡∏£‡∏£.‡∏£‡∏õ‡∏†.‡∏®‡∏£‡∏†.", "‡∏Å‡∏Ñ‡∏ä.‡∏ö‡∏Å.‡∏ô‡∏ó‡∏û.": "‡∏Å‡∏Å‡∏ä.‡∏ö‡∏Å.‡∏ô‡∏ó‡∏û.",
    "‡∏Å‡∏ö.‡∏™‡∏Ñ‡∏£.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏ö‡∏†.‡∏™‡∏Å‡∏£.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£", "‡∏Å‡∏´.‡∏≠‡∏ï‡πä‡∏≠‡∏î.‡πë‡πê.‡πë":"‡∏Å‡∏´ ‡πê‡πì‡πê‡πë.‡πë‡πê.‡πë","‡∏à‡∏∂‡∏á‡πÄ‡∏™‡∏ô‡∏≠‡∏°‡∏≤‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏û‡∏¥‡∏à‡∏≤‡∏£‡∏ì‡∏≤":"‡∏à‡∏∂‡∏á‡πÄ‡∏™‡∏ô‡∏≠‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏û‡∏¥‡∏à‡∏≤‡∏£‡∏ì‡∏≤",
    "‡πï‡πó‡πñ‡πì‡πô(‡πî‡πó).":"‡πï‡πó‡πí‡πë‡πó‡πî‡πó).","‡πê-‡πí‡πï‡πó‡πí.‡πë‡πó‡πî‡πó.":"‡πê ‡πí‡πï‡πó‡πí ‡πë‡πó‡πî‡πó","‡∏Å‡∏´.‡∏≠‡∏ï‡πä‡∏≠‡∏Å.‡πë‡πê.‡πë":"‡∏Å‡∏´ ‡πê‡πì‡πê‡πë.‡πë‡πê.‡πë","‡∏Å‡∏õ‡∏†.‡πì":"‡∏Å‡∏õ‡∏Å.‡πì","‡∏Å‡∏ò‡∏ñ.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏£‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£","‡∏ú‡∏ä.‡∏ú‡∏≠.‡∏Å‡∏£‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏ú‡∏ä.‡∏ú‡∏≠.‡∏Å‡∏£‡∏†.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£","‡∏Å‡∏ó.‡∏≠‡∏ï‡πä‡∏≠‡∏Å.‡πë":"‡∏Å‡∏´ ‡πê‡πì‡πê‡πë.‡πë‡πê.‡πë","‡∏ú‡∏ä.‡∏ú‡∏≠.‡∏Å‡∏£‡∏Å.‡∏®‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏ú‡∏ä.‡∏ú‡∏≠.‡∏Å‡∏£‡∏†.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏Å.‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏´‡∏Å.‡∏Å‡∏ò‡∏Å.‡∏®‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£","‡∏Å‡∏Å‡∏•.‡∏ô‡∏ä‡∏ö.‡∏ó‡∏´‡∏≤‡∏£":"‡∏Å‡∏Å‡∏•.‡∏ô‡∏ã‡∏ö.‡∏ó‡∏´‡∏≤‡∏£","‡πê.‡πí‡πí‡πó‡πï.‡πï‡πó‡πë‡πñ":"‡πê ‡πí‡πí‡πó‡πï ‡πï‡πó‡πë‡πñ","‡∏≠‡∏¥‡∏•‡∏•‡πå":"‡∏Æ‡∏¥‡∏•‡∏™‡πå","‡πÑ‡∏°‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ä‡∏∑‡πà‡∏≠":"‡πÑ‡∏°‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ä‡∏±‡πâ‡∏ô‡∏¢‡∏®",
    "‡∏ú‡∏≠.‡∏Å‡∏û‡∏®.‡∏®‡∏Ñ‡∏¢.‡∏™‡∏õ‡∏ó.":"‡∏ú‡∏≠.‡∏Å‡∏†‡∏®.‡∏®‡∏®‡∏¢.‡∏™‡∏õ‡∏ó.","‡∏ú‡∏≠.‡∏®‡∏Ñ‡∏¢.‡∏™‡∏õ‡∏ó.":"‡∏ú‡∏≠.‡∏®‡∏®‡∏¢.‡∏™‡∏õ‡∏ó.", "..‡∏™‡∏™‡∏ó.‡∏ó‡∏£.(‡∏®‡∏ã‡∏ö.‡πÇ‡∏ó‡∏£.‡πï‡πó‡πò‡πô)":"‡∏™‡∏™‡∏ó.‡∏ó‡∏£. (‡∏®‡∏ã‡∏ö. ‡πÇ‡∏ó‡∏£.‡πï‡πó‡πò‡πô‡πê)",
    "‡∏Ñ‡∏≤‡∏ô‡∏ë‡πå.‡πñ‡πó":"‡∏Ñ‡∏Å‡∏ô‡∏ó.‡πñ‡πó","‡∏™‡∏ô.‡∏û.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.":"‡∏™‡∏ô.‡∏ú‡∏ö.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.","‡∏™‡∏ô.‡∏ú.‡∏™‡∏õ‡∏ó.":"‡∏™‡∏ô.‡∏ú‡∏ö.‡∏™‡∏õ‡∏ó.","‡∏ú‡∏≠.‡∏Å‡∏û.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.":"‡∏ú‡∏≠.‡∏Å‡∏û‡∏ú.‡∏ß‡∏™‡∏ó.‡∏™‡∏õ‡∏ó.",
    "‡∏ô‡∏≤‡∏¢‡∏ó‡∏´‡∏≤‡∏£‡∏≠‡∏∏‡∏ö‡∏°‡∏¥‡∏ï‡∏¥‡∏Ç‡πà‡∏≤‡∏ß":"‡∏ô‡∏≤‡∏¢‡∏ó‡∏´‡∏≤‡∏£‡∏≠‡∏ô‡∏∏‡∏°‡∏±‡∏ï‡∏¥‡∏Ç‡πà‡∏≤‡∏ß","‡∏Å‡∏£‡∏∞‡∏î‡∏≤‡∏©‡πÄ‡∏ä‡∏¥‡∏ç‡∏Ç‡πà‡∏≤‡∏ß‡∏£‡πà‡∏ß‡∏° (‡∏ó‡∏ó.)":"‡∏Å‡∏£‡∏∞‡∏î‡∏≤‡∏©‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏Ç‡πà‡∏≤‡∏ß‡∏£‡πà‡∏ß‡∏° (‡∏ó‡∏ó.)",
    "‡∏à‡∏∂‡∏á‡πÄ‡∏™‡∏ô‡∏≠‡∏°‡∏≤‡∏£‡∏∞‡∏ö‡∏Å‡∏ß‡∏ô‡πÇ‡∏õ‡∏£‡∏î":"‡∏à‡∏∂‡∏á‡πÄ‡∏™‡∏ô‡∏≠‡∏°‡∏≤‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÇ‡∏õ‡∏£‡∏î", "‡∏•‡∏≤‡∏≠‡∏â‡∏Å.": "‡∏•‡∏≤‡∏≠‡∏≠‡∏Å", "0":"‡πê", "1":"‡πë", "2":"‡πí", "3":"‡πì", "4":"‡πî", "5":"‡πï", "6":"‡πñ", "7":"‡πó", "8":"‡πò", "9":"‡πô"
}

def _normalize_ocr_text(text: str) -> str:
    """Run the whitespace/markup clean-up pass repeatedly until the text stops changing.

    One pass can leave new double spaces behind (e.g. after stripping "---" or "|"),
    so the pass is repeated until a fixed point is reached. Every step either
    shortens the text or rewrites a character irreversibly, so the loop terminates.
    """
    previous = None
    while text != previous:
        previous = text
        # Collapse all whitespace runs (including newlines) into single spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove whitespace that was left in front of a period.
        text = re.sub(r'\s+\.', '.', text)
        # Map typographic quotes to their ASCII equivalents.
        text = text.replace("‚Äú", "\"").replace("‚Äù", "\"")
        text = text.replace("‚Äò", "'").replace("‚Äô", "'")
        # Strip markdown-style rules, emphasis markers, list dashes, headings and table bars.
        text = re.sub(r'\-{3,}', '', text)
        text = re.sub(r'\*{2,}', '', text)
        text = re.sub(r'\s- ', '', text)
        text = text.replace('#', '')
        text = text.replace('|', '')
    return text


def post_process_ocr(ocr_text: str) -> str:
    """Clean raw OCR text and apply the literal corrections in OCR_CORRECTION_MAP.

    Args:
        ocr_text: Raw text returned by the OCR service.

    Returns:
        The normalized, corrected text, or "" when ``ocr_text`` is empty or not a str.

    The text is normalized once up front so that every correction key, including the
    longest one, is matched against single-spaced text. The clean-up is then re-run
    only after a replacement actually changed the text, instead of once per
    dictionary key.
    """
    if not ocr_text or not isinstance(ocr_text, str):
        return ""

    processed_text = _normalize_ocr_text(ocr_text)

    # Longest keys first so a specific correction wins over a shorter one it contains.
    corrections = sorted(
        OCR_CORRECTION_MAP.items(), key=lambda item: len(item[0]), reverse=True
    )
    for wrong_word_key, correct_word_value in corrections:
        if wrong_word_key in processed_text:
            processed_text = processed_text.replace(wrong_word_key, correct_word_value)
            # A replacement may introduce or expose spacing that later keys rely on.
            processed_text = _normalize_ocr_text(processed_text)
    return processed_text

def pdf_to_images(pdf_bytes, dpi=300):
    """Rasterize a PDF given as bytes into page images.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.

    Returns:
        Whatever ``convert_from_bytes`` returns on success; an empty list when the
        conversion raises (the error is printed, not re-raised).
    """
    pages = []
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as exc:
        # Best effort: report the problem and let the caller treat [] as "no pages".
        print(f"‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡πÅ‡∏õ‡∏•‡∏á PDF: {exc}")
    return pages

def run_typhoon_ocr(pdf_path: Path):
    """OCR every page of a PDF through the Typhoon OCR HTTP endpoint.

    Args:
        pdf_path: Location of the PDF file to process.

    Returns:
        A ``(result, seconds)`` tuple. On success ``result`` is
        ``{"text": <pages joined by an END OF PAGE separator>, "page_count": <int>}``
        and ``seconds`` is the wall-clock time spent in the per-page request loop
        (PDF rasterization is not included). On failure ``result`` is
        ``{"error": <message>}`` and ``seconds`` is 0.

    A page whose response has no usable "result" text does not abort the run; an
    "[ERROR on Page N ...]" marker is printed and stored in place of that page.
    """
    try:
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
    except FileNotFoundError:
        return {"error": f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå: {pdf_path}"}, 0

    images = pdf_to_images(pdf_bytes)
    if not images:
        return {"error": "‡∏Å‡∏≤‡∏£‡πÅ‡∏õ‡∏•‡∏á PDF ‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏π‡∏õ‡∏†‡∏≤‡∏û‡∏•‡πâ‡∏°‡πÄ‡∏´‡∏•‡∏ß"}, 0

    full_text = []
    start_time = time.time()
    for i, img in enumerate(images):
        # Encode the page as an in-memory PNG for the multipart upload.
        img_byte_arr = io.BytesIO()
        img.save(img_byte_arr, format='PNG')
        files = {'file': (f'page_{i+1}.png', img_byte_arr.getvalue(), 'image/png')}

        # Transport/HTTP failures are handled separately from JSON decoding: recent
        # `requests` releases raise a JSON error that is also a RequestException, so a
        # shared try-block would report bad JSON as a connection error.
        try:
            response = requests.post(TYPHOON_OCR_ENDPOINT, files=files, timeout=180)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return {"error": f"API Connection Error: {e}"}, 0

        try:
            ocr_result = response.json()
        except ValueError:
            return {"error": f"API did not return valid JSON. Response: {response.text}"}, 0

        # Guard against a JSON payload that is not an object (e.g. a list or string).
        result_text = ocr_result.get("result") if isinstance(ocr_result, dict) else None
        if result_text and isinstance(result_text, str) and result_text.strip():
            full_text.append(result_text.strip())
        else:
            error_msg = f"[ERROR on Page {i+1}: API did not return valid text in 'result' key. Response: {ocr_result}]"
            print(f"     - {error_msg}")
            full_text.append(error_msg)

    exec_time = time.time() - start_time
    separator = "\n\n" + "="*20 + " END OF PAGE " + "="*20 + "\n\n"
    final_text = separator.join(full_text)

    return {"text": final_text, "page_count": len(images)}, exec_time


def calculate_metrics(hypothesis: str, reference: str):
    """Compute word and character error rates of ``hypothesis`` against ``reference``.

    Args:
        hypothesis: Text produced by the OCR pipeline.
        reference: Ground-truth text.

    Returns:
        ``{"wer": <value>, "cer": <value>}`` as returned by ``jiwer.wer`` / ``jiwer.cer``.
        Both values are ``None`` when the reference is empty or whitespace-only, or
        when the metric computation raises.
    """
    if not reference or not reference.strip():
        return {"wer": None, "cer": None}

    # Both texts go through the same normalization before scoring.
    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
    ])

    try:
        reference_clean = transformation(reference)
        hypothesis_clean = transformation(hypothesis)

        wer = jiwer.wer(reference_clean, hypothesis_clean)
        cer = jiwer.cer(reference_clean, hypothesis_clean)
        return {"wer": wer, "cer": cer}

    except Exception as e:
        # Still best-effort, but report the failure so a None metric can be traced.
        print(f"     - WARNING: metric calculation failed: {e}")
        return {"wer": None, "cer": None}

def safe_percent_format(val):
    """Render a numeric value as a percentage with two decimals, else "N/A".

    Only ``int``/``float`` instances (including subclasses) that are not NaN are
    formatted; anything else, including ``None`` and strings, yields "N/A".
    """
    is_number = isinstance(val, (int, float))
    if not is_number or pd.isna(val):
        return "N/A"
    return f"{val:.2%}"

## Internal Letter

In [3]:
# Focus on Raw OCR-Internal Letter
# Scores the raw (not post-processed) OCR output of every PDF in PDF_LETTER_DIR
# against its ground-truth transcription and renders a summary table plus details.

pdf_files = sorted(list(PDF_LETTER_DIR.glob("*.pdf")))

if not pdf_files:
    display(Markdown(f"<font color='red'>**‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå PDF ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå `{PDF_LETTER_DIR}` ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤ Path ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á**</font>"))
else:
    display(Markdown(f"### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(pdf_files)} ‡πÑ‡∏ü‡∏•‡πå"))
    
    results_list = []
    
    for pdf_path in tqdm(pdf_files, desc="‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå"):

        # Run OCR; on failure the result dict carries "error" instead of "text".
        ocr_result, exec_time = run_typhoon_ocr(pdf_path)
        raw_ocr_text = ocr_result.get("text", "")
        if "error" in ocr_result:
            # Surface the failure; otherwise the file is silently scored on empty text.
            print(f"   - OCR ERROR: {ocr_result['error']}")
        
        # Load the ground truth that shares the PDF's stem.
        pdf_stem = pdf_path.stem
        gt_path = GROUND_TRUTH_DIR / f"{pdf_stem}.txt"
        print(f"   - PDF Stem: '{pdf_stem}'")
        print(f"   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: '{gt_path}'")
        
        ground_truth_text = ""
        if gt_path.is_file():
            print(f"   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!")
            try:
                ground_truth_text = gt_path.read_text(encoding="utf-8").strip()
                if ground_truth_text:
                    print(f"     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ {len(ground_truth_text)} ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£")
                else:
                    print(f"     - WARNING: ‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡πÅ‡∏ï‡πà‡πÑ‡∏ü‡∏•‡πå‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡∏•‡πà‡∏≤!")
            except Exception as e:
                print(f"     - ‚ùå ERROR: ‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå: {e}")
        else:
            print(f"   - ‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth")
        
        # Metrics calculation
        metrics = calculate_metrics(raw_ocr_text, ground_truth_text)

        results_list.append({
            "File": pdf_path.name,
            "Time (s)": exec_time,
            "WER": metrics.get('wer'), 
            "CER": metrics.get('cer'), 
            "Raw OCR Text": raw_ocr_text,
            "Ground Truth Text": ground_truth_text 
        })
        

    df_results = pd.DataFrame(results_list)

    df_results["WAcc"] = 1 - df_results["WER"].fillna(1)  # a missing WER counts as 100% error
    df_results["CAcc"] = 1 - df_results["CER"].fillna(1)  # a missing CER counts as 100% error
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö (Raw OCR Accuracy)"))

    display_cols = ["File", "Time (s)", "CER", "WER", "CAcc", "WAcc"]

    # True when at least one file produced a real CER, i.e. some ground truth was scored.
    has_numeric_metrics = pd.to_numeric(df_results["CER"], errors='coerce').notna().any()

    # Uses the notebook-level safe_percent_format helper (no local redefinition).
    styler = df_results[display_cols].style.format({
        "Time (s)": "{:.2f}",
        "CER": safe_percent_format,
        "WER": safe_percent_format,
        "CAcc": safe_percent_format,
        "WAcc": safe_percent_format,
    }).set_properties(**{'text-align': 'left'}).set_table_styles(
        [dict(selector='th', props=[('text-align', 'left')])]
    )

    if has_numeric_metrics:
        styler = styler.background_gradient(cmap='RdYlGn_r', subset=["CER", "WER"])
        styler = styler.background_gradient(cmap='YlGn', subset=["CAcc", "WAcc"])
    
    display(styler)
    display(Markdown(f"### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°"))
    
    avg_metrics = df_results[["Time (s)", "CAcc", "WAcc"]].mean()

    display(Markdown(f"- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['Time (s)']:.2f}` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå"))
    # CAcc is never NaN after fillna(1), so the "no ground truth" case has to be
    # detected from the raw metrics instead of from the averaged accuracy.
    if has_numeric_metrics:
        display(Markdown(f"- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['CAcc']:.2%}`"))
        display(Markdown(f"- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['WAcc']:.2%}`"))
    else:
        display(Markdown(f"- **Accuracy:** `N/A (‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Ground Truth)`"))

    # OCR result display
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö (Raw OCR vs Ground Truth)"))
    
    for _, row in df_results.iterrows():
        display(Markdown(f"---"))
        display(Markdown(f"#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `{row['File']}`"))

        # Escape the texts so "<", ">" and "&" in OCR output are shown literally
        # instead of being interpreted as markup.
        raw_html = html.escape(str(row['Raw OCR Text']))
        truth_html = html.escape(str(row['Ground Truth Text']))

        html_output = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå Raw OCR</summary>
            <div style="background-color:#f7f7f7; border:1px solid #ddd; padding:10px; margin-top:10px; white-space: pre-wrap; word-wrap: break-word;">{raw_html}</div>
        </details>
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π Ground Truth</summary>
            <div style="background-color:#e6f7ff; border:1px solid #b3e0ff; padding:10px; margin-top:5px; white-space: pre-wrap; word-wrap: break-word;">{truth_html}</div>
        </details>
        """
        display(HTML(html_output))

### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 10 ‡πÑ‡∏ü‡∏•‡πå

‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå:   0%|          | 0/10 [00:00<?, ?it/s]

   - PDF Stem: '000-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/000-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 869 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '001-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/001-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 773 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '003-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/003-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1127 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '005-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/005-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1338 ‡∏ï‡∏±‡

<hr>

### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö (Raw OCR Accuracy)

Unnamed: 0,File,Time (s),CER,WER,CAcc,WAcc
0,000-2.pdf,7.64,4.26%,24.59%,95.74%,75.41%
1,001-2.pdf,7.16,6.77%,42.00%,93.23%,58.00%
2,003-2.pdf,10.16,11.57%,34.07%,88.43%,65.93%
3,005-2.pdf,11.62,2.59%,39.51%,97.41%,60.49%
4,008-2.pdf,6.18,4.63%,30.00%,95.37%,70.00%
5,009-2.pdf,14.45,6.93%,24.40%,93.07%,75.60%
6,010-2.pdf,5.4,3.52%,25.00%,96.48%,75.00%
7,012-2.pdf,6.75,12.10%,53.62%,87.90%,46.38%
8,013-2.pdf,7.21,5.37%,33.93%,94.63%,66.07%
9,015-2.pdf,8.47,6.09%,40.91%,93.91%,59.09%


### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°

- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `8.50` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå

- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `93.62%`

- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `65.20%`

<hr>

### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö (Raw OCR vs Ground Truth)

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `000-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `001-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `003-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `005-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `008-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `009-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `010-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `012-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `013-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `015-2.pdf`

In [4]:
# Focus on Processed OCR-Internal Letter
# Scores the post-processed OCR output of every PDF in PDF_LETTER_DIR against its
# ground-truth transcription and renders a summary table plus details.

pdf_files = sorted(list(PDF_LETTER_DIR.glob("*.pdf")))

if not pdf_files:
    display(Markdown(f"<font color='red'>**‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå PDF ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå `{PDF_LETTER_DIR}` ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤ Path ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á**</font>"))
else:
    display(Markdown(f"### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(pdf_files)} ‡πÑ‡∏ü‡∏•‡πå"))
    
    results_list = []
    
    for pdf_path in tqdm(pdf_files, desc="‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå"):

        # Run OCR; on failure the result dict carries "error" instead of "text".
        ocr_result, exec_time = run_typhoon_ocr(pdf_path)
        raw_ocr_text = ocr_result.get("text", "")
        if "error" in ocr_result:
            # Surface the failure; otherwise the file is silently scored on empty text.
            print(f"   - OCR ERROR: {ocr_result['error']}")
        
        # Apply the dictionary-based post-processing.
        processed_ocr_text = post_process_ocr(raw_ocr_text)
        
        # Load the ground truth that shares the PDF's stem.
        pdf_stem = pdf_path.stem
        gt_path = GROUND_TRUTH_DIR / f"{pdf_stem}.txt"
        
        ground_truth_text = ""
        if gt_path.is_file(): 
            print(f"   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!")
            try:
                try:
                    ground_truth_text = gt_path.read_text(encoding="utf-8").strip()
                except UnicodeDecodeError:
                    # Fall back to the legacy Thai encoding when the file is not UTF-8.
                    print("     - WARNING: ‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏≠‡πà‡∏≤‡∏ô‡πÅ‡∏ö‡∏ö utf-8 ‡πÑ‡∏î‡πâ ‡∏•‡∏≠‡∏á‡∏≠‡πà‡∏≤‡∏ô‡πÅ‡∏ö‡∏ö TIS-620")
                    ground_truth_text = gt_path.read_text(encoding="tis-620").strip()

                if ground_truth_text:
                    print(f"     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ {len(ground_truth_text)} ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£")
                else:
                    print(f"     - WARNING: ‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡πÅ‡∏ï‡πà‡πÑ‡∏ü‡∏•‡πå‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡∏•‡πà‡∏≤!")
            except Exception as e:
                print(f"     - ‚ùå ERROR: ‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå: {e}")
        else:
            print(f"   - ‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth ‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÑ‡∏ü‡∏•‡πå‡∏õ‡∏Å‡∏ï‡∏¥ (‡∏≠‡∏≤‡∏à‡πÄ‡∏õ‡πá‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå?)")
        
        # Metrics before and after post-processing
        metrics_before = calculate_metrics(raw_ocr_text, ground_truth_text)
        metrics_after = calculate_metrics(processed_ocr_text, ground_truth_text)
        
        results_list.append({
            "File": pdf_path.name,
            "Time (s)": exec_time,
            "WER (Before)": metrics_before['wer'],
            "CER (Before)": metrics_before['cer'],
            "WER (After)": metrics_after['wer'],
            "CER (After)": metrics_after['cer'],
            "Raw OCR Text": raw_ocr_text,
            "Processed OCR Text": processed_ocr_text,
            "Ground Truth Text": ground_truth_text
        })
        
    df_results = pd.DataFrame(results_list)
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏ø‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö".replace("‡∏ø", "‡∏õ")))
    # A missing error rate counts as 100% error, matching the raw-OCR cells.
    # fillna(0) would have scored files without metrics as 100% accurate.
    df_results["WAcc (After)"] = 1 - df_results["WER (After)"].fillna(1)
    df_results["CAcc (After)"] = 1 - df_results["CER (After)"].fillna(1)

    display_cols = [
        "File", "Time (s)", 
        "CER (After)", "WER (After)", 
        "CAcc (After)", "WAcc (After)"
    ]

    # True when at least one file produced a real CER, i.e. some ground truth was scored.
    has_numeric_metrics = pd.to_numeric(df_results["CER (After)"], errors='coerce').notna().any()

    # Format the table
    styler = df_results[display_cols].style.format({
        "Time (s)": "{:.2f}",
        "CER (After)": safe_percent_format,
        "WER (After)": safe_percent_format,
        "CAcc (After)": safe_percent_format, 
        "WAcc (After)": safe_percent_format, 
    }).set_properties(**{'text-align': 'left'}).set_table_styles(
        [dict(selector='th', props=[('text-align', 'left')])]
    )

    # Colour gradients need numeric data; skip them when every metric is missing.
    if has_numeric_metrics:
        styler = styler.background_gradient(
            cmap='RdYlGn_r', 
            subset=["CER (After)", "WER (After)"]
        ).background_gradient(
            cmap='YlGn', 
            subset=["CAcc (After)", "WAcc (After)"]
        )

    display(styler)
    display(Markdown(f"### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°"))

    # Average only the columns that are reported and guaranteed to be numeric.
    avg_metrics = df_results[["Time (s)", "CAcc (After)", "WAcc (After)"]].mean()
    display(Markdown(f"- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['Time (s)']:.2f}` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå"))

    if has_numeric_metrics:
        display(Markdown(f"- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['CAcc (After)']:.2%}` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£)"))
        display(Markdown(f"- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['WAcc (After)']:.2%}` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏≥)"))
    else:
        display(Markdown(f"- **Accuracy:** `N/A (‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Ground Truth)`"))
        
    # OCR result display
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR ‡∏â‡∏ö‡∏±‡∏ö‡πÄ‡∏ï‡πá‡∏° (‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡πà‡∏≠‡∏ô-‡∏´‡∏•‡∏±‡∏á Post-processing)"))

    for _, row in df_results.iterrows():
        display(Markdown(f"---"))
        display(Markdown(f"#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `{row['File']}`"))

        # Escape the texts so "<", ">" and "&" are shown literally, not parsed as markup.
        processed_html = html.escape(str(row['Processed OCR Text']))
        truth_html = html.escape(str(row['Ground Truth Text']))

        # Expander for the post-processed OCR text
        html_raw = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR (Processed Text)</summary>
            <div style="background-color:#f7f7f7; border:1px solid #ddd; padding:10px; margin-top:10px; white-space: pre-wrap; word-wrap: break-word;">{processed_html}</div>
        </details>
        """
        display(HTML(html_raw))

        # Expander for the ground-truth text
        html_processed = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π Ground Truth Text</summary>
            <div style="background-color:#e6ffed; border:1px solid #b7e1cd; padding:10px; margin-top:5px; white-space: pre-wrap; word-wrap: break-word;">{truth_html}</div>
        </details>
        """
        display(HTML(html_processed))

### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 10 ‡πÑ‡∏ü‡∏•‡πå

‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå:   0%|          | 0/10 [00:00<?, ?it/s]

   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 869 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 773 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1127 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1338 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 846 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1836 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 726 

<hr>

### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö

Unnamed: 0,File,Time (s),CER (After),WER (After),CAcc (After),WAcc (After)
0,000-2.pdf,7.85,1.83%,18.03%,98.17%,81.97%
1,001-2.pdf,5.17,6.49%,40.00%,93.51%,60.00%
2,003-2.pdf,10.11,10.34%,28.57%,89.66%,71.43%
3,005-2.pdf,9.26,3.06%,33.33%,96.94%,66.67%
4,008-2.pdf,5.74,0.50%,5.00%,99.50%,95.00%
5,009-2.pdf,11.16,6.93%,27.38%,93.07%,72.62%
6,010-2.pdf,6.15,0.44%,5.77%,99.56%,94.23%
7,012-2.pdf,6.36,8.98%,46.38%,91.02%,53.62%
8,013-2.pdf,9.31,3.68%,17.86%,96.32%,82.14%
9,015-2.pdf,9.08,5.25%,39.77%,94.75%,60.23%


### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°

- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `8.02` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå

- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `95.25%` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£)

- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `73.79%` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏≥)

<hr>

### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR ‡∏â‡∏ö‡∏±‡∏ö‡πÄ‡∏ï‡πá‡∏° (‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡πà‡∏≠‡∏ô-‡∏´‡∏•‡∏±‡∏á Post-processing)

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `000-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `001-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `003-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `005-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `008-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `009-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `010-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `012-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `013-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `015-2.pdf`

## RTARF Memo

In [5]:
# Focus on Raw OCR-Public Board
# Scores the raw (not post-processed) OCR output of every PDF in PDF_BOARD_DIR
# against its ground-truth transcription and renders a summary table plus details.

pdf_files = sorted(list(PDF_BOARD_DIR.glob("*.pdf")))

if not pdf_files:
    display(Markdown(f"<font color='red'>**‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå PDF ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå `{PDF_BOARD_DIR}` ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤ Path ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á**</font>"))
else:
    display(Markdown(f"### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(pdf_files)} ‡πÑ‡∏ü‡∏•‡πå"))
    
    results_list = []
    
    for pdf_path in tqdm(pdf_files, desc="‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå"):

        # Run OCR; on failure the result dict carries "error" instead of "text".
        ocr_result, exec_time = run_typhoon_ocr(pdf_path)
        raw_ocr_text = ocr_result.get("text", "")
        if "error" in ocr_result:
            # Surface the failure; otherwise the file is silently scored on empty text.
            print(f"   - OCR ERROR: {ocr_result['error']}")
        
        # Load the ground truth that shares the PDF's stem.
        pdf_stem = pdf_path.stem
        gt_path = GROUND_TRUTH_DIR / f"{pdf_stem}.txt"
        print(f"   - PDF Stem: '{pdf_stem}'")
        print(f"   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: '{gt_path}'")
        
        ground_truth_text = ""
        if gt_path.is_file():
            print(f"   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!")
            try:
                ground_truth_text = gt_path.read_text(encoding="utf-8").strip()
                if ground_truth_text:
                    print(f"     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ {len(ground_truth_text)} ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£")
                else:
                    print(f"     - WARNING: ‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡πÅ‡∏ï‡πà‡πÑ‡∏ü‡∏•‡πå‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡∏•‡πà‡∏≤!")
            except Exception as e:
                print(f"     - ‚ùå ERROR: ‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå: {e}")
        else:
            print(f"   - ‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth")
        
        # Metrics calculation
        metrics = calculate_metrics(raw_ocr_text, ground_truth_text)

        results_list.append({
            "File": pdf_path.name,
            "Time (s)": exec_time,
            "WER": metrics.get('wer'), 
            "CER": metrics.get('cer'), 
            "Raw OCR Text": raw_ocr_text,
            "Ground Truth Text": ground_truth_text 
        })
        

    df_results = pd.DataFrame(results_list)

    df_results["WAcc"] = 1 - df_results["WER"].fillna(1)  # a missing WER counts as 100% error
    df_results["CAcc"] = 1 - df_results["CER"].fillna(1)  # a missing CER counts as 100% error
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö (Raw OCR Accuracy)"))

    display_cols = ["File", "Time (s)", "CER", "WER", "CAcc", "WAcc"]

    # True when at least one file produced a real CER, i.e. some ground truth was scored.
    has_numeric_metrics = pd.to_numeric(df_results["CER"], errors='coerce').notna().any()

    # Uses the notebook-level safe_percent_format helper (no local redefinition).
    styler = df_results[display_cols].style.format({
        "Time (s)": "{:.2f}",
        "CER": safe_percent_format,
        "WER": safe_percent_format,
        "CAcc": safe_percent_format,
        "WAcc": safe_percent_format,
    }).set_properties(**{'text-align': 'left'}).set_table_styles(
        [dict(selector='th', props=[('text-align', 'left')])]
    )

    if has_numeric_metrics:
        styler = styler.background_gradient(cmap='RdYlGn_r', subset=["CER", "WER"])
        styler = styler.background_gradient(cmap='YlGn', subset=["CAcc", "WAcc"])
    
    display(styler)
    display(Markdown(f"### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°"))
    
    avg_metrics = df_results[["Time (s)", "CAcc", "WAcc"]].mean()

    display(Markdown(f"- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['Time (s)']:.2f}` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå"))
    # CAcc is never NaN after fillna(1), so the "no ground truth" case has to be
    # detected from the raw metrics instead of from the averaged accuracy.
    if has_numeric_metrics:
        display(Markdown(f"- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['CAcc']:.2%}`"))
        display(Markdown(f"- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['WAcc']:.2%}`"))
    else:
        display(Markdown(f"- **Accuracy:** `N/A (‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Ground Truth)`"))

    # OCR result display
    display(Markdown(f"<hr>"))
    display(Markdown(f"### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö (Raw OCR vs Ground Truth)"))
    
    for _, row in df_results.iterrows():
        display(Markdown(f"---"))
        display(Markdown(f"#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `{row['File']}`"))

        # Escape the texts so "<", ">" and "&" in OCR output are shown literally
        # instead of being interpreted as markup.
        raw_html = html.escape(str(row['Raw OCR Text']))
        truth_html = html.escape(str(row['Ground Truth Text']))

        html_output = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå Raw OCR</summary>
            <div style="background-color:#f7f7f7; border:1px solid #ddd; padding:10px; margin-top:10px; white-space: pre-wrap; word-wrap: break-word;">{raw_html}</div>
        </details>
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π Ground Truth</summary>
            <div style="background-color:#e6f7ff; border:1px solid #b3e0ff; padding:10px; margin-top:5px; white-space: pre-wrap; word-wrap: break-word;">{truth_html}</div>
        </details>
        """
        display(HTML(html_output))

### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 10 ‡πÑ‡∏ü‡∏•‡πå

‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå:   0%|          | 0/10 [00:00<?, ?it/s]

   - PDF Stem: '002-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/002-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1537 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '004-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/004-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1468 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '006-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/006-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1564 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - PDF Stem: '007-2'
   - ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≠‡∏á‡∏´‡∏≤ Ground Truth ‡∏ó‡∏µ‡πà: 'ground_truth/007-2.txt'
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 2846 ‡∏ï‡∏

<hr>

### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö (Raw OCR Accuracy)

Unnamed: 0,File,Time (s),CER,WER,CAcc,WAcc
0,002-2.pdf,13.09,23.68%,67.02%,76.32%,32.98%
1,004-2.pdf,14.1,15.84%,58.82%,84.16%,41.18%
2,006-2.pdf,11.44,12.88%,41.94%,87.12%,58.06%
3,007-2.pdf,17.28,2.13%,15.59%,97.87%,84.41%
4,014-2.pdf,10.77,37.96%,56.59%,62.04%,43.41%
5,016-2.pdf,12.44,21.67%,74.47%,78.33%,25.53%
6,017-2.pdf,7.99,71.00%,83.33%,29.00%,16.67%
7,018-2.pdf,5.78,56.31%,65.38%,43.69%,34.62%
8,019-2.pdf,14.07,16.13%,51.97%,83.87%,48.03%
9,021-2.pdf,15.22,11.18%,37.31%,88.82%,62.69%


### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°

- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `12.22` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå

- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `73.12%`

- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `44.76%`

<hr>

### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö (Raw OCR vs Ground Truth)

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `002-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `004-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `006-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `007-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `014-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `016-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `017-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `018-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `019-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `021-2.pdf`

In [6]:
# Focus on Processed OCR-Public Board
#
# For every PDF in PDF_BOARD_DIR: run Typhoon OCR, apply post_process_ocr, read
# the matching ground-truth file from GROUND_TRUTH_DIR and score both the raw
# and the post-processed text with calculate_metrics. The summary table, the
# averages and the per-file expanders report the post-processed ("After")
# results; the "Before" scores are only stored in df_results.

# Local import: OCR / ground-truth text is embedded into HTML further down.
from html import escape as html_escape

pdf_files = sorted(PDF_BOARD_DIR.glob("*.pdf"))

if not pdf_files:
    display(Markdown(f"<font color='red'>**‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå PDF ‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå `{PDF_BOARD_DIR}` ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏ß‡πà‡∏≤ Path ‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á**</font>"))
else:
    display(Markdown(f"### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(pdf_files)} ‡πÑ‡∏ü‡∏•‡πå"))

    results_list = []

    for pdf_path in tqdm(pdf_files, desc="‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå"):

        # Run OCR. The second return value is reported in the "Time (s)" column
        # (presumably elapsed seconds -- confirm against run_typhoon_ocr).
        ocr_result, exec_time = run_typhoon_ocr(pdf_path)
        raw_ocr_text = ocr_result.get("text", "")

        # Post-processing
        processed_ocr_text = post_process_ocr(raw_ocr_text)

        # Load the ground truth that shares the PDF's stem (e.g. 002-2.pdf -> 002-2.txt).
        pdf_stem = pdf_path.stem
        gt_path = GROUND_TRUTH_DIR / f"{pdf_stem}.txt"

        # Stays empty when the file is missing, unreadable or blank.
        ground_truth_text = ""
        if gt_path.is_file():
            print(f"   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!")
            try:
                try:
                    ground_truth_text = gt_path.read_text(encoding="utf-8").strip()
                except UnicodeDecodeError:
                    # Fall back to the legacy Thai encoding when the file is not UTF-8.
                    print("     - WARNING: ‡πÑ‡∏°‡πà‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏≠‡πà‡∏≤‡∏ô‡πÅ‡∏ö‡∏ö utf-8 ‡πÑ‡∏î‡πâ ‡∏•‡∏≠‡∏á‡∏≠‡πà‡∏≤‡∏ô‡πÅ‡∏ö‡∏ö TIS-620")
                    ground_truth_text = gt_path.read_text(encoding="tis-620").strip()

                if ground_truth_text:
                    print(f"     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ {len(ground_truth_text)} ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£")
                else:
                    print(f"     - WARNING: ‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡πÅ‡∏ï‡πà‡πÑ‡∏ü‡∏•‡πå‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏õ‡∏•‡πà‡∏≤!")
            except Exception as e:
                # Best effort: report the problem and continue with empty ground truth.
                print(f"     - ‚ùå ERROR: ‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå: {e}")
        else:
            print(f"   - ‚ùå ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth ‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÑ‡∏ü‡∏•‡πå‡∏õ‡∏Å‡∏ï‡∏¥ (‡∏≠‡∏≤‡∏à‡πÄ‡∏õ‡πá‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå?)")

        # Metrics calculation (raw vs. post-processed, against the same ground truth)
        metrics_before = calculate_metrics(raw_ocr_text, ground_truth_text)
        metrics_after = calculate_metrics(processed_ocr_text, ground_truth_text)

        results_list.append({
            "File": pdf_path.name,
            "Time (s)": exec_time,
            "WER (Before)": metrics_before['wer'],
            "CER (Before)": metrics_before['cer'],
            "WER (After)": metrics_after['wer'],
            "CER (After)": metrics_after['cer'],
            "Raw OCR Text": raw_ocr_text,
            "Processed OCR Text": processed_ocr_text,
            "Ground Truth Text": ground_truth_text
        })

    df_results = pd.DataFrame(results_list)
    display(Markdown("<hr>"))
    display(Markdown(f"### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö"))

    # Force the error-rate columns to float so a missing score (None) becomes
    # NaN even when every row lacks ground truth (object-dtype column).
    error_cols = ["CER (After)", "WER (After)"]
    accuracy_cols = ["CAcc (After)", "WAcc (After)"]
    for col in error_cols:
        df_results[col] = pd.to_numeric(df_results[col], errors="coerce")

    # True when at least one file could actually be scored.
    has_numeric_metrics = df_results["CER (After)"].notna().any()

    # A missing error rate counts as 100% error (accuracy 0), matching the
    # raw-OCR summary cell; filling with 0 would report a perfect score for
    # files that have no ground truth and inflate the averages.
    df_results["WAcc (After)"] = 1 - df_results["WER (After)"].fillna(1)
    df_results["CAcc (After)"] = 1 - df_results["CER (After)"].fillna(1)

    display_cols = ["File", "Time (s)", *error_cols, *accuracy_cols]

    # Format data. na_rep renders missing scores as "N/A" without relying on a
    # formatter helper defined inside another cell's conditional branch.
    column_formats = {"Time (s)": "{:.2f}"}
    column_formats.update({col: "{:.2%}" for col in error_cols + accuracy_cols})

    styler = (
        df_results[display_cols]
        .style
        .format(column_formats, na_rep="N/A")
        .set_properties(**{'text-align': 'left'})
        .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
    )

    # Colour gradients only make sense when there is something numeric to shade.
    if has_numeric_metrics:
        styler = styler.background_gradient(cmap='RdYlGn_r', subset=error_cols)
        styler = styler.background_gradient(cmap='YlGn', subset=accuracy_cols)

    display(styler)
    display(Markdown(f"### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°"))

    avg_metrics = df_results[["Time (s)", *accuracy_cols]].mean()
    display(Markdown(f"- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['Time (s)']:.2f}` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå"))

    # The accuracy columns are never NaN after fillna, so the "no ground truth"
    # message has to key off whether any file was scored at all.
    if has_numeric_metrics:
        display(Markdown(f"- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['CAcc (After)']:.2%}` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£)"))
        display(Markdown(f"- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `{avg_metrics['WAcc (After)']:.2%}` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏≥)"))
    else:
        display(Markdown(f"- **Accuracy:** `N/A (‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Ground Truth)`"))

    # OCR Test Display
    display(Markdown("<hr>"))
    display(Markdown(f"### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR ‡∏â‡∏ö‡∏±‡∏ö‡πÄ‡∏ï‡πá‡∏° (‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡πà‡∏≠‡∏ô-‡∏´‡∏•‡∏±‡∏á Post-processing)"))

    for _, row in df_results.iterrows():
        display(Markdown("---"))
        display(Markdown(f"#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `{row['File']}`"))

        # Escape the text before embedding it: OCR output and ground truth are
        # arbitrary text, and a stray "<", "&" or tag would otherwise be
        # interpreted as markup and corrupt the rendered comparison.
        processed_text_html = html_escape(str(row['Processed OCR Text']))
        ground_truth_html = html_escape(str(row['Ground Truth Text']))

        # Expander for the post-processed OCR text
        html_processed = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR (Processed Text)</summary>
            <div style="background-color:#f7f7f7; border:1px solid #ddd; padding:10px; margin-top:10px; white-space: pre-wrap; word-wrap: break-word;">{processed_text_html}</div>
        </details>
        """
        display(HTML(html_processed))

        # Expander for the ground-truth text
        html_ground_truth = f"""
        <details>
            <summary>‡∏Ñ‡∏•‡∏¥‡∏Å‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏π Ground Truth Text</summary>
            <div style="background-color:#e6ffed; border:1px solid #b7e1cd; padding:10px; margin-top:5px; white-space: pre-wrap; word-wrap: break-word;">{ground_truth_html}</div>
        </details>
        """
        display(HTML(html_ground_truth))

### üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏∞‡∏ö‡∏ö OCR ‡∏Å‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 10 ‡πÑ‡∏ü‡∏•‡πå

‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÑ‡∏ü‡∏•‡πå:   0%|          | 0/10 [00:00<?, ?it/s]

   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1537 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1468 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1564 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 2846 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1835 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1537 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£
   - ‚úÖ ‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå Ground Truth!
     - SUCCESS: ‡∏≠‡πà‡∏≤‡∏ô‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‡∏°‡∏µ 1

<hr>

### üìä ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö

Unnamed: 0,File,Time (s),CER (After),WER (After),CAcc (After),WAcc (After)
0,002-2.pdf,16.52,17.50%,57.45%,82.50%,42.55%
1,004-2.pdf,11.88,12.03%,42.86%,87.97%,57.14%
2,006-2.pdf,9.69,16.30%,41.13%,83.70%,58.87%
3,007-2.pdf,22.02,1.71%,17.20%,98.29%,82.80%
4,014-2.pdf,10.1,38.70%,58.24%,61.30%,41.76%
5,016-2.pdf,9.09,17.92%,46.81%,82.08%,53.19%
6,017-2.pdf,6.5,71.63%,81.75%,28.37%,18.25%
7,018-2.pdf,5.81,55.81%,69.23%,44.19%,30.77%
8,019-2.pdf,14.62,11.79%,44.74%,88.21%,55.26%
9,021-2.pdf,11.6,9.33%,38.81%,90.67%,61.19%


### üìà ‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏£‡∏ß‡∏°

- **‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `11.78` ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ/‡πÑ‡∏ü‡∏•‡πå

- **Character Accuracy (CAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `74.73%` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£)

- **Word Accuracy (WAcc) ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢:** `50.18%` (‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏≥)

<hr>

### üìù ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå OCR ‡∏â‡∏ö‡∏±‡∏ö‡πÄ‡∏ï‡πá‡∏° (‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡πà‡∏≠‡∏ô-‡∏´‡∏•‡∏±‡∏á Post-processing)

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `002-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `004-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `006-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `007-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `014-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `016-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `017-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `018-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `019-2.pdf`

---

#### üìÑ **‡πÑ‡∏ü‡∏•‡πå:** `021-2.pdf`