# Korean OCR Preprocessing Method Comparison
This notebook compares different preprocessing methods for Korean OCR using PaddleOCR.

In [None]:
# SET UP & PACKAGES
from paddleocr import PaddleOCR
import os
import glob
import json
import cv2
import numpy as np
import pandas as pd
import subprocess
from pathlib import Path
import random
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# CONFIGURATION PATHS
# NOTE(review): every location below is an absolute path on one machine;
# adjust them before running the notebook anywhere else.

# --- Inputs -----------------------------------------------------------------
# Folder holding the raw images (file names are expected to match the ground truth).
image_path = "/home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images"
# Folder holding the ground-truth text files.
groundtruth_path = "/home/phuc/Desktop/Project/Chung-Innovation/DATA_SET/ORIGINAL_TEXT"

# --- Outputs ----------------------------------------------------------------
# Folder receiving the preprocessed images.
preproc_path = "/home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_preprocessing"
# Folder receiving the plain-text OCR results.
txt_path = "/home/phuc/Desktop/Project/Chung-Innovation/bill_test/baseline_bill_txt/"
# Folder receiving the raw model output.
output_path = "/home/phuc/Desktop/Project/Chung-Innovation/bill_test/baseline_bill_output/"

# Make sure the preprocessing and text folders exist up front.
os.makedirs(preproc_path, exist_ok=True)
os.makedirs(txt_path, exist_ok=True)

# PaddleOCR Model Class

In [None]:
class PaddleOCRmodel:
    """Small wrapper around a PaddleOCR pipeline that saves its results to disk."""

    def __init__(self, lang="korean"):
        """
        Initializes the PaddleOCR model.

        The pipeline is created with ``use_doc_orientation_classify``,
        ``use_doc_unwarping`` and ``use_textline_orientation`` all set to False.

        Args:
            lang (str): The language for OCR (default is "korean").
        """
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang=lang
        )

    def run_ocr_and_save_results(self, image_files, output_dir, txt_output_dir=None):
        """
        Runs PaddleOCR on a list of image files and saves the results.

        Only the first result returned for each image is saved. A failure on
        one image is reported and does not stop the remaining images.

        Args:
            image_files (list): A list of paths to the image files to process.
            output_dir (str): The directory to save the OCR results (JSON and image with boxes).
            txt_output_dir (str, optional): If given, this directory is created;
                this method does not write any text files into it itself.

        Returns:
            None
        """
        if not image_files:
            print("No image files provided for OCR processing.")
            return

        total = len(image_files)
        print(f"üó®Ô∏è Processing {total} image files for OCR.")
        os.makedirs(output_dir, exist_ok=True)
        if txt_output_dir:
            os.makedirs(txt_output_dir, exist_ok=True)

        for position, image_file in enumerate(image_files, start=1):
            print(f"\nProcessing: {position}/{total}")
            try:
                result = self.ocr.predict(input=image_file)
                base_name = os.path.splitext(os.path.basename(image_file))[0]
                save_target = os.path.join(output_dir, base_name)

                # One result per image is assumed, so only the first is kept.
                first_result = next(iter(result), None)
                if first_result is None:
                    # Do not claim success when the model produced nothing.
                    print(f"No OCR result returned for {image_file}; nothing saved.")
                    continue

                first_result.save_to_img(save_target)
                first_result.save_to_json(save_target)
                print(f"Results saved to {base_name}.jpg and {base_name}.json")

            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"Error processing {image_file}: {str(e)}")
                print("Skipping OCR for this image.")

        print(f"\nProcessing complete! Check the '{output_dir}' folder for results.")

# JSON to Text Conversion Functions

In [None]:
def extract_text_from_json(json_file_path, line_tolerance=20):
    """
    Extract text from an OCR JSON file, ordered from top-left to bottom-right.

    The JSON is read for the keys ``rec_texts``, ``rec_boxes`` (each box is
    ``[x1, y1, x2, y2]``) and ``rec_scores``. Items are sorted by their top
    ``y`` coordinate and grouped into lines: an item joins the current line
    when its ``y`` is within ``line_tolerance`` of the first item of that
    line. Each line is then sorted left to right by ``x``.

    Args:
        json_file_path (str): Path to the OCR result JSON file.
        line_tolerance (int | float): Maximum vertical distance, in the same
            units as the box coordinates, for two items to share a line.

    Returns:
        list[dict]: One dict per text with keys ``text``, ``x``, ``y``,
        ``score`` and ``box``, in reading order. ``score`` is None when the
        file has no score for that text.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts = data.get('rec_texts') or []
    boxes = data.get('rec_boxes') or []
    scores = data.get('rec_scores') or []

    # Pair texts with boxes only; a missing or short score list must not
    # cause recognised text to be dropped.
    text_with_coords = []
    for idx, (text, box) in enumerate(zip(texts, boxes)):
        x1, y1, _x2, _y2 = box
        text_with_coords.append({
            'text': text,
            'x': x1,
            'y': y1,
            'score': scores[idx] if idx < len(scores) else None,
            'box': box
        })

    # Group into lines, top to bottom. The first item added to a line is its
    # anchor; later items are compared against that anchor's y.
    lines = []
    for item in sorted(text_with_coords, key=lambda entry: entry['y']):
        if lines and abs(item['y'] - lines[-1][0]['y']) <= line_tolerance:
            lines[-1].append(item)
        else:
            lines.append([item])

    # Order each line left to right, then flatten.
    sorted_items = []
    for line in lines:
        sorted_items.extend(sorted(line, key=lambda entry: entry['x']))
    return sorted_items

def convert_json_to_txt(json_file_path, output_txt_path=None):
    """
    Write the texts of an OCR JSON file to a text file, one text per line.

    Args:
        json_file_path (str): Path to the OCR result JSON file.
        output_txt_path (str, optional): Destination text file. When omitted,
            the file is written next to the JSON, named after it with the
            ``.json`` extension and a trailing ``_res`` removed.

    Returns:
        str: The path of the text file that was written.
    """
    if output_txt_path is None:
        folder, file_name = os.path.split(json_file_path)
        stem = os.path.splitext(file_name)[0]
        if stem.endswith('_res'):
            stem = stem[:-len('_res')]
        output_txt_path = os.path.join(folder, f"{stem}.txt")

    # Texts come back already sorted in reading order.
    ordered_items = extract_text_from_json(json_file_path)

    with open(output_txt_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f"{entry['text']}\n" for entry in ordered_items)

    return output_txt_path

# Run OCR on preprocessed images

In [None]:
def run_ocr_on_preprocessed_images(preprocessed_data):
    """
    Run OCR on preprocessed images, then convert the JSON results to text files.

    OCR output is written to the module-level ``output_path`` directory and
    the text files to the module-level ``txt_path`` directory.

    Args:
        preprocessed_data (list): Either a list of image file paths, or a list
            of dicts with the keys ``image_data``, ``method`` and
            ``original_name``. The mode is decided from the first element
            only. ``None`` is treated like an empty list.

    Returns:
        txt_output_dir (str): Path to directory containing OCR text results
    """
    print("üîç Starting OCR on Preprocessed Images...")
    print("=" * 60)

    # Create OCR output directory using global paths
    ocr_output_dir = output_path
    txt_output_dir = txt_path
    os.makedirs(ocr_output_dir, exist_ok=True)
    os.makedirs(txt_output_dir, exist_ok=True)

    # Accept None so a failed preprocessing step does not crash at len().
    preprocessed_data = preprocessed_data or []
    print(f"üìÅ Processing {len(preprocessed_data)} preprocessed images")

    if not preprocessed_data:
        print("‚ùå No preprocessed images provided!")
        return txt_output_dir

    # Build the model only once there is work to do.
    ocr_model = PaddleOCRmodel(lang="korean")

    # Check data format
    is_image_data = isinstance(preprocessed_data[0], dict) and 'image_data' in preprocessed_data[0]
    print(f"üîß Processing mode: {'In-memory image data' if is_image_data else 'File paths'}")

    # Process images in batches
    batch_size = 10
    total_batches = (len(preprocessed_data) + batch_size - 1) // batch_size

    for batch_idx in range(total_batches):
        start_idx = batch_idx * batch_size
        batch_data = preprocessed_data[start_idx:start_idx + batch_size]

        print(f"\nüì¶ Processing batch {batch_idx + 1}/{total_batches} ({len(batch_data)} images)")

        try:
            if is_image_data:
                # Process in-memory image data
                for data_item in batch_data:
                    image_data = data_item['image_data']
                    method = data_item['method']
                    original_name = data_item['original_name']

                    print(f"\nProcessing in-memory: {original_name}_{method}")

                    try:
                        # The image data is handed to PaddleOCR as-is.
                        result = ocr_model.ocr.predict(input=image_data)

                        # Same naming scheme as the preprocessed files on disk.
                        base_name = f"{original_name}_experiment_{method}"
                        save_target = os.path.join(ocr_output_dir, base_name)

                        # One result per image is assumed, so only the first is kept.
                        first_result = next(iter(result), None)
                        if first_result is None:
                            print(f"No OCR result returned for {base_name}; nothing saved.")
                            continue

                        first_result.save_to_img(save_target)
                        first_result.save_to_json(save_target)
                        print(f"Results saved to {base_name}.jpg and {base_name}.json")

                    except Exception as e:
                        print(f"Error processing in-memory image {original_name}_{method}: {str(e)}")
                        continue
            else:
                # Process file paths (original method)
                ocr_model.run_ocr_and_save_results(batch_data, ocr_output_dir, txt_output_dir)

        except Exception as e:
            # A failing batch is reported and the remaining batches still run.
            print(f"‚ùå Error processing batch {batch_idx + 1}: {str(e)}")
            continue

    # Convert JSON results to text files
    print("\nüìù Converting OCR results to text files...")

    # NOTE: this picks up every JSON file under the output directory,
    # including files left over from earlier runs.
    json_files = glob.glob(os.path.join(ocr_output_dir, "**", "*.json"), recursive=True)
    print(f"Found {len(json_files)} JSON files to convert")

    converted_count = 0
    for json_file in json_files:
        try:
            base_name = os.path.splitext(os.path.basename(json_file))[0]
            if base_name.endswith('_res'):
                base_name = base_name[:-4]

            txt_output_path = os.path.join(txt_output_dir, f"{base_name}.txt")
            convert_json_to_txt(json_file, txt_output_path)
            converted_count += 1

        except Exception as e:
            print(f"‚ùå Error converting {json_file}: {str(e)}")

    print(f"\nüéâ OCR PROCESSING COMPLETE!")
    print(f"üìä Images processed: {len(preprocessed_data)}")
    print(f"üìù Text files created: {converted_count}")
    print(f"üìÅ OCR results saved in: {ocr_output_dir}")
    print(f"üìÅ Text results saved in: {txt_output_dir}")

    return txt_output_dir

# Preprocessing functions

In [None]:
def run_selected_preprocessing(method_names=None, return_image_data=False):
    """
    Apply preprocessing methods to every image found under the global ``image_path``.

    Images are searched recursively. Each result is written below the global
    ``preproc_path`` as ``<folder>/<image>/<image>_experiment_<method>.jpg``.

    Args:
        method_names (list or None): List of method names to use. If None, use all available methods.
            Names that are not known are reported and ignored.
        return_image_data (bool): If True, return image data in memory (the
            images are still written to disk). If False, return file paths.

    Returns:
        preprocessed_images (list): In file mode, the paths of the files
            created. In in-memory mode, dicts with the keys ``image_data``,
            ``method``, ``original_name``, ``file_path`` and ``folder``.
            An empty list when no valid method was selected or no image was found.
    """
    print("üöÄ Starting Selected Preprocessing Pipeline...")
    print("=" * 60)

    # Use global path for output directory
    output_base = preproc_path
    os.makedirs(output_base, exist_ok=True)

    def save_result(processed_img, destination):
        """Write processed_img to destination when one is given; return the image."""
        if destination:
            cv2.imwrite(destination, processed_img)
        return processed_img

    # All available preprocessing methods. Each entry takes (image path,
    # output path or None). The caller passes None as the output path in
    # in-memory mode, so one table serves both modes.
    all_methods = {
        'gaussian_blur': lambda img, out: utils.experiment_gaussian_blur(img, out, kernel_size=5),
        'median_blur': lambda img, out: utils.experiment_median_blur(img, out, kernel_size=5),
        'bilateral_filter': lambda img, out: utils.experiment_bilateral_filter(img, out),
        'histogram_equalization': lambda img, out: utils.experiment_histogram_equalization(img, out),
        'clahe': lambda img, out: utils.experiment_clahe(img, out),
        'gamma_bright': lambda img, out: utils.experiment_gamma_correction(img, out, gamma=0.7),
        'gamma_dark': lambda img, out: utils.experiment_gamma_correction(img, out, gamma=1.3),
        'unsharp_masking': lambda img, out: utils.experiment_unsharp_masking(img, out),
        'otsu_threshold': lambda img, out: utils.experiment_otsu_threshold(img, out),
        'adaptive_gaussian': lambda img, out: utils.experiment_adaptive_threshold(img, out, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 11, 2),
        'adaptive_mean': lambda img, out: utils.experiment_adaptive_threshold(img, out, cv2.ADAPTIVE_THRESH_MEAN_C, 11, 2),
        'morphology_opening': lambda img, out: utils.experiment_morphology(img, out, 'opening', 3, 1),
        'morphology_closing': lambda img, out: utils.experiment_morphology(img, out, 'closing', 3, 1),
        'edge_laplacian': lambda img, out: utils.experiment_edge_enhancement(img, out, 'laplacian'),
        'edge_sobel': lambda img, out: utils.experiment_edge_enhancement(img, out, 'sobel_combined'),
        # These two helpers take no output path, so the result is saved here.
        'canny_edge': lambda img, out: save_result(utils.experiment_canny_edge(img, 100, 200), out),
        'deskew': lambda img, out: save_result(utils.experiment_deskew(img), out)
    }

    if method_names is None:
        preprocessing_methods = all_methods
    else:
        # Surface typos instead of silently running fewer methods.
        unknown_names = [name for name in method_names if name not in all_methods]
        if unknown_names:
            print(f"Unknown preprocessing methods ignored: {unknown_names}")
        preprocessing_methods = {k: v for k, v in all_methods.items() if k in method_names}
        if not preprocessing_methods:
            print("‚ùå No valid preprocessing methods selected!")
            # Same return type as the "no images" case below.
            return []

    # Get all images from the entire image_path directory
    image_base_path = image_path.replace("**/", "").replace("**", "")

    print(f"\nüìÅ Searching for images in: {image_base_path}")

    # Find all image files recursively in the entire image directory
    image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff']
    all_images = []
    for ext in image_extensions:
        all_images.extend(glob.glob(os.path.join(image_base_path, "**", ext), recursive=True))
        all_images.extend(glob.glob(os.path.join(image_base_path, "**", ext.upper()), recursive=True))
    # On case-insensitive filesystems both patterns match the same file;
    # drop duplicates while keeping the discovery order.
    all_images = list(dict.fromkeys(all_images))

    if not all_images:
        print(f"   ‚ùå No images found in {image_base_path}")
        return []

    print(f"   üìã Found {len(all_images)} total images to process")
    print(f"   üöÄ Mode: {'In-memory processing' if return_image_data else 'File-based processing'}")

    total_processed = 0
    all_preprocessed_images = []  # List to collect all processed image data/paths

    # Process each image with all preprocessing methods
    for img_idx, image_file_path in enumerate(all_images, 1):
        image_name = Path(image_file_path).stem
        relative_path = os.path.relpath(image_file_path, image_base_path)
        folder_structure = os.path.dirname(relative_path)

        print(f"   üñºÔ∏è  Processing image {img_idx}/{len(all_images)}: {image_name} (from {folder_structure})")

        # Create output folder maintaining the original folder structure
        if folder_structure:
            image_output_dir = os.path.join(output_base, folder_structure, image_name)
        else:
            image_output_dir = os.path.join(output_base, image_name)
        os.makedirs(image_output_dir, exist_ok=True)

        # Apply each preprocessing method
        methods_processed = 0
        for method_name, method_func in preprocessing_methods.items():
            try:
                output_filename = f"{image_name}_experiment_{method_name}.jpg"
                # Named output_file so it does not shadow the global output_path.
                output_file = os.path.join(image_output_dir, output_filename)

                # In in-memory mode the method gets no output path; the image
                # is written below instead.
                processed_image = method_func(image_file_path, output_file if not return_image_data else None)

                if return_image_data:
                    # For in-memory processing, save to disk and store image data
                    if processed_image is not None:
                        cv2.imwrite(output_file, processed_image)  # Still save to disk
                        # Store image data with metadata
                        all_preprocessed_images.append({
                            'image_data': processed_image,
                            'method': method_name,
                            'original_name': image_name,
                            'file_path': output_file,  # Keep for reference
                            'folder': folder_structure
                        })
                        methods_processed += 1
                    else:
                        print(f"      ‚ùå Failed to process {method_name}")
                else:
                    # Verify output was created for file-based processing
                    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
                        methods_processed += 1
                        all_preprocessed_images.append(output_file)
                    else:
                        print(f"      ‚ùå Failed to create output for {method_name}")

            except Exception as e:
                # One failing method must not stop the others.
                print(f"      ‚ùå Error with {method_name}: {str(e)}")
                continue
        print(f"      ‚úÖ Applied {methods_processed}/{len(preprocessing_methods)} methods")
        total_processed += methods_processed

    print(f"\nüéâ SELECTED PREPROCESSING COMPLETE!")
    print(f"üìä Total operations: {total_processed}")
    print(f"üìÅ Results saved in: {output_base}")
    if return_image_data:
        print(f"üìã Total preprocessed images in memory: {len(all_preprocessed_images)}")
    else:
        print(f"üìã Total preprocessed image files: {len(all_preprocessed_images)}")

    return all_preprocessed_images  # Return list of preprocessed image data or paths

# Fixed Evaluation System

In [None]:
def run_evaluation(eval_script=None, ocr_txt_dir=None, gt_dir=None, results_dir="."):
    """
    Score every OCR text file against its ground truth and save the results as CSV.

    OCR files named ``<base>_experiment_<method>.txt`` are matched to ground
    truth files named ``<base>.txt.text``. The evaluation script is run once
    per pair and these lines of its standard output are parsed:
    ``TOTAL ERROR SCORE:``, ``NO. OF CHARS :`` and ``NO. OF LINES :``.
    Accuracy is ``(total_chars - total_error) / total_chars * 100``.

    Args:
        eval_script (str, optional): Path to the evaluation script. Defaults
            to the project's ``ocr_eval_20250903.py``.
        ocr_txt_dir (str, optional): Folder with the OCR text files. Defaults
            to the global ``txt_path``.
        gt_dir (str, optional): Folder searched recursively for ground truth
            files. Defaults to the global ``groundtruth_path``.
        results_dir (str): Folder receiving ``evaluation_results.csv`` and
            ``evaluation_summary.csv``. Defaults to the working directory.

    Returns:
        tuple | None: ``(df, summary)`` — the per-file DataFrame and the
        per-method summary — or None when the script is missing or no file
        could be evaluated.
    """
    # Local import so this function does not depend on edits to the import cell.
    import sys

    # Find evaluation script
    if eval_script is None:
        eval_script = "/home/phuc/Desktop/Project/Chung-Innovation/ocr_eval_20250903.py"
    if not os.path.exists(eval_script):
        print("Evaluation script not found!")
        return None

    if ocr_txt_dir is None:
        ocr_txt_dir = txt_path
    if gt_dir is None:
        gt_dir = groundtruth_path

    # Find OCR text files (sorted so the CSV row order is reproducible)
    ocr_txt_files = sorted(glob.glob(os.path.join(ocr_txt_dir, "*experiment*.txt")))
    print(f"Found {len(ocr_txt_files)} OCR text files for evaluation")

    # Find ground truth files, keyed by file name without the suffix
    gt_suffix = '.txt.text'
    gt_files = glob.glob(os.path.join(gt_dir, "**", "*" + gt_suffix), recursive=True)
    gt_dict = {Path(f).name[:-len(gt_suffix)]: f for f in gt_files if f.endswith(gt_suffix)}
    print(f"Found {len(gt_dict)} ground truth text files for evaluation")

    # Process all files
    results = []
    for ocr_file in ocr_txt_files:
        filename = Path(ocr_file).name
        if "_experiment_" not in filename:
            continue

        base_name = filename.split("_experiment_")[0]
        method_name = filename.split("_experiment_")[1].replace(".txt", "")

        if base_name not in gt_dict:
            print(f"No ground truth found for {filename}; skipped.")
            continue

        try:
            # Run the script with the interpreter running this notebook,
            # not whatever 'python' happens to be on PATH.
            cmd = [sys.executable or 'python', eval_script, gt_dict[base_name], ocr_file]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

            if result.returncode != 0:
                print(f"Evaluation failed for {filename} (exit code {result.returncode}); skipped.")
                continue

            # Parse evaluation output
            metrics = {"file_name": filename, "method": method_name, "base_image": base_name}

            for line in result.stdout.splitlines():
                line = line.strip()
                if line.startswith("TOTAL ERROR SCORE:"):
                    metrics['total_error'] = int(line.split(":")[1].strip())
                elif line.startswith("NO. OF CHARS :"):
                    metrics['total_chars'] = int(line.split(":")[1].split(",")[0].strip())
                elif line.startswith("NO. OF LINES :"):
                    metrics['total_lines'] = int(line.split(":")[1].split(",")[0].strip())

            # Calculate accuracy
            if metrics.get('total_chars', 0) > 0:
                correct_chars = metrics['total_chars'] - metrics.get('total_error', 0)
                metrics['accuracy'] = (correct_chars / metrics['total_chars']) * 100
            else:
                metrics['accuracy'] = 0.0

            results.append(metrics)

        except Exception as e:
            # Keep going, but say which file was lost and why.
            print(f"Error evaluating {filename}: {str(e)}")

    # Save to CSV
    if results:
        os.makedirs(results_dir, exist_ok=True)
        df = pd.DataFrame(results)
        df.to_csv(os.path.join(results_dir, 'evaluation_results.csv'), index=False)

        # Create summary by method
        summary = df.groupby('method').agg({
            'accuracy': ['count', 'mean', 'max', 'min', 'std']
        }).round(2)
        summary.columns = ['count', 'avg_accuracy', 'max_accuracy', 'min_accuracy', 'std_accuracy']
        summary.to_csv(os.path.join(results_dir, 'evaluation_summary.csv'))

        return df, summary

    return None

# Main Execution Pipeline

In [None]:
# Preprocessing methods to compare. Each name must match a method known to
# run_selected_preprocessing.
selected_methods = ['adaptive_gaussian', 'adaptive_mean', 'canny_edge', 'gamma_bright', 'gamma_dark', 'histogram_equalization', 'unsharp_masking']

# Pass the selection explicitly; without method_names every method is run.
# "or []" keeps len() below safe if the call returns None.
selected_images = run_selected_preprocessing(method_names=selected_methods) or []

# To run every available method instead, pass method_names=None:
# all_images = run_selected_preprocessing(method_names=None)

print(f"‚úÖ Selected preprocessing complete!")
print(f"üìã Generated {len(selected_images)} preprocessed images")
print(f"üîÑ Ready for OCR processing...")

# Run OCR on selected preprocessed images
if selected_images:
    ocr_results_dir = run_ocr_on_preprocessed_images(selected_images)


üöÄ Starting Selected Preprocessing Pipeline...

üìÅ Searching for images in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images
   üìã Found 36 total images to process
   üöÄ Mode: File-based processing
   üñºÔ∏è  Processing image 1/36: DLSH5BI42N2_19 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 2/36: DLDR5BP24BK_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 6/36: DLDR5BP27N2_14 (from )

üöÄ Starting Selected Preprocessing Pipeline...

üìÅ Searching for images in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images
   üìã Found 36 total images to process
   üöÄ Mode: File-based processing
   üñºÔ∏è  Processing image 1/36: DLSH5BI42N2_19 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 2/36: DLDR5BP24BK_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 6/36: DLDR5BP27N2_14 (from )

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m


üöÄ Starting Selected Preprocessing Pipeline...

üìÅ Searching for images in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images
   üìã Found 36 total images to process
   üöÄ Mode: File-based processing
   üñºÔ∏è  Processing image 1/36: DLSH5BI42N2_19 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 2/36: DLDR5BP24BK_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 6/36: DLDR5BP27N2_14 (from )

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m


      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 35/36: DLJA5BI41K1_20 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 36/36: DLTS5B393N2_07 (from )
      ‚úÖ Applied 17/17 methods

üéâ SELECTED PREPROCESSING COMPLETE!
üìä Total operations: 612
üìÅ Results saved in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_preprocessing
üìã Total preprocessed image files: 612
‚úÖ Selected preprocessing complete!
üìã Generated 612 preprocessed images
üîÑ Ready for OCR processing...
üîç Starting OCR on Preprocessed Images...


üöÄ Starting Selected Preprocessing Pipeline...

üìÅ Searching for images in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images
   üìã Found 36 total images to process
   üöÄ Mode: File-based processing
   üñºÔ∏è  Processing image 1/36: DLSH5BI42N2_19 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 2/36: DLDR5BP24BK_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 6/36: DLDR5BP27N2_14 (from )

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m


      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 35/36: DLJA5BI41K1_20 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 36/36: DLTS5B393N2_07 (from )
      ‚úÖ Applied 17/17 methods

üéâ SELECTED PREPROCESSING COMPLETE!
üìä Total operations: 612
üìÅ Results saved in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_preprocessing
üìã Total preprocessed image files: 612
‚úÖ Selected preprocessing complete!
üìã Generated 612 preprocessed images
üîÑ Ready for OCR processing...
üîç Starting OCR on Preprocessed Images...


[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m


üöÄ Starting Selected Preprocessing Pipeline...

üìÅ Searching for images in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images
   üìã Found 36 total images to process
   üöÄ Mode: File-based processing
   üñºÔ∏è  Processing image 1/36: DLSH5BI42N2_19 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 2/36: DLDR5BP24BK_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 3/36: DLPA5BP22T3_13 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 4/36: HSDR5B305N2_15 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 5/36: DLTS5BP23I1_14 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 6/36: DLDR5BP27N2_14 (from )

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m


      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 35/36: DLJA5BI41K1_20 (from )
      ‚úÖ Applied 17/17 methods
   üñºÔ∏è  Processing image 36/36: DLTS5B393N2_07 (from )
      ‚úÖ Applied 17/17 methods

üéâ SELECTED PREPROCESSING COMPLETE!
üìä Total operations: 612
üìÅ Results saved in: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_preprocessing
üìã Total preprocessed image files: 612
‚úÖ Selected preprocessing complete!
üìã Generated 612 preprocessed images
üîÑ Ready for OCR processing...
üîç Starting OCR on Preprocessed Images...


[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m


üìÅ Processing 612 preprocessed images
üîß Processing mode: File paths

üì¶ Processing batch 1/62 (10 images)
üó®Ô∏è Processing 10 image files for OCR.

Processing: 1/10
Results saved to DLSH5BI42N2_19_experiment_gaussian_blur.jpg and DLSH5BI42N2_19_experiment_gaussian_blur.json

Processing: 2/10
Results saved to DLSH5BI42N2_19_experiment_gaussian_blur.jpg and DLSH5BI42N2_19_experiment_gaussian_blur.json

Processing: 2/10
Results saved to DLSH5BI42N2_19_experiment_median_blur.jpg and DLSH5BI42N2_19_experiment_median_blur.json

Processing: 3/10
Results saved to DLSH5BI42N2_19_experiment_median_blur.jpg and DLSH5BI42N2_19_experiment_median_blur.json

Processing: 3/10
Results saved to DLSH5BI42N2_19_experiment_bilateral_filter.jpg and DLSH5BI42N2_19_experiment_bilateral_filter.json

Processing: 4/10
Results saved to DLSH5BI42N2_19_experiment_bilateral_filter.jpg and DLSH5BI42N2_19_experiment_bilateral_filter.json

Processing: 4/10
Results saved to DLSH5BI42N2_19_experiment_histogram_e

In [None]:
# Step 1: turn every OCR JSON result found (recursively) under output_path
# into a plain-text file stored flat inside txt_path.
json_pattern = os.path.join(output_path, "**", "*.json")
json_files = glob.glob(json_pattern, recursive=True)
print(f"Converting {len(json_files)} JSON files to text...")

converted_count = 0
for json_file in json_files:
    try:
        # File name without directory and extension; a trailing "_res"
        # (4 characters) is dropped so the text file carries the bare name.
        base_name = os.path.basename(json_file)
        base_name = os.path.splitext(base_name)[0]
        base_name = base_name[:-4] if base_name.endswith('_res') else base_name

        # Target: <txt_path>/<base_name>.txt
        txt_output_path = os.path.join(txt_path, f"{base_name}.txt")

        convert_json_to_txt(json_file, txt_output_path)
    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest.
        print(f"‚ùå Error converting {json_file}: {str(e)}")
    else:
        # Only files converted without an exception are counted.
        converted_count += 1

print(f"‚úÖ Converted {converted_count} JSON files to text in {txt_path}")

# Step 2: score the generated text files.
print("üîÑ RUNNING EVALUATION...")


results = run_evaluation()

# run_evaluation() is treated as failed when it returns None; otherwise its
# return value is unpacked into the per-file frame and the summary.
if results is None:
    print("‚ùå Evaluation failed")
else:
    df, summary = results
    print(f"‚úÖ Evaluation complete! {len(df)} results saved to CSV")

Converting 612 JSON files to text...
‚úÖ Converted 612 JSON files to text in /home/phuc/Desktop/Project/Chung-Innovation/bill_test/baseline_bill_txt/
üîÑ RUNNING EVALUATION...
Found 612 OCR text files for evaluation
Found 3071 ground truth text files for evaluation
‚úÖ Evaluation complete! 612 results saved to CSV
‚úÖ Evaluation complete! 612 results saved to CSV


# Results Analysis

In [None]:
# RESULTS ANALYSIS
def format_method_rankings(ranked, min_width=20):
    """Format a ranking table as aligned text lines, one per method.

    Args:
        ranked (pd.DataFrame): Indexed by method name, with the columns
            'avg_accuracy' and 'count'. Rows are emitted in the given order,
            so the caller sorts the frame beforehand.
        min_width (int): Minimum width of the method-name column.

    Returns:
        list[str]: One line per row, numbered from 1. Empty for an empty frame.
    """
    # Size the name column from the longest method name: a fixed width of 20
    # pushed longer names (e.g. "histogram_equalization") out of alignment.
    name_width = max([min_width] + [len(str(method)) for method in ranked.index])
    return [
        f"{rank:2d}. {str(method):<{name_width}} "
        f"{row['avg_accuracy']:>8.1f}% [{row['count']:.0f} files]"
        for rank, (method, row) in enumerate(ranked.iterrows(), 1)
    ]


try:
    df = pd.read_csv('evaluation_results.csv')
    summary = pd.read_csv('evaluation_summary.csv', index_col=0)

    print("üìä Evaluation Results:")
    print(f"   Total files: {len(df)}")
    # NOTE(review): the recorded run reports a negative average, so the
    # accuracy metric does not appear to be bounded to 0-100 -- confirm in
    # run_evaluation before drawing conclusions from the mean.
    print(f"   Average accuracy: {df['accuracy'].mean():.1f}%")
    print(f"   Best accuracy: {df['accuracy'].max():.1f}%")

    print("\nüèÜ Method Rankings:")
    top_methods = summary.sort_values('avg_accuracy', ascending=False)

    for ranking_line in format_method_rankings(top_methods):
        print(ranking_line)

except FileNotFoundError as e:
    # Only a missing CSV means the evaluation has not been run yet.
    print(f"‚ùå Error reading results: {str(e)}")
    print("Please run the evaluation first.")
except Exception as e:
    # Anything else (missing column, malformed CSV, ...) is a different
    # problem; name the exception type instead of giving the wrong hint.
    print(f"‚ùå Error reading results: {type(e).__name__}: {str(e)}")
    print("The result files could not be analysed; check their contents.")

üìä Evaluation Results:
   Total files: 612
   Average accuracy: -455.3%
   Best accuracy: 99.1%

üèÜ Method Rankings:
 1. gamma_bright         34.7% [36 files]
 2. clahe                28.2% [36 files]
 3. gamma_dark           25.1% [36 files]
 4. unsharp_masking      12.9% [36 files]
 5. gaussian_blur        -106.3% [36 files]
 6. histogram_equalization -137.0% [36 files]
 7. deskew               -340.1% [36 files]
 8. bilateral_filter     -363.9% [36 files]
 9. morphology_opening   -411.2% [36 files]
10. otsu_threshold       -423.1% [36 files]
11. morphology_closing   -516.0% [36 files]
12. adaptive_gaussian    -618.3% [36 files]
13. edge_sobel           -690.8% [36 files]
14. edge_laplacian       -801.2% [36 files]
15. adaptive_mean        -921.3% [36 files]
16. canny_edge           -1122.5% [36 files]
17. median_blur          -1389.3% [36 files]


# Run OCR without preprocessing (baseline)

In [None]:
# Baseline run: OCR on the raw images, with no preprocessing applied.
image = glob.glob(os.path.join(image_path, "**", "*.jpg"), recursive=True)

# Results are written below output_path; the configuration cell only creates
# preproc_path and txt_path, so make sure this directory exists as well.
os.makedirs(output_path, exist_ok=True)

ocr = PaddleOCRmodel(lang="korean")

# Images that could not be processed; summarised after the loop.
failed_images = []

for img in image:
    base_name = os.path.splitext(os.path.basename(img))[0]
    output_base_name = f"{base_name}"
    save_target = os.path.join(output_path, output_base_name)

    try:
        result = ocr.ocr.predict(input=img)
        # Only the first result is kept (one result per image is assumed).
        res = next(iter(result), None)
        if res is None:
            raise ValueError("OCR returned no result")
        res.save_to_img(save_target)
        res.save_to_json(save_target)
    except Exception as e:
        # The recorded run of this cell stopped on its first file with
        # "Exception: Image read Error". Record the failure and keep going so
        # a single unreadable image does not abort the whole baseline run.
        failed_images.append(img)
        print(f"Skipped {img}: {e}")
        continue

    print(f"Results saved to {output_base_name}.jpg and {output_base_name}.json")

print(
    f"Baseline OCR done: {len(image) - len(failed_images)}/{len(image)} images processed, "
    f"{len(failed_images)} skipped"
)

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m
[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m


[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m
[32mCreating model: ('korean_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/phuc/.paddlex/official_models/korean_PP-OCRv5_mobile_rec`.[0m


Exception: Image read Error: /home/phuc/Desktop/Project/Chung-Innovation/bill_test/bill_images/DLSH5BI42N2_19.jpg

In [None]:
# Temporary: Remove "baseline_bill_" prefix from output files
# (os and glob come from the setup cell at the top of the notebook.)

# Longest prefix first, so "baseline_bill_output" is removed as a whole
# instead of leaving a stray "output" behind.
PREFIXES_TO_STRIP = ("baseline_bill_output", "baseline_bill_")

# Match only entries whose name STARTS with the prefix, in line with the
# message printed below.
files_to_rename = glob.glob(os.path.join(output_path, "baseline_bill_*"))
print(f"Found {len(files_to_rename)} files with baseline_bill_ prefix")

for old_file in files_to_rename:
    old_name = os.path.basename(old_file)

    # Strip the leading prefix only; occurrences elsewhere in the name stay.
    new_name = old_name
    for prefix in PREFIXES_TO_STRIP:
        if new_name.startswith(prefix):
            new_name = new_name[len(prefix):]
            break

    if not new_name:
        # The entry is named exactly like a prefix; there is nothing to rename to.
        print(f"Skipped {old_name}: nothing left after removing the prefix")
        continue

    new_file = os.path.join(output_path, new_name)
    if os.path.lexists(new_file):
        # os.rename can silently replace an existing target on POSIX;
        # never clobber results that are already in place.
        print(f"Skipped {old_name}: {new_name} already exists")
        continue

    try:
        os.rename(old_file, new_file)
        print(f"Renamed: {old_name} -> {new_name}")
    except OSError as e:
        print(f"Error renaming {old_name}: {e}")

print("‚úÖ File renaming complete!")

Found 612 files with baseline_bill_ prefix
Renamed: baseline_bill_outputHSDR5A102B2_12_experiment_median_blur -> HSDR5A102B2_12_experiment_median_blur
Renamed: baseline_bill_outputDLDR5BP24BK_14_experiment_adaptive_mean -> DLDR5BP24BK_14_experiment_adaptive_mean
Renamed: baseline_bill_outputDLTS5BP24B1_13_experiment_edge_laplacian -> DLTS5BP24B1_13_experiment_edge_laplacian
Renamed: baseline_bill_outputHSDR5B305N2_14_experiment_median_blur -> HSDR5B305N2_14_experiment_median_blur
Renamed: baseline_bill_outputDLPA5BP22BK_10_experiment_unsharp_masking -> DLPA5BP22BK_10_experiment_unsharp_masking
Renamed: baseline_bill_outputHSDR5B203I1_12_experiment_morphology_opening -> HSDR5B203I1_12_experiment_morphology_opening
Renamed: baseline_bill_outputHSDR5B202I2_11_experiment_bilateral_filter -> HSDR5B202I2_11_experiment_bilateral_filter
Renamed: baseline_bill_outputHSDR5B402T2_08_experiment_gamma_dark -> HSDR5B402T2_08_experiment_gamma_dark
Renamed: baseline_bill_outputDLPA5BP22BK_10_experimen