In [None]:
!pip install jiwer easyocr pytesseract paddleocr paddlepaddle google-generativeai
!apt install tesseract-ocr

Collecting paddleocr
  Downloading paddleocr-2.10.0-py3-none-any.whl.metadata (12 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-3.0.0-cp311-cp311-manylinux1_x86_64.whl.metadata (8.9 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt_einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddleocr-2.10.0-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [14]:
import os
import sys


# Detect if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "/content/drive/MyDrive/OCR evaluation"
else:
    base_dir = "path/to/your/local/project/folder" # add directory if running locally

data_dir = os.path.join(base_dir, "Data")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
with open(os.path.join(data_dir, "manual text.txt"), "r") as f:
  ground_truth = f.read()
img_path = os.path.join(data_dir, "download.png")

In [16]:
import re
from jiwer import wer, cer  # Make sure jiwer is installed: pip install jiwer

def ocr_evaluation(ground_truth, hypothesis):
    """
    Evaluate OCR output using Word Error Rate (WER) and Character Error Rate (CER).

    Parameters:
        ground_truth (str): The reference correct text.
        hypothesis (str): The OCR-generated text.

    Returns:
        dict: A dictionary with rounded WER and CER scores.
    """
    # Clean and normalise input
    ground_truth = re.sub(r'[^\w\s]', '', str(ground_truth)).lower().strip()
    hypothesis = re.sub(r'[^\w\s]', '', str(hypothesis)).lower().replace("\n", " ").strip()

    # Compute error rates
    error_wer = round(wer(ground_truth, hypothesis), 2)
    error_cer = round(cer(ground_truth, hypothesis), 2)

    return {
        "WER": error_wer,
        "CER": error_cer
    }


In [17]:
import easyocr
import pytesseract
from PIL import Image
import cv2
img = cv2.imread(img_path)

reader = easyocr.Reader(['en'])
results = reader.readtext(img_path)
easyocr_results = " ".join([result[1] for result in results])
print(f'easyocr complete, populated with {len(easyocr_results)} characters')
custom_config = r'--oem 1 --psm 6'
tesseract_results = pytesseract.image_to_string(img, config=custom_config)
print(f'tesseract complete, populated with {len(tesseract_results)} characters')



easyocr complete, populated with 5613 characters
tesseract complete, populated with 6851 characters


In [18]:
from paddleocr import PaddleOCR, draw_ocr
import numpy as np
from PIL import Image

# Load PaddleOCR model (only once)
ocr = PaddleOCR(use_angle_cls=True, lang="en")  # You can also set `det=False` if only running recognition

# Convert to NumPy array if input was a PIL or OpenCV image
img_np = np.array(img)

# Run OCR
result = ocr.ocr(img_np, cls=False)

# Extract and flatten results
txts = []
for block in result:
    for line in block:
        text = line[1][0]  # the actual string
        txts.append(text)

# Join all results into a clean string
paddleocr_result = " ".join(txts).strip().lower()
print(f'paddle complete with {len(paddleocr_result)} characters')


[2025/04/25 09:41:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_l

WER is word error rate. 0.75 means 3/4 of the words were incorrect.
CER is character error rate.
So, if you are new to this, you are looking for the lowest number.

In [None]:

print("EasyOCR Evaluation:")
print(ocr_evaluation(ground_truth, easyocr_results))

print("Tesseract Evaluation:")
print(ocr_evaluation(ground_truth, tesseract_results))

print("PaddleOCR Evaluation:")
print(ocr_evaluation(ground_truth, paddleocr_result))


EasyOCR Evaluation:
{'WER': 0.89, 'CER': 0.67}
Tesseract Evaluation:
{'WER': 0.69, 'CER': 0.43}
PaddleOCR Evaluation:
{'WER': 0.79, 'CER': 0.76}


Now that we have tried the most popular packages, we will move on to a LLM.
You could use the most effective package in concert with a LLM, but take care over chunking and truncated responses due to their limited tokens

In [None]:
key = #insert your api key here
import google.generativeai as genai
# Set your API key
genai.configure(api_key=key)
# Get the Gemini Pro model
model = genai.GenerativeModel("gemini-flash-2.0")

In [None]:
image = Image.open(img_path)
gemini_response = model.generate_content(["Extract the text of this image:", image])
gemini_result = gemini_response.text.lower().strip()


In [None]:
print("Gemini Evaluation:")
print(ocr_evaluation(ground_truth, gemini_result))

In [None]:
import pandas as pd
import os


# Ensure all results are lowercased, stripped strings
easyocr_results = easyocr_results.lower().strip()
tesseract_results = tesseract_results.lower().strip()
paddleocr_result = paddleocr_result.lower().strip()
gemini_output = gemini_result.text.lower().strip()

# Evaluate and collect scores
summary_data = {
    "Engine": ["EasyOCR", "Tesseract", "PaddleOCR", "Gemini"],
    "WER": [
        round(wer(ground_truth, easyocr_results), 2),
        round(wer(ground_truth, tesseract_results), 2),
        round(wer(ground_truth, paddleocr_result), 2),
        round(wer(ground_truth, gemini_output), 2)
    ],
    "CER": [
        round(cer(ground_truth, easyocr_results), 2),
        round(cer(ground_truth, tesseract_results), 2),
        round(cer(ground_truth, paddleocr_result), 2),
        round(cer(ground_truth, gemini_output), 2)
    ],
    "Char Count": [
        len(easyocr_results),
        len(tesseract_results),
        len(paddleocr_result),
        len(gemini_output)
    ]
}

# Create DataFrame
df_summary = pd.DataFrame(summary_data)

# Display and (optionally) save
print("🔍 Final OCR Comparison Summary:")
print(df_summary)

# Save to CSV if desired
df_summary.to_csv(os.path.join(output_dir, "ocr_evaluation_summary.csv"), index=False)
