In [1]:
from paddleocr import PaddleOCR
import cv2
import time
import tracemalloc

# --- Ground Truth ---
GROUND_TRUTH = [
    "computer", "manufacturers", "supplying", "them,", "these", "computers", "became", "far", "more", "affordable", "than", "the", "Mac.",
    "It", "was", "as", "if", "Steve", "Jobs", "and", "his", "company", "had", "gone", "into", "a", "time", "capsule", "when", "they", "started", "Next.",
    "They", "worked", "hard", "for", "years,", "competing", "against", "what", "they", "thought", "was", "the", "competition,", "but", "by", "the", "time", "they", "emerged,", "the", "competition", "turned", "out", "to", "be", "something", "completely", "different", "and", "much", "more", "powerful.",
    "Although", "they", "were", "oblivious", "to", "it,", "Next", "found", "itself", "in", "the", "midst", "of", "a", "strategic", "inflection", "point.",
    "The", "Next", "machine", "never", "took", "off.", "In", "fact,", "despite", "ongoing", "infusions", "of", "investors’", "cash,", "Next", "was", "hemorrhaging", "money.",
    "They", "were", "trying", "to", "maintain", "an", "expensive", "computer", "development", "operation,", "in", "addition", "to", "a", "state-of-the-art", "software", "development", "operation,", "plus", "a", "fully", "automated", "factory", "built", "to", "produce", "a", "large", "volume", "of", "Next", "computers—a", "large", "volume", "that", "never", "materialized.",
    "By", "1991,", "about", "six", "years", "after", "its", "founding,", "Next", "was", "in", "financial", "difficulties.",
    "Some", "managers", "inside", "the", "company", "had", "advocated", "throwing", "in", "the", "towel", "in", "hardware", "and", "porting", "their", "crown", "jewel", "to", "mass-produced", "PCs.",
    "Jobs", "resisted", "this", "for", "a", "long", "time.", "He", "didn’t", "like", "PCs.", "He", "thought", "they", "were", "inelegant", "and", "poorly", "engineered,", "and", "the", "many", "players", "in", "the", "industry", "made", "any", "kind", "of", "uniformity", "hard", "to", "achieve.",
    "In", "short,", "Jobs", "thought", "PCs", "were", "a", "mess.", "The", "thing", "is,", "he", "was", "right.",
    "But", "what", "Jobs", "missed", "at", "the", "time", "was", "that", "the", "very", "messiness", "of", "the", "PC", "industry", "that", "he", "despised", "was", "the", "result", "of", "its", "power:", "many", "companies", "competing", "to", "offer", "better", "value", "to", "ever", "larger", "numbers", "of", "customers.",
    "Some", "of", "his", "managers", "got", "frustrated", "and", "quit,", "yet", "their", "idea", "continued", "to", "ferment.",
    "As", "Next’s", "funds", "grew", "lower", "and", "lower,", "Jobs", "finally", "accepted", "the", "inevitability", "of", "the", "inelegant,", "messy", "PC", "industry", "as", "his", "environment.",
    "He", "threw", "his", "weight", "behind", "the", "proposal", "he", "had", "fought.", "He", "shut", "down", "all", "hardware", "development", "and", "the", "spanking", "new", "automated", "factory,", "and", "laid", "off", "half", "of", "his", "staff."
] 

# --- Preprocess ---
image_path = "C:\hope\Text Extraction Benchmarking\WhatsApp Image 2025-07-05 at 12.07.52_e6cd4776.jpg"  # Your uploaded image
img = cv2.imread(image_path)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
sharpened = cv2.GaussianBlur(gray, (0, 0), 3)
sharpened = cv2.addWeighted(gray, 1.5, sharpened, -0.5, 0)

thresh = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)
cv2.imwrite("preprocessed_book.jpg", thresh)

# --- Track time and memory ---
start_time = time.time()
tracemalloc.start()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
ocr = PaddleOCR(use_angle_cls=True, lang='en')
results = ocr.ocr("preprocessed_book.jpg", cls=True)


  ocr = PaddleOCR(use_angle_cls=True, lang='en')
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\nimmy\.paddlex\official_models.[0m
Fetching 6 files: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 423.73it/s]
[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\nimmy\.paddlex\official_models.[0m
Fetching 6 files: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 320.97it/s]
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\nimmy\.paddlex\official_models.[0

TypeError: predict() got an unexpected keyword argument 'cls'

In [None]:
predicted_text = []
for line in results[0]:
    predicted_text.append(line[1][0])
predicted_words = " ".join(predicted_text).split()

In [None]:
correct_words = sum(1 for word in predicted_words if word in GROUND_TRUTH)
total_words = len(GROUND_TRUTH)
accuracy = (correct_words / total_words) * 100 if total_words else 0

In [None]:
end_time = time.time()
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

In [None]:
output = {
    "Tool": "PaddleOCR",
    "Language(s)": "English",
    "Free or Paid": "Free (Open-source)",
    "Text Extracted": " ".join(predicted_text),
    "Correct Words": f"{correct_words}/{total_words}",
    "Accuracy": f"{accuracy:.2f}%",
    "Time Taken": f"{end_time - start_time:.2f} seconds",
    "Memory Used": f"{peak / 10**6:.2f} MB"
}


In [None]:
print(output)