In [6]:
# Enhanced OCR: Batch Processing and Save to CSV
# Make sure required packages are installed:
# pip install pytesseract opencv-python pillow pandas numpy

import cv2
import pytesseract
from pytesseract import Output
from PIL import Image
import os
import pandas as pd

# --- Step 0: Set Tesseract path if needed ---
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# --- Step 1: Preprocess Image ---
def preprocess_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)
    processed = cv2.medianBlur(thresh, 3)
    return processed

# --- Step 2: OCR Extraction ---
def extract_text(image_path):
    processed_img = preprocess_image(image_path)
    pil_img = Image.fromarray(processed_img)
    text = pytesseract.image_to_string(pil_img, lang='eng', config='--psm 6')
    return text.strip()

# --- Step 3: Batch Processing and Save ---
def batch_ocr(dataset_folder, output_csv="ocr_results.csv"):
    data_list = []

    for filename in os.listdir(dataset_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(dataset_folder, filename)
            print(f"Processing: {filename}")
            text = extract_text(image_path)
            data_list.append({"filename": filename, "extracted_text": text})

    # Save results to CSV
    df = pd.DataFrame(data_list)
    df.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"\nOCR results saved to {output_csv}")

# --- Step 4: Run on your IAM dataset folder ---
dataset_folder = "IAM_dataset_samples"  # Replace with your folder path
batch_ocr(dataset_folder)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'IAM_dataset_samples'