In [1]:
from doctr.models import detection

# Load the detection model
detector = detection.db_resnet50(pretrained=True)

# Load image
from PIL import Image
image = Image.open("image.jpg")

# Perform text detection
result = detector(image)

# Extract bounding boxes
bounding_boxes = result['boxes']
print("Bounding Boxes:", bounding_boxes)


ModuleNotFoundError: No module named 'doctr.models'

In [24]:
import cv2
import numpy as np

# Load the pre-trained EAST text detector model
MODEL_PATH = "frozen_east_text_detection.pb"
net = cv2.dnn.readNet(MODEL_PATH)

# Function to decode predictions and extract bounding boxes
def decode_predictions(scores, geometry, conf_threshold=0.5):
    (num_rows, num_cols) = scores.shape[2:4]
    boxes = []
    confidences = []

    for y in range(num_rows):
        for x in range(num_cols):
            # Confidence score
            score = scores[0, 0, y, x]
            if score < conf_threshold:
                continue

            # Geometry data
            offset_x = x * 4.0
            offset_y = y * 4.0

            angle = geometry[0, 4, y, x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            h = geometry[0, 0, y, x]
            w = geometry[0, 1, y, x]

            # Compute the bounding box
            start_x = int(offset_x - (cos * w + sin * h) / 2)
            start_y = int(offset_y - (sin * w - cos * h) / 2)
            end_x = int(offset_x + (cos * w + sin * h) / 2)
            end_y = int(offset_y + (sin * w - cos * h) / 2)

            boxes.append((start_x, start_y, end_x, end_y))
            confidences.append(float(score))

    return boxes, confidences

# Load and preprocess the image
def process_image(image_path):
    image = cv2.imread(image_path)
    orig = image.copy()
    (orig_h, orig_w) = image.shape[:2]

    # Resize image to dimensions divisible by 32
    (new_w, new_h) = (320, 320)
    r_w = orig_w / float(new_w)
    r_h = orig_h / float(new_h)
    image = cv2.resize(image, (new_w, new_h))

    # Preprocess image for the model
    blob = cv2.dnn.blobFromImage(
        image, 1.0, (new_w, new_h), (123.68, 116.78, 103.94), swapRB=True, crop=False
    )
    return orig, blob, r_w, r_h

# Draw bounding boxes on the image
def draw_boxes(orig, boxes, confidences, confidence_threshold, r_w, r_h):
    # Apply Non-Maximum Suppression (NMS) to filter overlapping boxes
    box_array = np.array(boxes)
    conf_array = np.array(confidences)
    indices = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, 0.4)

    for i in indices.flatten():
        (start_x, start_y, end_x, end_y) = boxes[i]

        # Rescale boxes to the original image dimensions
        start_x = int(start_x * r_w)
        start_y = int(start_y * r_h)
        end_x = int(end_x * r_w)
        end_y = int(end_y * r_h)

        # Draw the bounding box
        cv2.rectangle(orig, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)

        # Display confidence score
        text = f"{confidences[i]:.2f}"
        cv2.putText(
            orig, text, (start_x, start_y - 10),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2
        )

    return orig

# Main function to detect text and show bounding boxes
def detect_text(image_path):
    # Process the image
    orig, blob, r_w, r_h = process_image(image_path)

    # Set the input to the model
    net.setInput(blob)

    # Get the output layers
    output_layers = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
    (scores, geometry) = net.forward(output_layers)

    # Decode predictions
    boxes, confidences = decode_predictions(scores, geometry)

    # Draw bounding boxes
    confidence_threshold = 0.2
    orig = draw_boxes(orig, boxes, confidences, confidence_threshold, r_w, r_h)

    # Display the output
    cv2.imshow("Text Detection", orig)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Path to the input image
IMAGE_PATH = "./samples/16a13165-e5cb-47e1-9efd-5794c6c41a89-1403_10_11.jpeg"

# Run the text detection
detect_text(IMAGE_PATH)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is not the object's thread (0x33c47730).
Cannot move to target thread (0x332a0910)

QObject::moveToThread: Current thread (0x332a0910) is n

In [25]:
import cv2
import numpy as np
import pytesseract
from craft import Craft

# Initialize the CRAFT text detector
craft = Craft(craft_path='./craft_mlt_25k.pth', cuda=False)  # Set cuda=True if GPU is available

def detect_text_with_craft(image_path):
    # Step 1: Text Detection using CRAFT
    detections = craft.detect_text(image_path)
    image = cv2.imread(image_path)

    # Extract detected boxes
    boxes = detections['boxes']

    # Draw bounding boxes for visualization
    for box in boxes:
        box = np.int0(box)  # Convert float coordinates to integers
        cv2.polylines(image, [box], isClosed=True, color=(0, 255, 0), thickness=2)

    # Display the image with bounding boxes
    cv2.imshow("CRAFT Text Detection", image)
    cv2.waitKey(0)

    # Step 2: OCR (Text Recognition) with Tesseract
    recognized_texts = []
    for box in boxes:
        # Get the bounding box region
        x_min = int(min(box[:, 0]))
        y_min = int(min(box[:, 1]))
        x_max = int(max(box[:, 0]))
        y_max = int(max(box[:, 1]))

        # Crop the region and preprocess for Tesseract
        cropped_region = image[y_min:y_max, x_min:x_max]
        gray = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        # Perform OCR using Tesseract
        text = pytesseract.image_to_string(binary, lang="eng", config="--psm 6")
        recognized_texts.append(text)

    # Cleanup resources
    craft.unload_craftnet_model()

    return recognized_texts


# Test the pipeline
if __name__ == "__main__":
    image_path = "samples/sample_image.jpg"  # Path to your input image
    detected_texts = detect_text_with_craft(image_path)
    print("Recognized Texts:")
    for i, text in enumerate(detected_texts, 1):
        print(f"{i}. {text.strip()}")


ImportError: cannot import name 'model_urls' from 'torchvision.models.vgg' (/mnt/Users/UT/Desktop/anaconda4/lib/python3.12/site-packages/torchvision/models/vgg.py)

In [29]:
import easyocr
import cv2

# Initialize the EasyOCR reader
reader = easyocr.Reader(['en', 'fa'], gpu=True)  # Add more languages as needed, e.g., ['en', 'fr']

# Load the image
image_path = "./samples/16a13165-e5cb-47e1-9efd-5794c6c41a89-1403_10_11.jpeg"
image = cv2.imread(image_path)

# Perform text detection and recognition
results = reader.readtext(image_path, detect_algorithm='DB')

# Loop through the results
for (bbox, text, confidence) in results:
    # Extract the bounding box coordinates
    (top_left, top_right, bottom_right, bottom_left) = bbox
    top_left = tuple(map(int, top_left))
    bottom_right = tuple(map(int, bottom_right))

    # Draw the bounding box
    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    # Display the recognized text and confidence
    cv2.putText(
        image, f"{text} ({confidence:.2f})", (top_left[0], top_left[1] - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2
    )

# Display the output image with bounding boxes
cv2.imshow("Text Detection", image)
cv2.waitKey(0)
cv2.destroyAllWindows()

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


TypeError: Reader.readtext() got an unexpected keyword argument 'detect_algorithm'

In [None]:
from PIL import Image, ImageEnhance
import json
import easyocr
import os

def preprocess_image(image_path):
    try:
        # Open the image
        img = Image.open(image_path)

        # Convert to grayscale
        img = img.convert("L")

        # Enhance contrast
        enhancer = ImageEnhance.Contrast(img)
        img = enhancer.enhance(2.0)  # Increase contrast (factor 2.0)

        # Resize by 400%
        width, height = img.size
        img = img.resize((width * 4, height * 4), Image.Resampling.LANCZOS)

        return img
    except Exception as e:
        print(f"Error preprocessing {image_path}: {e}")
        return None


def easyocr1(image):
    temp_path = "temp_preprocessed_image.png"
    image.save(temp_path)

    # Initialize EasyOCR Reader
    reader = easyocr.Reader(['fa', 'en'], gpu=False)  # Add Persian (fa) and English (en) languages

    # Perform OCR using EasyOCR
    ocr_results = reader.readtext(temp_path, detail=1, paragraph=False)

    # Prepare results list
    easyocr_results = []
    for result in ocr_results:
        bbox, text, confidence = result

        # Prepare bounding box in terms of left, top, width, and height
        x1, y1 = bbox[0]
        x2, y2 = bbox[2]
        width = int(x2 - x1)
        height = int(bbox[2][1] - bbox[0][1])

        easyocr_results.append({
            "text": text,
            "confidence": float(confidence) * 100,
            "bounding_box": {
                "left": int(x1),
                "top": int(y1),
                "width": width,
                "height": height
            }
        })
    return easyocr_results


# Folder containing images
samples_folder = "./samples"
results_file = "./results.json"

# Initialize results dictionary
results = {}

# Process each image in the folder
for file_name in os.listdir(samples_folder):
    if file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
        image_path = os.path.join(samples_folder, file_name)
        print(f"Processing {file_name}...")
        
        preprocessed_image = preprocess_image(image_path)
        if preprocessed_image is not None:
            # Perform OCR and save results
            results[file_name] = easyocr1(preprocessed_image)

# Save the results as JSON
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print(f"OCR results saved to {results_file}")


Using CPU. Note: This module is much faster with a GPU.


Processing 64fbde6e-4d63-49ea-9770-a5f50bb32973-1403_10_11.jpeg...


Using CPU. Note: This module is much faster with a GPU.


Processing 30034b7a-7a44-4438-bc11-874d5cc40377-1403_10_11_fixed.jpeg...


Using CPU. Note: This module is much faster with a GPU.


Processing 4047eee8-89aa-4014-97dc-b395df9019ac-1403_10_11.jpeg...
