In [61]:
import Levenshtein
import re

def compute_nld(s1, s2):
    """Compute the Normalized Levenshtein Distance (NLD) between two strings.

    Both inputs are stripped, lower-cased and have every run of whitespace
    collapsed to a single space before comparison, so the metric ignores
    case and spacing differences.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The edit distance divided by the length of the longer normalised
        string, or 0.0 when both normalised strings are empty (two empty
        strings are identical; the plain ratio would divide by zero).
    """
    s1 = re.sub(r'\s+', ' ', s1.strip().lower())
    s2 = re.sub(r'\s+', ' ', s2.strip().lower())

    longest = max(len(s1), len(s2))
    if longest == 0:
        # Nothing to compare: avoid ZeroDivisionError and report "no distance".
        return 0.0

    return Levenshtein.distance(s1, s2) / longest

def word_level_accuracy(gt, pred):
    """Compute bag-of-words accuracy and F1 between ground truth and prediction.

    Both strings are split on whitespace and compared case-sensitively,
    ignoring word order. Words are matched as a multiset: a word that occurs
    k times in the ground truth can be matched at most k times. Matching on
    plain sets while dividing by the list lengths would under-score any text
    that repeats a word, even for a perfect prediction.

    Args:
        gt: Ground-truth text.
        pred: Predicted (OCR) text.

    Returns:
        Tuple ``(accuracy, f1)`` where accuracy is the fraction of
        ground-truth words found in the prediction (i.e. recall) and f1 is
        the harmonic mean of precision and recall. Both are 0.0 when the
        corresponding denominator is empty.
    """
    from collections import Counter  # local import keeps this cell self-contained

    gt_words = gt.split()
    pred_words = pred.split()

    # Multiset intersection keeps min(count_gt, count_pred) for every word.
    matched = sum((Counter(gt_words) & Counter(pred_words)).values())

    accuracy = matched / len(gt_words) if gt_words else 0.0
    precision = matched / len(pred_words) if pred_words else 0.0
    recall = accuracy
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return accuracy, f1


In [62]:
# Evaluation samples as (image path, reference text) pairs. The evaluation
# loop unpacks each pair as (img_path, gt_text) and scores the OCR output
# against the second element.
# NOTE(review): the reference strings contain misspellings (e.g.
# "POSSIBBITIES"); presumably they transcribe the text as it is rendered in
# the image rather than the intended wording -- confirm with the data owner.
# Paths are relative, so the notebook must run from the directory that
# contains "Incorrect_Images/" and "GenAI_Dataset/".
dataset = [
    ("Incorrect_Images/Incorrect_SOTP_sign.jpg", "SOTP"),
    ("GenAI_Dataset/Imagen(Gemini)/Gemini_Generated_Image_1mqqb41mqqb41mqq.jpg", "LIMITLESS POSSIBBITIES"),
    ("Incorrect_Images/Incorrect_Happy_Birthday.png.webp", "HAPPP  Hanpdday Birthday"),
    ("GenAI_Dataset/Imagen(Gemini)/Gemini_Generated_Image_r8x05sr8x05sr8x0.jpg", "nice to met you"),
    ("Incorrect_Images/incorrect_parking.jpg", "No UNORTHERISED PARKING THE COMMITTEE"),

]


In [63]:
import pandas as pd
from PIL import Image

# NOTE: AITextCorrector is defined in a cell that appears further down in this
# notebook; that cell must have been executed before this one.
corrector = AITextCorrector()  # Loads the BLIP, inpainting and EasyOCR models

results = []

for img_path, gt_text in dataset:
    num_words = len(gt_text.split())  # Number of words in the reference text

    # Open via a context manager so the file handle is released after each
    # sample instead of lingering until garbage collection.
    with Image.open(img_path) as image:
        pred_text = corrector.recognize_text(image)  # OCR on the input image as-is

    word_acc, f1 = word_level_accuracy(gt_text, pred_text)
    print(gt_text, pred_text)

    results.append({
        "Text Length": num_words,
        "F1 Score": f1,
        "Word Accuracy": word_acc
    })

# One row per evaluated image.
df_results = pd.DataFrame(results)


Using device: cpu


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

OCR Output: SOTP
SOTP SOTP
OCR Output: LIMITLESS PosiBLITEs
LIMITLESS POSSIBBITIES LIMITLESS PosiBLITEs
OCR Output: HAPPP  Hanpdday Birthday
HAPPP  Hanpdday Birthday HAPPP  Hanpdday Birthday
OCR Output: te Mice met Yeil
nice to met you te Mice met Yeil
OCR Output: No UNORTHERISED PARKING THE (OMMITTEE
No UNORTHERISED PARKING THE COMMITTEE No UNORTHERISED PARKING THE (OMMITTEE


In [64]:
# Plain-text dump of the per-image metrics (one row per dataset entry).
print(df_results)

   Text Length  F1 Score  Word Accuracy
0            1      1.00           1.00
1            2      0.50           0.50
2            3      1.00           1.00
3            4      0.25           0.25
4            5      0.80           0.80


In [54]:
# Average the F1 score over all samples that share the same reference word
# count, relabel the columns for presentation, and emit the summary as LaTeX.
column_labels = {"Text Length": "Number of Words", "F1 Score": "Avg F1 Score"}
grouped = (
    df_results
    .groupby("Text Length")
    .agg({"F1 Score": "mean"})
    .reset_index()
    .rename(columns=column_labels)
)

print(grouped.to_latex(index=False))


\begin{tabular}{rr}
\toprule
Number of Words & Avg F1 Score \\
\midrule
1 & 1.000000 \\
3 & 0.800000 \\
5 & 0.800000 \\
\bottomrule
\end{tabular}



In [44]:
import torch
import gc
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image, ImageDraw
import numpy as np
import easyocr
import networkx as nx
from scipy.optimize import linear_sum_assignment
from accelerate import infer_auto_device_map, dispatch_model

class AITextCorrector:
    """Detect, "correct" and re-render text found in an image.

    The pipeline combines three models:
      * EasyOCR for text detection and recognition,
      * a BLIP image-captioning model for the caption / correction step,
      * a Stable Diffusion inpainting pipeline for redrawing text regions.
    """

    # PIL modes whose ``np.array`` form is passed to EasyOCR unchanged
    # (greyscale, RGB, RGBA). Any other mode (palette "P", bilevel "1",
    # "CMYK", ...) is converted to RGB first, because its raw array is not
    # pixel colour data in a layout an OCR engine expects.
    _OCR_NATIVE_MODES = ("L", "RGB", "RGBA")

    def __init__(self, blip_model="Salesforce/blip-image-captioning-base",
                 model_name="stabilityai/stable-diffusion-2-inpainting"):
        """
        Load the captioning, inpainting and OCR models.

        Args:
            blip_model: Hugging Face id of the BLIP captioning checkpoint.
            model_name: Hugging Face id of the Stable Diffusion inpainting
                checkpoint.
        """
        # Apple MPS is not probed here (that check was commented out in the
        # original code); only CUDA or CPU is selected.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        print("Using device:", self.device)

        # Captioning model (BLIP).
        self.blip_processor = BlipProcessor.from_pretrained(blip_model)
        self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model).to(self.device)

        # Inpainting pipeline (Stable Diffusion).
        self.model = StableDiffusionInpaintPipeline.from_pretrained(model_name).to(self.device)

        # OCR engine used for both bounding boxes and recognised text.
        self.easyocr_model = easyocr.Reader(['en'])

    def _to_ocr_array(self, image):
        """Convert an image-like object into an array for EasyOCR.

        Objects exposing a PIL-style ``mode`` outside ``_OCR_NATIVE_MODES``
        are converted to RGB first; anything else goes straight to
        ``np.array``.
        """
        mode = getattr(image, "mode", None)
        if mode is not None and mode not in self._OCR_NATIVE_MODES:
            image = image.convert("RGB")
        return np.array(image)

    def _read_text(self, image):
        """Run EasyOCR on ``image`` and return its raw result list."""
        return self.easyocr_model.readtext(self._to_ocr_array(image))

    def detect_text_boxes(self, image):
        """
        Detect text regions with EasyOCR.

        Returns:
            A list of dicts with keys ``"coordinates"`` (first element of
            each OCR result) and ``"text"`` (second element).
        """
        return [
            {"coordinates": result[0], "text": result[1]}
            for result in self._read_text(image)
        ]

    def recognize_text(self, image):
        """
        Recognise the text in the image with EasyOCR.

        Returns:
            All recognised fragments joined with single spaces, in the order
            EasyOCR reported them. The string is also printed.
        """
        recognized_text = " ".join(result[1] for result in self._read_text(image))
        print(f"OCR Output: {recognized_text}")
        return recognized_text

    def generate_caption(self, image):
        """
        Generate a descriptive caption for the image using BLIP.
        """
        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
        pixel_values = inputs["pixel_values"]
        with torch.no_grad():
            outputs = self.blip_model.generate(pixel_values=pixel_values)
        return self.blip_processor.decode(outputs[0], skip_special_tokens=True)

    def correct_text(self, extracted_text, caption, image):
        """
        Produce the "corrected" text for an OCR fragment using BLIP.

        NOTE(review): the prompt built from ``extracted_text`` and ``caption``
        is tokenised by the processor, but only ``pixel_values`` is passed to
        ``generate()``, so the prompt does not influence the output. Feeding
        the tokens to a captioning model would make it echo and continue the
        prompt rather than follow it, so the behaviour is left unchanged.
        Confirm the intended model and prompting scheme.
        """
        prompt = f"Correct this text: {extracted_text} in context: {caption}"
        inputs = self.blip_processor(images=image, text=prompt, return_tensors="pt").to(self.device)
        pixel_values = inputs["pixel_values"]
        with torch.no_grad():
            outputs = self.blip_model.generate(pixel_values=pixel_values)
        return self.blip_processor.decode(outputs[0], skip_special_tokens=True)

    def create_mask(self, image_size, coordinates):
        """
        Create a binary mask ('L' mode) for one text region.

        Args:
            image_size: ``(width, height)`` of the mask to create.
            coordinates: Polygon vertices of the region as (x, y) points.

        Returns:
            An image that is 255 inside the polygon and 0 elsewhere.
        """
        mask = Image.new('L', image_size, 0)
        draw = ImageDraw.Draw(mask)
        draw.polygon([tuple(point) for point in coordinates], outline=255, fill=255)
        return mask

    def graph_based_text_alignment(self, detected_boxes):
        """
        Run Hungarian matching over the pairwise centroid distances of boxes.

        The cost matrix holds the Euclidean distance between box centroids
        (zero on the diagonal). The returned list is indexed by the row
        indices of the assignment.

        NOTE(review): for a square cost matrix SciPy returns the row indices
        in ascending order, so the result keeps the input order and the
        column assignment is unused. Confirm whether a reordering (e.g.
        reading order) was intended.

        Args:
            detected_boxes: Dicts as produced by ``detect_text_boxes``.

        Returns:
            A new list containing the same box dicts.
        """
        if len(detected_boxes) == 0:
            return []

        # Compute each centroid once instead of once per (i, j) pair.
        centroids = np.array(
            [np.mean(box['coordinates'], axis=0) for box in detected_boxes],
            dtype=float,
        )
        deltas = centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        cost_matrix = np.linalg.norm(deltas, axis=-1)

        row_ind, _col_ind = linear_sum_assignment(cost_matrix)
        return [detected_boxes[i] for i in row_ind]

    def inpaint_text(self, image, mask, corrected_text):
        """
        Inpaint the masked region with the Stable Diffusion pipeline,
        prompting it to render ``corrected_text``.

        Returns:
            The first image produced by the pipeline.
        """
        return self.model(prompt=f"Generate text '{corrected_text}' in a matching style", image=image, mask_image=mask, num_inference_steps=50, guidance_scale=7.5).images[0]

    def run_pipeline(self, image):
        """
        Run the complete text correction pipeline.

        Steps: detect text boxes, caption the image, then for every box whose
        "corrected" text differs from the OCR text, inpaint that region and
        paste it back through the region mask.

        Returns:
            A corrected copy of ``image``; the input is not modified.
        """
        text_boxes = self.detect_text_boxes(image)
        corrected_image = image.copy()

        if not text_boxes:
            # No text found: skip the captioning pass and return the copy.
            return corrected_image

        caption = self.generate_caption(image)
        aligned_boxes = self.graph_based_text_alignment(text_boxes)

        for box in aligned_boxes:
            original_text = box["text"]
            corrected_text = self.correct_text(original_text, caption, image)

            if corrected_text.strip() == original_text.strip():
                continue  # Skip if no correction needed

            # The mask is created at the image's size, so it needs no resizing.
            mask = self.create_mask(image.size, box["coordinates"])
            inpainted = self.inpaint_text(corrected_image, mask, corrected_text)

            # Resize the pipeline output to the image size when they differ.
            if inpainted.size != corrected_image.size:
                inpainted = inpainted.resize(corrected_image.size, Image.LANCZOS)

            # Only pixels inside the region mask are replaced.
            corrected_image.paste(inpainted, (0, 0), mask)

        return corrected_image
