In [None]:
pip install transformers diffusers torch torchvision torchaudio pillow easyocr safetensors numpy accelerate --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
[31mERROR: Could not find a version that satisfies the requirement easyocr (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for easyocr[0m[31m
[0m

In [None]:
pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.9/422.9 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image, ImageDraw
import easyocr
import numpy as np


class ImprovedTextCorrector:
    """OCR -> text correction -> inpainting pipeline for fixing text in images.

    Text regions are located with EasyOCR, every detected string is passed to
    a T5 model for correction, and regions whose wording actually changed are
    repainted with a Stable Diffusion inpainting pipeline.
    """

    def __init__(self, inpaint_model_path, t5_model_path="vennify/t5-base-grammar-correction"):
        """Load the OCR reader, the inpainting pipeline and the T5 model/tokenizer.

        Args:
            inpaint_model_path: Identifier or path handed to
                ``StableDiffusionInpaintPipeline.from_pretrained``.
            t5_model_path: Identifier or path handed to the T5 model and
                tokenizer ``from_pretrained`` loaders.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.ocr_model = easyocr.Reader(['en'])
        self.inpaint_model = StableDiffusionInpaintPipeline.from_pretrained(
            inpaint_model_path,
            torch_dtype=torch.float32,
            use_safetensors=False
        ).to(device)
        # The T5 model is left on its default device; correct_text feeds it
        # tensors that are not moved either, so the two stay consistent.
        self.t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)

    @staticmethod
    def _comparison_key(text):
        """Return the lower-cased alphanumeric words of ``text``.

        Used to decide whether two strings differ in wording rather than only
        in punctuation, letter case or spacing.
        """
        kept = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
        return kept.lower().split()

    def preprocess_text(self, text):
        """Drop every character that is neither alphanumeric nor whitespace.

        Args:
            text: Raw string reported by OCR.

        Returns:
            The filtered string (also printed for debugging).
        """
        clean_text = ''.join(e for e in text if e.isalnum() or e.isspace())
        print(f"Preprocessed text: '{clean_text}'")  # Print preprocessed text for debugging
        return clean_text

    def detect_and_localize(self, image):
        """Run OCR on ``image`` and return the detected text boxes.

        Args:
            image: Image object convertible with ``np.array``.

        Returns:
            A list of dicts with keys ``"coordinates"`` (first element of each
            OCR result, later drawn as a polygon) and ``"text"`` (second
            element of each OCR result).
        """
        image_np = np.array(image)
        ocr_results = self.ocr_model.readtext(image_np)
        return [{"coordinates": result[0], "text": result[1]} for result in ocr_results]

    def correct_text(self, text):
        """Ask the T5 model to correct ``text``.

        Args:
            text: Preprocessed OCR text.

        Returns:
            The model's suggestion when it changes the wording; otherwise the
            original ``text`` unchanged. Suggestions that differ only in
            punctuation, letter case or spacing are treated as "no change",
            as is an empty suggestion or empty input.
        """
        # Nothing to correct: do not let the model invent text for an empty box.
        if not text.strip():
            return text

        # Simplified input prompt for text correction
        input_text = f"grammar: {text}"
        input_ids = self.t5_tokenizer(input_text, return_tensors="pt").input_ids

        # Generate corrected text
        outputs = self.t5_model.generate(input_ids, max_length=50)
        corrected_text = self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Debug print: Check what the model corrected
        print(f"Original text: '{text}' -> Corrected text: '{corrected_text}'")

        # preprocess_text strips punctuation, so a suggestion that merely adds
        # it back (e.g. 'HAPPPT' -> 'HAPPPT!') is not a real correction and
        # must not trigger an expensive inpainting run.
        corrected_key = self._comparison_key(corrected_text)
        if not corrected_key or corrected_key == self._comparison_key(text):
            print(f"No meaningful correction for: {text}")
            return text  # Return original text if correction is invalid

        # Log the word-by-word replacement for clarity (pairs are aligned by
        # position; extra words on either side are not reported).
        for original, corrected in zip(text.split(), corrected_text.split()):
            if original != corrected:
                print(f"Replaced word: '{original}' -> '{corrected}'")

        return corrected_text

    def create_mask(self, image_size, coordinates):
        """Build a mask that is white inside the text polygon and black elsewhere.

        Args:
            image_size: ``(width, height)`` of the mask to create.
            coordinates: Sequence of points describing the polygon to fill.

        Returns:
            A mode ``'L'`` image of size ``image_size``.
        """
        mask = Image.new('L', image_size, 0)
        draw = ImageDraw.Draw(mask)
        draw.polygon([tuple(point) for point in coordinates], outline=255, fill=255)
        return mask

    def inpaint_text(self, image, mask, prompt):
        """Run the inpainting pipeline on ``image`` within ``mask``.

        Args:
            image: Image to repaint.
            mask: Mask image; white marks the area to regenerate.
            prompt: Text prompt for the inpainting pipeline.

        Returns:
            The pipeline's first output image, resized to the input size.
        """
        # Prepare image and mask for inpainting
        image = image.convert("RGB")
        mask = mask.convert("L")

        # Round both dimensions up to the next multiple of 8
        width, height = image.size
        new_width = ((width - 1) // 8 + 1) * 8
        new_height = ((height - 1) // 8 + 1) * 8
        image = image.resize((new_width, new_height), Image.LANCZOS)
        # Nearest-neighbour keeps the mask strictly 0/255; an interpolating
        # filter would introduce intermediate grey values along the edges.
        mask = mask.resize((new_width, new_height), Image.NEAREST)

        # NOTE(review): height/width are not passed to the pipeline; check
        # whether the installed diffusers version then generates at its own
        # default resolution instead of (new_width, new_height) - confirm.
        inpainted_image = self.inpaint_model(
            prompt=prompt,
            image=image,
            mask_image=mask,
            num_inference_steps=50,
            guidance_scale=7.5
        ).images[0]

        # Resize back to original dimensions
        inpainted_image = inpainted_image.resize((width, height), Image.LANCZOS)
        return inpainted_image

    def run_pipeline(self, image):
        """Correct every detected text region of ``image``.

        Args:
            image: Input image.

        Returns:
            A copy of ``image`` in which each region with a changed text has
            been replaced by the inpainted result; the input is not modified.
        """
        # Detect and localize text
        text_boxes = self.detect_and_localize(image)
        corrected_image = image.copy()

        for box in text_boxes:
            # Preprocess and correct text
            original_text = self.preprocess_text(box["text"])
            corrected_text = self.correct_text(original_text)

            # Skip inpainting if no correction is made
            if corrected_text.strip() == original_text.strip():
                continue

            # Create mask for text region
            mask = self.create_mask(image.size, box["coordinates"])

            # Define inpainting prompt
            inpaint_prompt = f"A clear, legible text saying '{corrected_text}' in the same style and color as the surrounding text"

            # Inpaint on the running result so earlier corrections are kept
            inpainted_region = self.inpaint_text(corrected_image, mask, inpaint_prompt)

            # The mask already has the size of the image, so it can be used
            # directly to copy only the text region back.
            corrected_image.paste(inpainted_region, (0, 0), mask)

        return corrected_image


# Example usage
if __name__ == "__main__":
    # Build the corrector around the Stable Diffusion inpainting checkpoint.
    text_corrector = ImprovedTextCorrector("runwayml/stable-diffusion-inpainting")

    # Load the sample picture, run OCR -> correction -> inpainting on it,
    # and write the result next to the notebook.
    input_image = Image.open(
        "/content/fixing-text-gibberish-in-ai-generated-images-v0-uvsmxm4zerib1.png.webp"
    )
    output_image = text_corrector.run_pipeline(input_image)
    output_image.save("corrected_image.png")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Preprocessed text: 'HAPPPT'
Original text: 'HAPPPT' -> Corrected text: 'HAPPPT!'
Replaced word: 'HAPPPT' -> 'HAPPPT!'


  0%|          | 0/50 [00:00<?, ?it/s]

Preprocessed text: 'Hanpdday'
Original text: 'Hanpdday' -> Corrected text: 'Hanpdday is Hanpdday.'


  0%|          | 0/50 [00:00<?, ?it/s]

Preprocessed text: 'Birthday'
Original text: 'Birthday' -> Corrected text: 'Birthday! Birthday!'
Replaced word: 'Birthday' -> 'Birthday!'


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# login() is interactive: it renders a widget in the cell output and waits
# for input, so this cell cannot run unattended.
# NOTE(review): presumably required for model downloads in later cells - confirm.
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, BlipProcessor, BlipForConditionalGeneration
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image, ImageDraw
import easyocr
import numpy as np

class EnhancedTextCorrector:
    """Caption-aware OCR -> text correction -> inpainting pipeline.

    Text regions are located with EasyOCR, a BLIP model captions the whole
    image, a T5 model corrects each detected string using that caption as
    context, and regions whose wording actually changed are repainted with a
    Stable Diffusion inpainting pipeline.
    """

    def __init__(self, inpaint_model_path, t5_model_path="google/flan-t5-large", blip_model_path="Salesforce/blip-image-captioning-base"):
        """Load the OCR reader, inpainting pipeline, T5 and BLIP models.

        Args:
            inpaint_model_path: Identifier or path handed to
                ``StableDiffusionInpaintPipeline.from_pretrained``.
            t5_model_path: Identifier or path for the T5 model and tokenizer.
            blip_model_path: Identifier or path for the BLIP processor and model.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision is only requested when a GPU is present; the CPU
        # fallback keeps full precision so the pipeline remains runnable.
        inpaint_dtype = torch.float16 if device == "cuda" else torch.float32

        self.ocr_model = easyocr.Reader(['en'])
        self.inpaint_model = StableDiffusionInpaintPipeline.from_pretrained(
            inpaint_model_path,
            torch_dtype=inpaint_dtype
        ).to(device)
        self.t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path).to(device)
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
        self.blip_processor = BlipProcessor.from_pretrained(blip_model_path)
        self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_path).to(device)

    @staticmethod
    def _comparison_key(text):
        """Return the lower-cased alphanumeric words of ``text``.

        Used to decide whether two strings differ in wording rather than only
        in punctuation, letter case or spacing.
        """
        kept = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
        return kept.lower().split()

    def preprocess_text(self, text):
        """Return ``text`` without characters that are neither alphanumeric nor whitespace."""
        clean_text = ''.join(e for e in text if e.isalnum() or e.isspace())
        return clean_text

    def detect_and_localize(self, image):
        """Run OCR on ``image`` and return the detected text boxes.

        Returns:
            A list of dicts with keys ``"coordinates"`` (first element of each
            OCR result, later drawn as a polygon) and ``"text"`` (second
            element of each OCR result).
        """
        image_np = np.array(image)
        ocr_results = self.ocr_model.readtext(image_np)
        return [{"coordinates": result[0], "text": result[1]} for result in ocr_results]

    def generate_caption(self, image):
        """Generate a caption for ``image`` with the BLIP model.

        Returns:
            The decoded caption string (special tokens removed).
        """
        inputs = self.blip_processor(images=image, return_tensors="pt").to(self.blip_model.device)
        with torch.no_grad():
            outputs = self.blip_model.generate(**inputs)
        caption = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
        return caption

    def correct_text(self, text, caption):
        """Ask the T5 model to correct ``text`` given the image ``caption``.

        Args:
            text: Preprocessed OCR text.
            caption: Image caption used as context in the prompt.

        Returns:
            The model's suggestion when it changes the wording; otherwise the
            original ``text`` unchanged. Suggestions that differ only in
            punctuation, letter case or spacing are treated as "no change",
            as is an empty suggestion or empty input.
        """
        # Nothing to correct: do not let the model invent text for an empty box.
        if not text.strip():
            return text

        input_text = f"Context: {caption}. Correct this text: {text}"
        input_ids = self.t5_tokenizer(input_text, return_tensors="pt").input_ids.to(self.t5_model.device)

        outputs = self.t5_model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
        corrected_text = self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        print(f"Original: '{text}' -> Corrected: '{corrected_text}'")

        # preprocess_text strips punctuation, so a suggestion that only adds
        # it back is not a real correction and must not trigger inpainting.
        corrected_key = self._comparison_key(corrected_text)
        if not corrected_key or corrected_key == self._comparison_key(text):
            return text
        return corrected_text

    def create_mask(self, image_size, coordinates):
        """Build a mode ``'L'`` mask, white inside the polygon ``coordinates``.

        Args:
            image_size: ``(width, height)`` of the mask to create.
            coordinates: Sequence of points describing the polygon to fill.
        """
        mask = Image.new('L', image_size, 0)
        draw = ImageDraw.Draw(mask)
        draw.polygon([tuple(point) for point in coordinates], outline=255, fill=255)
        return mask

    def inpaint_text(self, image, mask, prompt):
        """Run the inpainting pipeline on ``image`` within ``mask``.

        Args:
            image: Image to repaint.
            mask: Mask image; white marks the area to regenerate.
            prompt: Text prompt for the inpainting pipeline.

        Returns:
            The pipeline's first output image, resized to the input size.
        """
        image = image.convert("RGB")
        mask = mask.convert("L")

        # Round both dimensions up to the next multiple of 8.
        width, height = image.size
        new_width, new_height = ((width - 1) // 8 + 1) * 8, ((height - 1) // 8 + 1) * 8
        image = image.resize((new_width, new_height), Image.LANCZOS)
        # Nearest-neighbour keeps the mask strictly 0/255; an interpolating
        # filter would introduce intermediate grey values along the edges.
        mask = mask.resize((new_width, new_height), Image.NEAREST)

        # NOTE(review): height/width are not passed to the pipeline; check
        # whether the installed diffusers version then generates at its own
        # default resolution instead of (new_width, new_height) - confirm.
        inpainted_image = self.inpaint_model(
            prompt=prompt,
            image=image,
            mask_image=mask,
            num_inference_steps=50,
            guidance_scale=7.5
        ).images[0]

        return inpainted_image.resize((width, height), Image.LANCZOS)

    def run_pipeline(self, image):
        """Correct every detected text region of ``image``.

        Returns:
            A copy of ``image`` in which each region with a changed text has
            been replaced by the inpainted result; the input is not modified.
        """
        text_boxes = self.detect_and_localize(image)
        caption = self.generate_caption(image)
        corrected_image = image.copy()

        for box in text_boxes:
            original_text = self.preprocess_text(box["text"])
            corrected_text = self.correct_text(original_text, caption)

            # Unchanged text needs no repainting.
            if corrected_text.strip() == original_text.strip():
                continue

            mask = self.create_mask(image.size, box["coordinates"])
            inpaint_prompt = f"High-quality text saying '{corrected_text}' in a style matching the image."
            # Inpaint on the running result so earlier corrections are kept.
            inpainted_region = self.inpaint_text(corrected_image, mask, inpaint_prompt)

            # The mask already has the size of the image, so it can be used
            # directly to copy only the text region back.
            corrected_image.paste(inpainted_region, (0, 0), mask)

        return corrected_image

# Example usage
if __name__ == "__main__":
    # Build the caption-aware corrector around the SD2 inpainting checkpoint.
    text_corrector = EnhancedTextCorrector("stabilityai/stable-diffusion-2-inpainting")

    # Load the sample picture, run the full pipeline on it,
    # and write the result next to the notebook.
    input_image = Image.open(
        "/content/Firefly-photograph-of-a-street-sign-on-a-busy-road-near-a-billboard-that-says-hello-techcrunch-reade.jpg.webp"
    )
    output_image = text_corrector.run_pipeline(input_image)
    output_image.save("enhanced_firefly.png")


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

Original: 'HELLLOO' -> Corrected: 'HELLLOO'
Original: 'PIESPSPARE' -> Corrected: 'PIESPSPARE'
Original: 'PPPDUD' -> Corrected: 'PPPDUD'
Original: 'Honanee' -> Corrected: 'Honanee'
Original: '20' -> Corrected: 'o'clock'


  0%|          | 0/50 [00:00<?, ?it/s]

Original: 'Adobe Fir' -> Corrected: 'Adobe Fir'
