In [1]:
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from qwen_vl_utils import process_vision_info
from pdf2image import convert_from_path
from byaldi import RAGMultiModalModel
import torch
from PIL import Image

In [2]:
# Load the Qwen2-VL 2B instruct checkpoint in full precision (float32).
model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
# Switch to inference mode; assigned back so the cell never ends on a bare
# expression that would print the model repr.
model = model.eval()

# The matching processor builds the chat prompt and the image tensors for the model.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load the "vidore/colpali" checkpoint through byaldi's RAGMultiModalModel.
# NOTE(review): `RAG` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying for this load.
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
def resize_image(image, max_size=(800, 800)):
    """Return a downscaled copy of ``image`` that fits within ``max_size``.

    Args:
        image: A PIL image (anything exposing ``copy()`` and ``thumbnail()``).
        max_size: ``(width, height)`` bounding box passed to ``thumbnail``.

    Returns:
        A new image object; the image passed in is left unmodified.
    """
    # thumbnail() resizes the object it is called on, so operate on a copy to
    # avoid silently shrinking the caller's original image.
    resized = image.copy()
    resized.thumbnail(max_size, Image.Resampling.LANCZOS)  # keeps aspect ratio
    return resized

In [24]:
def extract_text(image, prompt=None, max_new_tokens=1024):
    """Run the vision-language model on ``image`` and return the decoded reply.

    Args:
        image: The image to read; it is placed in the chat message as the
            ``"image"`` content item.
        prompt: Instruction sent alongside the image. When ``None``, a default
            transcription instruction is used so the model is explicitly asked
            to output the text in the image.
        max_new_tokens: Upper bound on generated tokens (default 1024).

    Returns:
        The decoded string for the single message in the batch.
    """
    if prompt is None:
        prompt = (
            "Extract all the text from this image exactly as it appears, "
            "keeping the original language and line breaks. Output only the text."
        )

    # One user turn holding the image plus the instruction to transcribe it.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }]

    # Render the chat prompt as a string and collect the vision inputs.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    # Keep the tensors on whatever device the model lives on (CPU here), so the
    # function keeps working if the model is later moved to an accelerator.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Drop the first len(in_ids) tokens of each sequence so only the newly
    # generated continuation is decoded, not the prompt.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    extracted_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)

    return extracted_text[0]

In [40]:
def search_keyword(extracted_text, search_query):
    """Report whether ``search_query`` occurs in ``extracted_text`` (case-insensitive).

    Args:
        extracted_text: Text to search in; ``None`` is treated as empty text.
        search_query: Keyword to look for; surrounding whitespace is ignored and
            ``None`` is treated as an empty query.

    Returns:
        A human-readable status message: found, not found, or a request to
        enter a keyword when the query is blank.
    """
    query = (search_query or "").strip()
    if not query:
        # The empty string is a substring of every string, so without this
        # guard a blank query would always be reported as "found".
        return "Please enter a keyword to search."

    # casefold() is the Unicode-aware form of lower() for caseless matching.
    haystack = (extracted_text or "").casefold()
    if query.casefold() in haystack:
        return f'Keyword "{query}" found in text!'

    return f'Keyword "{query}" not found in text.'

In [41]:
def structure_text(extracted_text):
    """Split text into paragraphs and return them as labelled text and as JSON.

    Paragraphs are separated by blank lines (``'\\n\\n'``). Chunks that are empty
    after stripping are discarded *before* numbering, so paragraph ids are
    always consecutive starting at 1.

    Args:
        extracted_text: The raw text; ``None`` is treated as empty text.

    Returns:
        A tuple ``(structured_output, json_output)`` where ``structured_output``
        is a string of ``"Paragraph N: ..."`` entries joined by blank lines and
        ``json_output`` is ``{"paragraphs": [{"id": N, "content": ...}, ...]}``.
    """
    # Filter first, then number: numbering the raw split would leave gaps in
    # the ids wherever the text contains consecutive blank lines.
    paragraphs = [
        chunk.strip()
        for chunk in (extracted_text or "").split('\n\n')
        if chunk.strip()
    ]
    numbered = list(enumerate(paragraphs, start=1))

    structured_output = '\n\n'.join(f'Paragraph {number}: {para}' for number, para in numbered)
    json_output = {"paragraphs": [{"id": number, "content": para} for number, para in numbered]}

    return structured_output, json_output


In [42]:
def ocr_and_search(image, search_query):
    """Resize an image, extract its text, search it, and structure the result.

    Args:
        image: The uploaded image, or ``None`` when nothing was supplied.
        search_query: Keyword to look for in the extracted text.

    Returns:
        A 4-tuple ``(extracted_text, search_result, structured_text, json_output)``
        matching the four output components of the interface. When ``image`` is
        ``None`` the text fields are empty and ``search_result`` carries a
        message asking for an image.
    """
    if image is None:
        # Without this guard the resize step fails with an AttributeError on
        # None, which surfaces to the user as an opaque error.
        return "", "No image provided. Please upload an image first.", "", {"paragraphs": []}

    # Shrink the image first to keep model inference time manageable.
    resized_image = resize_image(image)

    extracted_text = extract_text(resized_image)

    # A missing query is passed on as an empty string rather than None.
    search_result = search_keyword(extracted_text, search_query or "")

    structured_text, json_output = structure_text(extracted_text)

    return extracted_text, search_result, structured_text, json_output

In [43]:
# Gradio front end for ocr_and_search: two inputs (image + keyword) feed the
# function, and its four return values fill the four output components in order.
interface = gr.Interface(
    title="OCR and Search with Image Resizing and Text Structuring",
    description="Upload an image containing Hindi/English text, resize it for optimal performance, extract the text, structure it, and convert to JSON format. You can also search for a keyword in the extracted text.",
    fn=ocr_and_search,
    # The image is handed to the function as a PIL image; the keyword as a string.
    inputs=[
        gr.Image(type="pil", label="Upload Image (JPEG, PNG, etc.)"),
        gr.Textbox(label="Enter keyword for search"),
    ],
    # Order matches the tuple returned by ocr_and_search.
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Search Result"),
        gr.Textbox(label="Structured Text"),
        gr.JSON(label="JSON Output"),
    ],
)

In [44]:
# Start the app. share=True also requests a temporary public gradio.live URL
# in addition to the local server, so anyone with that link can reach the app
# while this kernel is running.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7866
Running on public URL: https://4a7b64094ee545edb9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


