## **Microsoft/Kosmos-2.5-Demo**

Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.

`Minimum Accelerator Needed: 1x T4 (*Free Tier GPU)`

*Notebook by: [prithivMLmods](https://huggingface.co/prithivMLmods) 🤗*

### **Install Packages**

In [None]:
%%capture
!pip install git+https://github.com/huggingface/transformers.git \
             git+https://github.com/huggingface/accelerate.git \
             git+https://github.com/huggingface/peft.git \
             transformers-stream-generator huggingface_hub albumentations \
             pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \
             python-docx torchvision safetensors matplotlib num2words \

!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \
             einops torch fpdf timm av decord bitsandbytes reportlab
#Hold tight, this will take around 2-3 minutes.

### **Run Microsoft-Kosmos-2.5 Demo**

In [None]:
import spaces
import torch
import gradio as gr
from PIL import Image, ImageDraw
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import re

# Pick the execution target: GPU with bfloat16 weights when CUDA is visible,
# otherwise CPU with float32 weights.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.bfloat16
else:
    device, dtype = "cpu", torch.float32

# ---------------------------
# Load Models Once (Startup)
# ---------------------------
# Both checkpoints are loaded eagerly at import time so that every request
# handler below can reuse the same model/processor objects.
print("Loading base model...")
base_repo = "microsoft/kosmos-2.5"
base_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    base_repo, device_map=device, dtype=dtype
)
base_processor = AutoProcessor.from_pretrained(base_repo)

print("Loading chat model...")
chat_repo = "microsoft/kosmos-2.5-chat"
chat_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
    chat_repo, device_map=device, dtype=dtype
)
chat_processor = AutoProcessor.from_pretrained(chat_repo)


# ---------------------------
# Utility Functions
# ---------------------------
def post_process_ocr(y, scale_height, scale_width, prompt="<ocr>"):
    """Turn raw ``<bbox>``-tagged model output into comma-separated records.

    Args:
        y: Decoded model output. Every occurrence of ``prompt`` is removed
            before parsing.
        scale_height: Factor applied to each y coordinate.
        scale_width: Factor applied to each x coordinate.
        prompt: Prompt string used for generation. When it contains ``<md>``
            the text is returned as-is (minus the prompt), because markdown
            output carries no bounding boxes.

    Returns:
        For OCR prompts, a string with one record per detected box, joined by
        ``"\\n"``. Each record is ``x0,y0,x1,y0,x1,y1,x0,y1,text`` -- the four
        corners of the scaled box (clockwise from top-left) followed by the
        text that came after the tag. Boxes with non-positive width or height
        are dropped. Returns ``""`` when no valid box is found.
    """
    y = y.replace(prompt, "")
    if "<md>" in prompt:
        return y

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    tags = re.findall(pattern, y)
    # split() yields the text before the first tag at index 0; dropping it
    # leaves exactly one text segment per tag, in the same order.
    texts = re.split(pattern, y)[1:]

    records = []
    for tag, text in zip(tags, texts):
        x0, y0, x1, y1 = (int(n) for n in re.findall(r"\d+", tag))
        if x0 >= x1 or y0 >= y1:
            continue  # degenerate box
        x0 = int(x0 * scale_width)
        y0 = int(y0 * scale_height)
        x1 = int(x1 * scale_width)
        y1 = int(y1 * scale_height)
        # Strip the segment: it usually ends with its own newline, which
        # would otherwise produce blank lines between records once joined.
        records.append(f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{text.strip()}")
    return "\n".join(records)


# ---------------------------
# Inference Functions
# ---------------------------
@spaces.GPU
def generate_markdown(image):
    """Transcribe a document image to markdown with the base model.

    Args:
        image: PIL image from the UI, or ``None`` when nothing was uploaded.

    Returns:
        The decoded generation with the ``<md>`` prompt removed and
        surrounding whitespace stripped, or a short instruction string when
        ``image`` is ``None``.
    """
    if image is None:
        return "Please upload an image."

    prompt = "<md>"
    inputs = base_processor(text=prompt, images=image, return_tensors="pt")

    # Remove the processor's `height`/`width` entries before calling
    # generate(). Markdown output carries no coordinates, so the values
    # themselves are not needed here.
    inputs.pop("height")
    inputs.pop("width")

    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    # Match the patch tensor to the dtype the model was loaded with.
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = base_model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_text[0].replace(prompt, "").strip()


@spaces.GPU
def generate_ocr(image):
    """Run OCR on a document image and draw the detected boxes.

    Args:
        image: PIL image from the UI, or ``None`` when nothing was uploaded.

    Returns:
        A ``(text, visualization)`` tuple. ``text`` is the output of
        ``post_process_ocr`` (coordinate/text records, one per line) and
        ``visualization`` is a copy of ``image`` with a red outline drawn for
        every record whose first eight fields parse as integers. When
        ``image`` is ``None`` the tuple is ``("Please upload an image.", None)``.
    """
    if image is None:
        return "Please upload an image.", None

    prompt = "<ocr>"
    inputs = base_processor(text=prompt, images=image, return_tensors="pt")

    # The processor reports the size it worked at; the ratio to the original
    # size maps predicted box coordinates back onto the uploaded image.
    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    # Match the patch tensor to the dtype the model was loaded with.
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = base_model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = base_processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Post-process OCR output
    output_text = post_process_ocr(generated_text[0], scale_height, scale_width)

    # Create visualization on a copy so the caller's image is left untouched.
    vis_image = image.copy()
    draw = ImageDraw.Draw(vis_image)

    for line in output_text.split("\n"):
        parts = line.split(",")
        if len(parts) < 8:
            # Blank lines and text fragments without a full coordinate prefix.
            continue
        try:
            coords = [int(p) for p in parts[:8]]
        except ValueError:
            # Not a coordinate record (e.g. a continuation of multi-line
            # text that happens to contain commas); skip it.
            continue
        # Drawing is deliberately outside the try so real rendering errors
        # surface instead of silently producing an image without boxes.
        draw.polygon(coords, outline="red", width=2)

    return output_text, vis_image


@spaces.GPU
def generate_chat_response(image, question):
    """Answer a question about a document image with the chat model.

    Args:
        image: PIL image from the UI, or ``None`` when nothing was uploaded.
        question: The user's question. ``None``, empty and whitespace-only
            values are all treated as "no question".

    Returns:
        The decoded text following the last ``ASSISTANT:`` marker (or the
        whole decoded text if the marker is absent), or a short instruction
        string when the image or question is missing.
    """
    if image is None:
        return "Please upload an image."
    # Guard against None as well as blank strings so a missing question
    # yields the friendly message rather than an AttributeError.
    if not question or not question.strip():
        return "Please ask a question."

    template = "<md>A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
    prompt = template.format(question)

    inputs = chat_processor(text=prompt, images=image, return_tensors="pt")

    # Remove the processor's `height`/`width` entries before calling
    # generate(). The answer is plain text, so no coordinate rescaling is
    # needed and the values themselves are unused.
    inputs.pop("height")
    inputs.pop("width")

    inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
    # Match the patch tensor to the dtype the model was loaded with.
    inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

    with torch.no_grad():
        generated_ids = chat_model.generate(
            **inputs,
            max_new_tokens=1024,
        )

    generated_text = chat_processor.batch_decode(generated_ids, skip_special_tokens=True)

    # Extract only the assistant's response
    result = generated_text[0]
    if "ASSISTANT:" in result:
        result = result.split("ASSISTANT:")[-1].strip()

    return result


# ---------------------------
# Gradio UI
# ---------------------------
# Build the three-tab demo UI. Components are created inside nested
# Row/Column contexts, so the order of the statements below defines the
# on-screen layout; the click handlers are wired up at the end of the
# Blocks context.
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    # Page header (static HTML).
    gr.HTML("""
    <div class="title" style="text-align: center">
        <h1>Microsoft-Kosmos-2.5-Demo</h1>
        <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
            Using Microsoft's Kosmos-2.5 for Image Content Extraction and Understanding
        </p>
    </div>
    """)

    with gr.Tabs():
        # Markdown Generation Tab: image input (left) -> markdown text (right)
        with gr.TabItem("Markdown Generation"):
            with gr.Row():
                with gr.Column():
                    md_image = gr.Image(type="pil", label="Upload Document Image")
                    # Sample receipt image hosted in the model repository.
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=md_image
                    )
                    md_button = gr.Button("Generate Markdown", variant="primary")
                with gr.Column():
                    md_output = gr.Textbox(
                        label="Generated Markdown",
                        lines=15,
                        max_lines=20,
                        show_copy_button=True
                    )

        # OCR Tab: image input (left) -> coordinate records and annotated image (right)
        with gr.TabItem("OCR with Bounding Boxes"):
            with gr.Row():
                with gr.Column():
                    ocr_image = gr.Image(type="pil", label="Upload Document Image")
                    # Same sample receipt as the markdown tab.
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=ocr_image
                    )
                    ocr_button = gr.Button("Extract Text with Coordinates", variant="primary")
                with gr.Column():
                    # Text and visualization sit side by side in one row.
                    with gr.Row():
                        ocr_text = gr.Textbox(
                            label="Extracted Text with Coordinates",
                            lines=10,
                            show_copy_button=True
                        )
                        ocr_vis = gr.Image(label="Visualization (Red boxes show detected text)")

        # Chat Tab: image + free-text question (left) -> answer (right)
        with gr.TabItem("Document Q&A (Chat)"):
            with gr.Row():
                with gr.Column():
                    chat_image = gr.Image(type="pil", label="Upload Document Image")
                    # Same sample receipt as the other tabs.
                    gr.Examples(
                        examples=["https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"],
                        inputs=chat_image
                    )
                    chat_question = gr.Textbox(
                        label="Ask a question about the document",
                        placeholder="e.g., What is the total amount on this receipt?",
                        lines=2
                    )
                    # Canned questions that fill the question box when clicked.
                    gr.Examples(
                        examples=[
                            "What is the total amount on this receipt?",
                            "What items were purchased?",
                            "When was this receipt issued?",
                            "What is the subtotal?"
                        ],
                        inputs=chat_question
                    )
                    chat_button = gr.Button("Get Answer", variant="primary")
                with gr.Column():
                    chat_output = gr.Textbox(
                        label="Answer",
                        lines=8,
                        show_copy_button=True
                    )

    # Event handlers: each button passes its tab's inputs to the matching
    # inference function and writes the result(s) to that tab's outputs.
    md_button.click(
        fn=generate_markdown,
        inputs=[md_image],
        outputs=[md_output]
    )

    # generate_ocr returns (text, image), matching the two outputs here.
    ocr_button.click(
        fn=generate_ocr,
        inputs=[ocr_image],
        outputs=[ocr_text, ocr_vis]
    )

    chat_button.click(
        fn=generate_chat_response,
        inputs=[chat_image, chat_question],
        outputs=[chat_output]
    )

# Start the app only when run as a script/notebook cell, not on import.
# NOTE(review): debug=True presumably keeps the cell attached and prints
# errors inline -- confirm against the Gradio launch() documentation.
if __name__ == "__main__":
    demo.launch(debug=True)

### **Demo Inference Image**


![Screenshot 2025-09-03 at 18-24-36 Gradio.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/gcIoHenyx3sem1qNzuFgT.png)
