In [1]:
!pip install -q gradio transformers torch pillow python-dotenv huggingface_hub docling-core opencv-python-headless torch_xla

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m371.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.7/73.7 kB[0m [31m959.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.3/130.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Fetch the secret named 'HF_TOKEN' through google.colab's `userdata` helper.
# NOTE(review): HF_TOKEN is not referenced by any other cell visible here —
# confirm whether it is still needed before keeping this cell.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [5]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
import gradio as gr
import time
import pandas as pd

# Model setup: pick the compute device once, then load the processor and the
# vision-to-sequence model from the same checkpoint and move the model to it.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID)
model = model.to(device)

# Processing function
def _strip_task_icon(task_type):
    """Return ``task_type`` without a leading decorative icon and whitespace."""
    text = task_type.strip()
    start = 0
    # Skip whitespace and non-ASCII symbols (emoji, variation selectors) only;
    # ASCII punctuation and any letters/digits mark the start of the prompt.
    while start < len(text) and (
        text[start].isspace()
        or (not text[start].isascii() and not text[start].isalnum())
    ):
        start += 1
    # Fall back to the stripped label if it consisted of symbols only.
    return text[start:] or text


def process_image(image, task_type):
    """Run the model on one image and return its DocTags and Markdown.

    Args:
        image: The image handed to ``processor`` (as ``images=[image]``) and,
            for non-formula tasks, paired with the DocTags for conversion.
        task_type: The instruction for the model. A leading icon (as used by
            the dropdown labels) is removed before the text is used, so
            "🧮 Convert formula to latex." and "Convert formula to latex."
            behave the same.

    Returns:
        A ``(doctags, md_content, elapsed)`` tuple where ``doctags`` is the
        decoded model output, ``md_content`` is either the output wrapped in
        ``$$`` (formula task) or the Markdown export of the parsed document,
        and ``elapsed`` is a string such as ``"1.23 seconds"``.
    """
    start_time = time.time()

    # The dropdown labels are decorated with an icon; the prompt and the
    # formula check below both need the plain instruction text.
    instruction = _strip_task_icon(task_type)

    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": instruction}]
    }]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    # The generated sequence starts with the prompt tokens; keep only the
    # newly produced part.
    prompt_length = inputs.input_ids.shape[1]
    trimmed_generated_ids = generated_ids[:, prompt_length:]

    # Special tokens are kept because the DocTags markup itself must survive
    # decoding; only the end-of-utterance marker is dropped.
    doctags = processor.batch_decode(trimmed_generated_ids, skip_special_tokens=False)[0].lstrip()
    doctags = doctags.replace("<end_of_utterance>", "").strip()

    if instruction.lower() == "convert formula to latex.":
        # Formula output is rendered directly as a display-math block.
        md_content = f"$$\n{doctags}\n$$"
    else:
        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
        doc = DoclingDocument(name="Document")
        doc.load_from_doctags(doctags_doc)
        md_content = doc.export_to_markdown()

    processing_time = time.time() - start_time
    return doctags, md_content, f"{processing_time:.2f} seconds"

# Batch processing function
def process_multiple_images(images, task_type):
    """Run ``process_image`` on every uploaded item and collect the results.

    Args:
        images: Iterable of uploads. Each item may be a PIL image, a file
            path, or an object exposing the path as ``.name`` (the forms a
            multi-file upload widget can deliver). ``None`` is treated as
            an empty batch.
        task_type: Instruction forwarded unchanged to ``process_image``.

    Returns:
        A list with one dict per image, keyed by "Image #" (1-based),
        "DocTags", "Markdown" and "Processing Time".
    """
    from os import PathLike

    results = []
    for idx, item in enumerate(images or [], 1):
        if isinstance(item, Image.Image):
            image = item
        else:
            # File uploads arrive as paths or temp-file wrappers rather than
            # decoded images, so they must be opened before inference.
            if isinstance(item, (str, PathLike)):
                path = item
            else:
                path = getattr(item, "name", item)
            with Image.open(path) as opened:
                # convert() loads the pixel data, so the file can be closed.
                image = opened.convert("RGB")

        doctags, md_content, processing_time = process_image(image, task_type)
        results.append({
            "Image #": idx,
            "DocTags": doctags,
            "Markdown": md_content,
            "Processing Time": processing_time
        })
    return results

# Gradio Interface Function
def gradio_fn(upload_type, single_image, multiple_images, task):
    """Dispatch a UI submission to single-image or batch processing.

    Returns a 4-tuple ``(doctags, markdown, time_text, table)``. ``table`` is
    a DataFrame of per-image results in batch mode and ``None`` otherwise.
    When the selected mode has no upload, the text fields are empty and
    ``time_text`` carries the "No image uploaded." notice.
    """
    wants_single = upload_type == "Single Image"
    wants_batch = upload_type == "Multiple Images"

    if wants_single and single_image:
        tags, markdown, elapsed = process_image(single_image, task)
        return tags, markdown, elapsed, None

    if wants_batch and multiple_images:
        rows = process_multiple_images(multiple_images, task)
        separator = "\n\n---\n\n"
        joined_tags = separator.join(row["DocTags"] for row in rows)
        joined_markdown = separator.join(row["Markdown"] for row in rows)

        # Per-image timings are strings like "1.23 seconds"; add up the
        # numeric part of each.
        seconds = 0
        for row in rows:
            seconds += float(row["Processing Time"].split()[0])

        table = pd.DataFrame(rows)
        return joined_tags, joined_markdown, f"{seconds:.2f} seconds", table

    return "", "", "No image uploaded.", None

# Dropdown labels: each supported instruction prefixed with an icon for the UI.
_TASK_ICONS_AND_PROMPTS = (
    ("📄", "Convert this page to docling."),
    ("📊", "Convert this table to OTSL."),
    ("💻", "Convert code to text."),
    ("🧮", "Convert formula to latex."),
    ("📈", "Convert chart to OTSL."),
    ("📑", "Extract all section header elements on the page."),
)
task_options = [f"{icon} {prompt}" for icon, prompt in _TASK_ICONS_AND_PROMPTS]

# Build the Gradio UI. Components are declared inside nested layout context
# managers (rows, columns, tabs); the event wiring at the bottom connects
# them to `update_visibility` and `gradio_fn`.
with gr.Blocks(theme="soft") as demo:
    # Header Section
    gr.Markdown(
        """
        <div style='text-align: center;'>
            <h1>🚀 SmolDocling OCR Application </h1>
            <p>Extract text, tables, formulas, and more from images with ease.</p>
            <img src='https://huggingface.co/front/assets/huggingface_logo.svg' width='150'>
        </div>
        """
    )

    # Input Section
    with gr.Row():
        # Left column: upload-mode selector plus the two upload widgets.
        with gr.Column(scale=1):
            gr.Markdown("### 🖼️ Upload Image(s)")
            upload_type = gr.Radio(
                ["Single Image", "Multiple Images"],
                label="Choose Upload Type",
                value="Single Image",
                elem_id="upload_type_radio",
                info="Select whether to upload one or multiple images"
            )
            # Shown initially, matching the radio's default of "Single Image".
            single_image = gr.Image(
                type="pil",
                label="Upload Single Image",
                visible=True,
                elem_id="single_image_input",
                show_download_button=True
            )
            # Starts hidden; `update_visibility` reveals it when the radio
            # switches to "Multiple Images".
            multiple_images = gr.File(
                file_count="multiple",
                file_types=["image"],
                label="Upload Multiple Images",
                visible=False,
                elem_id="multiple_images_input"
            )

        # Right column: task dropdown and the submit button.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Task Selection")
            task = gr.Dropdown(
                choices=task_options,
                label="Select OCR Task",
                value=task_options[0],
                elem_id="task_dropdown",
                info="Choose the type of OCR task to perform"
            )
            submit_btn = gr.Button(
                "🚀 Process Images",
                variant="primary",
                elem_id="submit_btn"
            )

    # Output Section
    # One tab per value returned by `gradio_fn`, in the same order as the
    # `outputs` list of the submit handler below.
    gr.Markdown("### 📤 Results")
    with gr.Tabs():
        with gr.TabItem("🔍 DocTags"):
            doctags_output = gr.Textbox(
                label="Extracted DocTags",
                placeholder="DocTags will appear here after processing...",
                lines=10,
                elem_id="doctags_output",
                show_copy_button=True
            )
        with gr.TabItem("📜 Markdown"):
            markdown_output = gr.Markdown(
                label="Markdown Output",
                value="Markdown output will appear here after processing..."
            )
        with gr.TabItem("⏱️ Processing Time"):
            proc_time_output = gr.Textbox(
                label="Total Processing Time",
                placeholder="Processing time will appear here after processing...",
                elem_id="proc_time_output"
            )
        with gr.TabItem("📋 Detailed Results (Batch)"):
            # `gradio_fn` returns a DataFrame here only in batch mode and
            # None otherwise.
            df_output = gr.Dataframe(
                label="Detailed Results for Multiple Images",
                elem_id="df_output"
            )

    # Dynamic visibility logic for Single vs Multiple Image Upload
    def update_visibility(upload_type):
        """Return visibility updates so only the selected upload widget shows.

        The result maps each upload component to a `gr.update(visible=...)`
        whose flag is True when `upload_type` names that component's mode.
        """
        return {
            single_image: gr.update(visible=upload_type == "Single Image"),
            multiple_images: gr.update(visible=upload_type == "Multiple Images")
        }

    # Re-evaluate widget visibility whenever the radio selection changes.
    upload_type.change(
        fn=update_visibility,
        inputs=upload_type,
        outputs=[single_image, multiple_images]
    )

    # Submit Button Action
    # Both upload widgets are always passed; `gradio_fn` picks the one that
    # matches `upload_type`.
    submit_btn.click(
        fn=gradio_fn,
        inputs=[upload_type, single_image, multiple_images, task],
        outputs=[doctags_output, markdown_output, proc_time_output, df_output]
    )

# Launch the demo
# Called without arguments. Per the captured cell output, Colab then enables
# share=True automatically, and debug=True must be passed to launch() to show
# errors in the notebook.
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8d5ee4883e58f1affa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


