**docling**

In [3]:
import logging
import time
from pathlib import Path

In [4]:
!pip install docling



In [5]:
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

In [6]:
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

In [7]:
# Module-level logger; main() uses it to report how long conversion and export took.
_log = logging.getLogger(__name__)

In [8]:
# Value assigned to PdfPipelineOptions.images_scale in main(). Per the note in
# main(), a scale of 1 corresponds to a standard 72 DPI image.
IMAGE_RESOLUTION_SCALE = 2.0

In [9]:
def main(input_doc_path=Path("in.pdf"), output_dir=Path("/scratch")):
    """Convert a PDF with docling and export its images, Markdown and HTML.

    The following files are written into ``output_dir`` (``<stem>`` is the
    input file name without its extension):

    * ``<stem>-<page_no>.png`` for every page image,
    * ``<stem>-table-<n>.png`` / ``<stem>-picture-<n>.png`` for every table
      and picture item, numbered in iteration order starting at 1,
    * ``<stem>-with-images.md`` (images embedded),
    * ``<stem>-with-image-refs.md`` and ``<stem>-with-image-refs.html``
      (images referenced externally).

    Args:
        input_doc_path: Path of the PDF to convert. Defaults to ``in.pdf``.
        output_dir: Directory receiving the exported files; created if it
            does not exist. Defaults to ``/scratch``.
    """
    logging.basicConfig(level=logging.INFO)

    # Accept plain strings as well as Path objects.
    input_doc_path = Path(input_doc_path)
    output_dir = Path(output_dir)

    # Important: to work with page images we must keep them, otherwise the
    # DocumentConverter destroys them to free memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines
    # the scale of the images; scale=1 corresponds to a standard 72 DPI image.
    # The PdfPipelineOptions.generate_* flags select which document elements are
    # enriched with an image field.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # perf_counter is monotonic, so the measured duration is not affected by
    # wall-clock adjustments made while the conversion is running.
    start_time = time.perf_counter()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images; the page number in the file name comes from the page itself.
    for _, page in conv_res.document.pages.items():
        page_image_filename = output_dir / f"{doc_filename}-{page.page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save markdown with externally referenced pictures
    md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

    # Save HTML with externally referenced pictures
    html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
    conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

    elapsed = time.perf_counter() - start_time

    _log.info("Document converted and figures exported in %.2f seconds.", elapsed)


In [10]:
# Run the PDF conversion and export defined in main().
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [11]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: it ends up in version control
# and in every shared copy. Read it from the HF_TOKEN environment variable and
# fall back to an interactive prompt that does not echo the value.
# NOTE: the token that was previously hardcoded in this cell must be considered
# leaked and should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [12]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer


# Load the H2OVL Mississippi checkpoint, its config and its tokenizer from the Hub.
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally; only keep it for repositories you trust.
model_path = 'h2oai/h2ovl-mississippi-800m'
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    config=config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Inference only: switch to eval mode, then move the model to the CUDA device.
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)
# Passed to model.chat() by the image-description cells below. With
# do_sample=True the generated text can differ from one run to the next.
generation_config = {"max_new_tokens": 2048, "do_sample": True}



FlashAttention is not installed.


In [21]:
import re
from PIL import Image
# Load the Markdown export with externally referenced images; this is the
# "<stem>-with-image-refs.md" file that main() writes for in.pdf into /scratch.
with open(r"/scratch/in-with-image-refs.md", "r", encoding="utf-8") as file:
    content = file.read()

# Regex for Markdown image tags written exactly as ![Image](target); group 1
# captures the target (one or more characters up to the first ")").
pattern = r"!\[Image\]\(([^)]+)\)"

def extract_text(image_path):
    """Ask the loaded vision model (``model``) to describe one image.

    Uses the notebook-level ``model``, ``tokenizer`` and ``generation_config``
    (the H2OVL Mississippi checkpoint loaded in an earlier cell).

    Args:
        image_path: Image reference handed to ``model.chat`` unchanged.

    Returns:
        The response returned by ``model.chat``; it is also printed.
    """
    prompt = '<image>\nExplain the image details'
    # model.chat returns (response, history); the history is not needed here.
    description, _history = model.chat(
        tokenizer,
        image_path,
        prompt,
        generation_config,
        history=None,
        return_history=True,
    )
    print(description)
    return description

def replace_images_with_text(md_content):
    """Replace the target of every ``![Image](...)`` tag with a generated description.

    Every distinct image target matched by the module-level ``pattern`` is
    passed to ``extract_text`` once, and each tag pointing at that target is
    rewritten as ``![Image](<description>)``. If describing an image fails,
    the error is printed and the tags for that image are left untouched.

    Args:
        md_content: Markdown text to process.

    Returns:
        The Markdown text with the rewritten image tags.
    """
    # Cache per image target so a repeated image is sent to the model only once
    # and all of its tags receive the same description. None marks a failure.
    descriptions = {}

    def _describe(match):
        image_path = match.group(1)
        if image_path not in descriptions:
            try:
                print(image_path)
                descriptions[image_path] = extract_text(image_path)
            except Exception as e:
                # Best effort: report the failure and keep the original tag.
                print(f"Error processing {image_path}: {e}")
                descriptions[image_path] = None
        text = descriptions[image_path]
        # Substituting at the match position guarantees that the text being
        # replaced is exactly what the regex matched. The previous version
        # searched for a lowercase "![image](" tag that the pattern never
        # matches, so no replacement ever happened.
        return match.group(0) if text is None else f"![Image]({text})"

    return re.sub(pattern, _describe, md_content)

# Run the image-to-text replacement on the Markdown loaded above.
updated_content = replace_images_with_text(content)

# Write the result to example_updated.md (relative to the current working directory).
with open("example_updated.md", "w", encoding="utf-8") as file:
    file.write(updated_content)

print("Updated Markdown saved as 'example_updated.md'.")

/scratch/in-with-image-refs_artifacts/image_000000_c0b46806f193da768c0630099809b2c20c5b4a28f6cb9c17cc8deab40b1a3071.png
The image is a screenshot of the "Your Campaigns" page on the Mukund cake shop website. The page has a simple design with a gray background, and the navigation bar at the top contains icons that represent different sections of the page. 

The page header shows an option to "Create new campaign," with a blue button that says "Create new campaign." Below this, there are two areas labeled "Search for the campaign" and "All Platform." A search bar is present in the middle of the page, and below it, there's a list of ten options related to specific products and their corresponding "Date Range," "Clicks," "Budget," and "Location."

The "Campaigns" section contains two columns. On the left, there's a column for "On/Off" campaigns, with three options listed: "Bluberry cake Campaign," "Chocolate cake campaign," and "Brownie cake campaign." For each option, there's a list of ca

In [None]:
def process_image_path(image_path):
    """Return the model's description of the image at ``image_path``.

    Sends a fixed "explain the image" prompt to the notebook-level ``model``
    through ``model.chat`` (with ``tokenizer`` and ``generation_config``),
    prints the response and returns it.
    """
    answer, _ = model.chat(
        tokenizer,
        image_path,
        '<image>\nExplain the image details',
        generation_config,
        history=None,
        return_history=True,
    )
    print(answer)
    return answer

# Markdown image tag written as ![Image](target); group 1 lazily captures the
# text between the parentheses.
pattern = r"!\[Image\]\((.*?)\)"


def replace_image_paths(md_content):
    """Rewrite every ``![Image](target)`` tag using ``process_image_path``.

    For each match of ``pattern`` the captured target is passed to
    ``process_image_path`` and the tag becomes ``![Image](<result>)``.

    Args:
        md_content: Markdown text to process.

    Returns:
        The Markdown text with every matched tag rewritten.
    """
    return re.sub(
        pattern,
        lambda tag: f"![Image]({process_image_path(tag.group(1))})",
        md_content,
    )

# Read the Markdown export with externally referenced images (the
# "<stem>-with-image-refs.md" file that main() writes for in.pdf into /scratch).
with open(r"/scratch/in-with-image-refs.md", "r", encoding="utf-8") as file:
    content = file.read()
# Replace each image tag's target with the model-generated description.
updated_md_text = replace_image_paths(content)

# Prints the whole rewritten document into the cell output.
print("Updated Markdown:\n", updated_md_text)


# Save the rewritten Markdown as imageread.md (relative to the current working directory).
with open("imageread.md", "w", encoding="utf-8") as file:
    file.write(updated_md_text)

The image is a screenshot of the dashboard of a Craigslist listing platform. It shows various campaigns under the "Your Campaigns" section, with details of different types of campaigns, their status, and an option to create a new campaign. The top section of the dashboard has a blue background with white text. Each campaign is listed in a row with its title, campaign date range, status, budget, location, platform, and status. The last row of the dashboard is empty. There are also three status bars to the left side of the screen with the status of "All" and "Live now".
The illustration displays a blue button with white text on it, titled "Create new campaign". The button is positioned in the upper center area of the image and is accompanied by the button symbol.
This screenshot is of a website's advertisement for an Ad Campaign that allows customers to launch their ad campaign in just 4 easy steps. The website displays an image of the campaign's button.
This image is a screenshot of a w