In [None]:
#| default_exp testbed/ocr_idefics

In [None]:
#| export

from __future__ import annotations


In [None]:
#| hide
# %reload_ext autoreload
# %autoreload 0


# Testing `Idefics` OCR for Comics
> Accuracy Enhancements for OCR in `PanelCleaner`


# Prologue

In [None]:
#| export
import functools
import subprocess
from pathlib import Path
from typing import Any
from typing import Literal
from typing import TypeAlias

import pcleaner.config as cfg
import pcleaner.ocr.ocr as ocr
import torch
from pcleaner.ocr.ocr_tesseract import TesseractOcr
from PIL import Image
from rich.console import Console
from transformers import AutoProcessor
from transformers import Idefics2ForConditionalGeneration


In [None]:
import os
import re
import sys
from typing import cast

import fastcore.all as FC
import fastcore.xtras  # patch pathlib.Path with some utils
import transformers
from fastcore.test import *  # type: ignore


Requires `transformers` version 4.40 or newer.

In [None]:
transformers.__version__

'4.40.2'

In [None]:
# %pip install git+https://github.com/huggingface/transformers

Flash attention doesn't support Metal [#412](https://github.com/Dao-AILab/flash-attention/issues/412) (but see [metal-flash-attention](https://github.com/philipturner/metal-flash-attention))



In [None]:
# %env FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
# %pip install flash-attn --no-build-isolation

# Helpers

In [None]:
# pretty print by default
# %load_ext rich

In [None]:
#| exporti

console = Console(width=104, tab_size=4, force_jupyter=True)
cprint = console.print


Force reload of `experiments` module

In [None]:
# Reload the `experiments` module when it is already loaded so that edits made to
# it are picked up without restarting the kernel. The module object is fetched
# from `sys.modules` because the bare name `pcleaner` is not guaranteed to be
# bound in this namespace (`import pcleaner.config as cfg` only binds `cfg`).
import importlib

_experiments_module_name = 'pcleaner._testbed.testbed.experiments'
if _experiments_module_name in sys.modules:
    importlib.reload(sys.modules[_experiments_module_name])

# Always (re)bind the package name and refresh the star-imported names, so the
# namespace reflects the freshly (re)loaded module in both cases.
import pcleaner._testbed.testbed.experiments
from pcleaner._testbed.testbed.experiments import *


In [None]:
#| exporti

import pcleaner._testbed.testbed.experiments as exp_testbed
from pcleaner._testbed.testbed.experiments import *
from pcleaner._testbed.testbed.helpers import RenderJSON
import pcleaner._testbed.testbed.web_server as web_server


In [None]:
#| exporti

def load_image(img_or_path) -> Image.Image:
    """Return a PIL image for `img_or_path`.

    An existing `Image.Image` is handed back unchanged; a `str` or `Path` is
    opened with `Image.open`. Any other type raises `ValueError`.
    """
    if isinstance(img_or_path, Image.Image):
        return img_or_path
    if isinstance(img_or_path, (str, Path)):
        return Image.open(img_or_path)
    raise ValueError(f"img_or_path must be a path or PIL.Image, got: {type(img_or_path)}")


In [None]:
#| exporti

def get_gpu_vram(total=True):
    """Query `nvidia-smi` for GPU memory and return its output as a string.

    Args:
        total: when True query `memory.total`, otherwise `memory.used`.

    Returns:
        The stripped text printed by `nvidia-smi` (CSV, no header, no units), or
        the string "Failed to get VRAM" if the command fails or cannot be run.
    """
    field = "memory.total" if total else "memory.used"
    # Argument list instead of a shell string: no shell is spawned for a fixed command.
    command = ["nvidia-smi", f"--query-gpu={field}", "--format=csv,noheader,nounits"]
    try:
        return subprocess.check_output(command).decode('utf-8').strip()
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing/non-executable `nvidia-smi` binary, which
        # without a shell is reported as FileNotFoundError/PermissionError.
        return "Failed to get VRAM"


# GPU

In [None]:
!nvidia-smi

Wed May 22 11:58:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090 Ti     On  | 00000000:65:00.0 Off |                  Off |
|  0%   46C    P8              23W / 480W |      3MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
cprint(f"Nvidia card total VRAM: {get_gpu_vram()}  MiB")
cprint(f"Nvidia card current VRAM: {get_gpu_vram(False)}  MiB")

----
# Idefics basic usage

Not working: this basic example fails with a CUDA memory error, so the cells below are left commented out.

In [None]:
# # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
# # image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
# # image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
# # image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

# image1 = Image.open("media/Statue-of-Liberty-Island-New-York-Bay.webp")
# image2 = Image.open("media/Skyline-Chicago.webp")
# image3 = Image.open("media/Golden-Gate-Bridge-San-Francisco.webp")

In [None]:
# processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

In [None]:
# model = Idefics2ForConditionalGeneration.from_pretrained(
#         "HuggingFaceM4/idefics2-8b",
#         torch_dtype=torch.bfloat16,
#         #_attn_implementation="flash_attention_2",
#         )


In [None]:
# assert isinstance(model, PreTrainedModel)
# model.to(DEVICE)
# type(model), model.device


Create inputs

In [None]:
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "image"},
#             {"type": "text", "text": "What do we see in this image?"},
#         ]
#     },
#     {
#         "role": "assistant",
#         "content": [
#             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
#         ]
#     },
#     {
#         "role": "user",
#         "content": [
#             {"type": "image"},
#             {"type": "text", "text": "And how about this image?"},
#         ]
#     },       
# ]


In [None]:
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image1, image2], return_tensors="pt")
# inputs = {k: v.to(DEVICE) for k, v in inputs.items()}


Generate

In [None]:
# generated_ids = model.generate(**inputs, max_new_tokens=500)
# generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

# print(generated_texts)
# # ['User: What do we see in this image? \nAssistant: In this image, we can see the city of New York, and more specifically the Statue of Liberty. \nUser: And how about this image? \nAssistant: In this image we can see buildings, trees, lights, water and sky.']


In [None]:
# [
#     'User: What do we see in this image? '
#     'Assistant: In this image, we can see the city of New York, and more specifically the Statue of Liberty. '
#     'User: And how about this image? '
#     'Assistant: In this image we can see buildings, trees, lights, water and sky.'
#     ]

----
# Idefics experiments


# Experiment directory

In [None]:
EXP_DIR = "../experiment"


# Setup ngrok (Colab)

The experiments can generate hundreds of images, and maintaining the **PIL** images in memory is not efficient. All the generated images are cached and visualized on demand through a URL pointing to the local cache. This approach prevents the kernel from being overloaded with **PIL** images, with the front-end responsible for fetching the image and the backend web server (not the kernel) for serving the image in another process. This method is quick and efficient. As an added bonus, the saved notebook remains lean and fit; it doesn't store the Base64 versions of all the output cell images.

Unfortunately, this approach does not work as is in **Colab**. Google Colab runs on an older Ubuntu 18.04 VM, so all the usual networking challenges with Docker, or whatever VMs Google is using, apply. Google also goes to great lengths to avoid exposing its internal architecture. We have two options:
- Let the Jupyter kernel serve the images itself, which is slow and memory-consuming.
- Use a tunnel to map localhost (server) to whatever IP and port the front-end (the browser you're currently using) is running on. We can use **ngrok** for this, but *ngrok* is a commercial service that has been abused and now requires confirmation the first time the tunnel connects, which can be inconvenient for the user. It also requires the user to open a free account and obtain an auth token.

You choose.

If the notebook is running in Colab and ngrok has been successfully installed and the tunnel has been created, the default setting is USE_PIL=False. You can set the environment variable USE_PIL=True to force the use of PIL images, but note that in certain circumstances, Colab will complain because the free tiers are usually memory constrained.


In [None]:
os.environ['USE_PIL'] = 'False'
os.environ['USE_TUNNEL'] = 'False'


In [None]:
SERVER = None
if (os.environ['USE_PIL'].lower() == 'false') and os.environ['USE_TUNNEL'].lower() == 'true':
    SERVER = web_server.setup_ngrok(web_server.WebServerBottle, Path(EXP_DIR))


# Idefics

## Idefics initialization

In [None]:
#| exporti

def _setup_processor():
    """Load the processor of the `HuggingFaceM4/idefics2-8b` checkpoint.

    Image splitting is turned off for the processor.
    """
    processor_options = dict(
        do_image_splitting=False,  # cropped boxes are usually small
    )
    processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", **processor_options)
    return processor

### Quantization

In [None]:
#| exporti

QuantT: TypeAlias = Literal['bfloat16'] | Literal['8bits'] | Literal['4bits']

def _setup_model(quant: QuantT, flashattn: bool=True):
    kwargs: dict = dict(
        torch_dtype=torch.bfloat16,
    )
    if quant == 'bfloat16':
        pass
    else:
        from transformers import BitsAndBytesConfig
        quantization_config = None
        if quant == '8bits':
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
            )
        if quant == '4bits':
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16
            )
        if quantization_config is not None:
            kwargs.update(quantization_config=quantization_config)
    if flashattn:
        kwargs.update(_attn_implementation="flash_attention_2")
    model = Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b", 
        device_map='auto', 
        **kwargs)
    return model

In [None]:
#| exporti

# Prompt templates for the OCR request. The `{}` placeholder is filled with the
# page language through `str.format` where the template is used.
#
# NOTE: this first assignment is overwritten by the later `prompt_text_tmpl = (...)`
# in this same cell, so it is never the active template. The commented-out
# variants below are earlier experiments kept for reference.
prompt_text_tmpl = (
        "Please perform optical character recognition (OCR) on this image, which displays "
        "speech balloons from a comic book. The text is in {}. Extract the text and "
        "format it as follows: transcribe in standard sentence case, avoid using all capital "
        "letters. Provide the transcribed text clearly and double check the sentence is not all capital letters.")

# prompt_text_tmpl = ("Please perform optical character recognition (OCR) on this image, which displays "
#         f"speech balloons from a manga comic. The text is in {}. Extract the text and "
#         "format it without newlines. Provide the transcribed text clearly.")

# prompt_text_tmpl = ("Please perform optical character recognition (OCR) on this image, which displays "
#         "speech balloons from a comic book. The text is in {}. Extract the text and "
#         "format it as follows: transcribe in standard sentence case (avoid using all capital "
#         "letters) and use asterisks to denote any words that appear in bold within the image. "
#         "Provide the transcribed text clearly.")

# prompt_text_tmpl = ("Please perform optical character recognition (OCR) on this image, which displays "
#         "speech balloons from a comic book. The text is in {}. Extract the text and "
#         "format it as follows: transcribe in standard sentence case, capitalized. Avoid using "
#         "all capital letters. In comics, it is common to use two hyphens '--' to interrupt a sentence. "
#         "Retain any hyphens as they appear in the original text. Provide the transcribed text "
#         "clearly, ensuring it is capitalized where appropriate, including proper nouns.")

# Active template: this is the last assignment to `prompt_text_tmpl`, so it is the
# value captured by `default_prompt_text_tmpl` below.
prompt_text_tmpl = (
        "Please perform optical character recognition (OCR) on this image, which displays "
        "speech balloons from a comic book. The text is in {}. Extract the text and "
        "format it as follows: transcribe in standard sentence case, capitalized. Avoid using "
        "all capital letters, but ensure it is capitalized where appropriate, including proper nouns. "
        "Provide the transcribed text clearly. Double check the text is not all capital letters.")


# prompt_text_tmpl = (
#         "Please perform optical character recognition (OCR) on this image, which contains speech "
#         "balloons from a comic book. The text is in English. Carefully transcribe the text, "
#         "ensuring that you preserve the original formatting and line breaks as they appear "
#         "in the speech balloon."
# )

# Snapshot of the active template.
default_prompt_text_tmpl = prompt_text_tmpl

## IdeficsOCR

In [None]:
#| export

class IdeficsOCR:
    """Callable OCR engine that prompts the Idefics2 model to transcribe an image.

    The processor and the model are stored on the class (`PROCESSOR`, `MODEL`) and
    are therefore shared by every instance: they are loaded the first time an
    instance is created and reused afterwards.
    """
    # Prompt template; `{}` is replaced with the language name in `__call__`.
    prompt_text_tmpl: str = default_prompt_text_tmpl
    # Shared processor / model, loaded lazily by `__init__` or the `setup_*` classmethods.
    PROCESSOR: Any = None
    MODEL: Any = None


    @classmethod
    def setup_processor(cls):
        """Load the processor, store it in `cls.PROCESSOR` and return it."""
        cls.PROCESSOR = _setup_processor()
        return cls.PROCESSOR

    @classmethod
    def setup_model(cls, quant: QuantT='bfloat16', flashattn: bool=True):
        """Load the model with the given options, store it in `cls.MODEL` and return it."""
        cls.MODEL = _setup_model(quant, flashattn)
        return cls.MODEL

    @staticmethod
    def is_idefics_available() -> bool:
        """Availability hook checked by `__call__`; currently always True."""
        return True

    def show_info(self):
        """Print the model type, requested quantization, device and used VRAM.

        The quantization shown is the one requested for this instance; if the
        shared model was already loaded, it may have been loaded with other options.
        """
        cprint(
            f"{'model':>17}: {type(self.MODEL)}\n"
            f"{'quantization':>17}: {self.quant!r}\n"
            f"{'device':>17}: {repr(self.MODEL.device)}\n"
            f"{'current VRAM':>17}: {get_gpu_vram(False)}  MiB\n"
    )


    def __init__(self,
            lang: str | None = None,
            prompt_text_tmpl: str|None = None,
            quant: QuantT | None = None,
            flashattn: bool | None = None,
        ):
        """Create an OCR callable, loading the shared processor/model if needed.

        Args:
            lang: default language name inserted into the prompt template.
            prompt_text_tmpl: template overriding the class-level default.
            quant: quantization used if the model has to be loaded; defaults to 'bfloat16'.
            flashattn: whether to request flash attention if the model has to be
                loaded; `None` means True.

        `quant` and `flashattn` have no effect when `MODEL` is already loaded.
        """
        self.lang = lang
        self.prompt_text_tmpl = prompt_text_tmpl or self.prompt_text_tmpl
        self.quant = quant or 'bfloat16'#'4bits'
        # Only fall back to True when the argument is omitted: `flashattn or True`
        # would turn an explicit False into True.
        self.flashattn = True if flashattn is None else flashattn
        if self.PROCESSOR is None:
            type(self).setup_processor()
        if self.MODEL is None:
            type(self).setup_model(self.quant, self.flashattn)
        self.device = self.MODEL.device

    def _generation_args(self, image: Image.Image, resulting_messages: list[dict]):
        """Build the prompt and the keyword arguments for `MODEL.generate`.

        Returns:
            `(prompt, generation_args)`, where `generation_args` holds the decoding
            options plus the processor outputs moved to `self.device`.
        """
        prompt = self.PROCESSOR.apply_chat_template(resulting_messages, add_generation_prompt=True)
        inputs = self.PROCESSOR(text=prompt, images=[image], return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Decoding settings. `decoding_strategy` is a manual switch: with "Greedy"
        # the sampling values below are not used.
        max_new_tokens = 512
        repetition_penalty = 1.2
        decoding_strategy = "Greedy"
        temperature = 0.4
        top_p = 0.8

        generation_args = {
            "max_new_tokens": max_new_tokens,
            "repetition_penalty": repetition_penalty,
        }

        assert decoding_strategy in [
            "Greedy",
            "Top P Sampling",
        ]

        if decoding_strategy == "Greedy":
            generation_args["do_sample"] = False
        elif decoding_strategy == "Top P Sampling":
            generation_args["temperature"] = temperature
            generation_args["do_sample"] = True
            generation_args["top_p"] = top_p

        generation_args.update(inputs)
        return prompt, generation_args

    def __call__(
        self,
        img_or_path: Image.Image | Path | str,
        prompt_text: str | None = None,
        lang: str | None = None,
        config: str | None = None,
        show_prompt: bool = False,
        **kwargs,
    ) -> str:
        """Run OCR on an image and return the generated text.

        Args:
            img_or_path: a PIL image or a path to an image file.
            prompt_text: full prompt to use instead of the formatted template.
            lang: language name for the template; falls back to `self.lang`.
            config: accepted but not used by this method.
            show_prompt: when True, print the prompt and the decoded output.
            **kwargs: accepted but not used by this method.

        Raises:
            RuntimeError: if `is_idefics_available()` returns False.
        """
        if not self.is_idefics_available():
            raise RuntimeError("Idefics is not installed or not found.")
        text = prompt_text or self.prompt_text_tmpl.format(lang or self.lang)
        resulting_messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": text},
                ]
            }
        ]
        image = load_image(img_or_path)
        prompt, generation_args = self._generation_args(image, resulting_messages)
        generated_ids = self.MODEL.generate(**generation_args)
        # Drop the prompt tokens so that only the newly generated part is decoded.
        prompt_length = generation_args["input_ids"].size(1)
        generated_texts = self.PROCESSOR.batch_decode(
            generated_ids[:, prompt_length:], skip_special_tokens=True)
        if show_prompt:
            cprint("INPUT:", prompt, "|OUTPUT:", generated_texts)
        return generated_texts[0]#.strip('"')

    def postprocess_ocr(self, text):
        """Apply `remove_multiple_whitespaces` and join the resulting lines with spaces."""
        return ' '.join(remove_multiple_whitespaces(text).splitlines())


## IdeficsExperimentContext

In [None]:
#| export

class IdeficsExperimentContext(OCRExperimentContext):
    """`OCRExperimentContext` for the 'Idefics' OCR model.

    Manages loading and unloading of the model shared through `IdeficsOCR.MODEL`.
    """

    # NOTE(review): `lru_cache` on a method keys the cache on `self` as well and
    # keeps a reference to it for as long as the entry lives -- confirm this is acceptable.
    @functools.lru_cache()
    def mocr(self, lang: str):
        """Return the (cached) OCR processor for `lang`.

        An `IdeficsOCR` is created when `self.ocr_model` is 'Idefics'; otherwise the
        processor is looked up through `ocr.get_ocr_processor` for the configured engine.
        """
        if self.ocr_model == 'Idefics':
            proc = IdeficsOCR(lang)
        else:
            engine = self.engines[self.ocr_model]
            ocr_processor = ocr.get_ocr_processor(True, engine)
            proc = ocr_processor[lang2pcleaner(lang)]
            if isinstance(proc, TesseractOcr):
                proc.lang = lang2tesseract(lang)
        return proc

    def cleanup_model(self):
        """Drop the shared model and release cached CUDA memory."""
        import gc
        # Rebind to None instead of `del`, so the attribute never goes missing.
        IdeficsOCR.MODEL = None
        # Collect first: the CUDA cache can only give back memory of tensors
        # that have already been freed.
        gc.collect()
        torch.cuda.empty_cache()

    def setup_idefics(self, quant: QuantT = 'bfloat16', flashattn: bool = True):
        """(Re)load the shared model with the given options.

        The processor is loaded only if missing; an already loaded model is
        released before the new one is loaded.
        """
        if IdeficsOCR.PROCESSOR is None:
            IdeficsOCR.setup_processor()
        if IdeficsOCR.MODEL is not None:
            self.cleanup_model()
        if IdeficsOCR.MODEL is None:
            IdeficsOCR.setup_model(quant=quant, flashattn=flashattn)

    def show(self):
        """Print the base context info followed by quantization, attention and VRAM."""
        super().show()
        model = IdeficsOCR.MODEL
        if model is None:
            # Happens when the context was created with `setup_idefics=False`.
            cprint(f"{'Idefics model':>17}: not loaded\n")
            return
        # Named `model_cfg` to avoid shadowing the module alias `cfg`.
        model_cfg = model.config
        qcfg = getattr(model_cfg, 'quantization_config', None)
        if qcfg is None:
            quant = 'bfloat16'
        else:
            quant = '4bits' if qcfg.load_in_4bit else '8bits'
        cprint(
            f"{'Quantization':>17}: {quant!r}\n"
            f"{'Flash attention 2':>17}: {model_cfg._attn_implementation == 'flash_attention_2'}\n"
            f"{'VRAM':>17}: {get_gpu_vram(False)}/{get_gpu_vram()} MiB\n"
        )

    def __init__(self,
            root_dir: Path | str | None = None,
            quant: QuantT = 'bfloat16',
            flashattn: bool = True,
            *,
            config: cfg.Config | None = None,
            server: web_server.WebServer | None = None,
            run_name: str = 'Idefics-crop-post',
            setup_idefics: bool = True,
        ):
        """Create the context and, unless `setup_idefics` is False, load the model.

        Args:
            root_dir, config, server, run_name: forwarded to `OCRExperimentContext.__init__`.
            quant, flashattn: options passed to `setup_idefics`.
            setup_idefics: when False, the model is not (re)loaded here.
        """
        super().__init__('Idefics', root_dir, config=config, server=server, run_name=run_name)
        if setup_idefics:
            self.setup_idefics(quant, flashattn)



# Context

In [None]:
CONTEXT = IdeficsExperimentContext(EXP_DIR)  # quantization 'bfloat16'  # Colab pro with A100 or L4, bfloat16 and FlashAttention
# CONTEXT = IdeficsExperimentContext(EXP_DIR, '4bits', False)  # Free tier, T4 GPUs don't support FlashAttention
# CONTEXT = IdeficsExperimentContext(EXP_DIR, '4bits')  # Linux Ampere
CONTEXT.show()


In [None]:
DEVICE = IdeficsOCR.MODEL.device
DEVICE

device(type='cuda', index=0)

# Test images


In [None]:
IMAGE_PATHS = CONTEXT.image_paths

[f"{i:02}: {_.name}" for i,_ in enumerate(IMAGE_PATHS)]


['00: Action_Comics_1960-01-00_(262).JPG',
 '01: Adolf_Cap_01_008.jpg',
 '02: Barnaby_v1-028.png',
 '03: Barnaby_v1-029.png',
 '04: Buck_Danny_-_12_-_Avions_Sans_Pilotes_-_013.jpg',
 '05: Cannon-292.jpg',
 '06: Contrato_con_Dios_028.jpg',
 '07: Erase_una_vez_en_Francia_02_88.jpg',
 '08: FOX_CHILLINTALES_T17_012.jpg',
 '09: Furari_-_Jiro_Taniguchi_selma_056.jpg',
 '10: Galactus_12.jpg',
 '11: INOUE_KYOUMEN_002.png',
 '12: MCCALL_ROBINHOOD_T31_010.jpg',
 '13: MCCAY_LITTLENEMO_090.jpg',
 '14: Mary_Perkins_On_Stage_v2006_1_-_P00068.jpg',
 '15: PIKE_BOYLOVEGIRLS_T41_012.jpg',
 '16: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1.png',
 '17: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1_K.png',
 '18: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_2.png',
 '19: Spirou_Et_Fantasio_Integrale_06_1958_1959_0025_0024.jpg',
 '20: Strange_Tales_172005.jpg',
 '21: Strange_Tales_172021.jpg',
 '22: Tarzan_014-21.JPG',
 '23: Tintin_21_Les_Bijoux_de_la_Castafiore_page_39.jp

# EXP_RUN

In [None]:
EXP_RUN = CONTEXT.experiment_run()
assert EXP_RUN is not None
RUN_NAME = EXP_RUN.name
RUN_NAME


'Idefics-crop-post'

# Base image


In [None]:
BASE_IMAGE_IDX: ImgIdT = cast(ImgIdT, CONTEXT.normalize_idx("Strange_Tales_172005.jpg"))

assert BASE_IMAGE_IDX is not None
img_path = CONTEXT.final(CONTEXT.image_paths[BASE_IMAGE_IDX])
assert img_path.exists()

img_visor = ImageContextVisor(CONTEXT, BASE_IMAGE_IDX)
img_visor


Output(layout=Layout(height='0px'))

VBox(children=(VBox(children=(Dropdown(index=20, layout=Layout(width='fit-content'), options={'Action_Comics_1…

Output()

In [None]:
page_lang = 'English'

IMAGE_CONTEXT = ImageContext(CONTEXT, BASE_IMAGE_IDX, page_lang=page_lang)
test_eq(IMAGE_CONTEXT.page_data is not None, True)
RenderJSON(IMAGE_CONTEXT.json_data, 360, 2)


# Box id


In [None]:
BOX_IDX = 0

# Idefics inference

In [None]:
page_lang = IMAGE_CONTEXT.page_lang

resulting_messages = [
    {
        "role": "user",
        "content": [{"type": "image"}] + [
            {"type": "text", "text": prompt_text_tmpl.format(page_lang)}
        ]
    }
]

In [None]:
def idefics_generation_args(image: Image.Image, resulting_messages: list[dict]):
    """Build the prompt and the `generate` keyword arguments for one image.

    Notebook-level counterpart of `IdeficsOCR._generation_args` that uses the
    shared `IdeficsOCR.PROCESSOR` and the global `DEVICE`.

    Returns:
        `(prompt, generation_args)`: the chat-template prompt and a dict with the
        decoding options followed by the processor outputs moved to `DEVICE`.
    """
    # Manual switch between the two supported decoding modes.
    decoding_strategy = "Greedy"
    supported_strategies = ["Greedy", "Top P Sampling"]
    sampling_options = {"temperature": 0.4, "do_sample": True, "top_p": 0.8}

    processor = IdeficsOCR.PROCESSOR
    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    encoded = processor(text=prompt, images=[image], return_tensors="pt")
    on_device = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    generation_args = dict(max_new_tokens=512, repetition_penalty=1.2)

    assert decoding_strategy in supported_strategies

    if decoding_strategy == "Greedy":
        generation_args["do_sample"] = False
    elif decoding_strategy == "Top P Sampling":
        generation_args.update(sampling_options)

    # Model inputs go last, after the decoding options.
    generation_args.update(on_device)
    return prompt, generation_args


### Crop methods

In [None]:
image_experiment = ExperimentOCR.from_image(CONTEXT, RUN_NAME, IMAGE_CONTEXT.image_idx)


In [None]:
method = CropMethod.INITIAL_BOX

result = cast(ResultOCR, image_experiment.result(BOX_IDX, method, ocr=False))
image = cast(Image.Image, result.image)


In [None]:
prompt, generation_args = idefics_generation_args(image, resulting_messages)
generated_ids = IdeficsOCR.MODEL.generate(**generation_args)

generated_texts = IdeficsOCR.PROCESSOR.batch_decode(
    generated_ids[:, generation_args["input_ids"].size(1):], skip_special_tokens=True)
cprint("INPUT:", prompt, "|OUTPUT:", generated_texts)


In [None]:
result.ocr = generated_texts[0]
result


0,1
,"Embodied by great gnarled cypress trees, the ancient manor stands alone on the outskirts of New Orleans, kept tidy by a white-haired old man known only as Bambu. 0.98"


----

In [None]:
method = CropMethod.INITIAL_BOX

result = cast(ResultOCR, image_experiment.result(BOX_IDX, method, ocr=False))
image = cast(Image.Image, result.image)

mocr: IdeficsOCR = cast(IdeficsOCR, CONTEXT.mocr(page_lang))
text = mocr(image, show_prompt=True)
result.ocr = mocr.postprocess_ocr(text)
result


0,1
,"EMBODIED BY GREAT GARLLED CYPRESS TREES, THE ANCIENT MANOR STANDS ALONE ON THE OUTSKIRTS OF NEW ORLEANS, KEPT TIDY BY A WHITE-HAIRING OLD MAN KNOWN ONLY AS BAMBU. 0.03"


In [None]:
image_experiment.result(BOX_IDX, CropMethod.PADDED_4)

0,1
,"Embowered by great gnarled cypress trees, the ancient manor stands alone on the outskirts of new orleans, kept tidy by a white-haired old man known only as bambu. 0.98"


In [None]:
image_experiment.result(BOX_IDX, CropMethod.PAD_8_FRACT_0_2)

----
# Visualize results

In [None]:
result_visor = ResultVisor(image_experiment)
result_visor


Output(layout=Layout(height='0px'))

HBox(children=(HBox(children=(Label(value='Box # (of 15):', layout=Layout(padding='0px 0px 0px 10px', width='i…

Output()

----
# Visualize Experiment

In [None]:
exp_visor = ExperimentVisor(image_experiment)
exp_visor


VBox(children=(VBox(children=(Dropdown(index=20, layout=Layout(width='fit-content'), options={'Action_Comics_1…

Output()

----
# EEAaO

In [None]:
idefics_experiment = ExperimentsVisor(
                        CONTEXT, 
                        'Idefics', 
                        image_idx=BASE_IMAGE_IDX, 
                        box_idx=13, 
                        method=CropMethod.DEFAULT_GREY_PAD
                    )
idefics_experiment


VBox(children=(HBox(children=(HBox(children=(HBox(children=(Dropdown(layout=Layout(width='fit-content'), optio…

Output()

# Colophon
----


In [None]:
import fastcore.all as FC
from nbdev.export import nb_export


In [None]:
if FC.IN_NOTEBOOK:
    nb_export('ocr_idefics.ipynb', '..')
