In [None]:
# %reload_ext autoreload
# %autoreload 0


## Settings for Google Colab

To efficiently manage the image sources for our experiments, we recommend mounting your Google Drive and storing the experiment files there. If you are not familiar with Colab or Jupyter environments, it's best to leave these settings at their default values to ensure smooth operation.

- Set `MOUNT_DRIVE` to `True` to enable mounting Google Drive in the Colab environment.
This allows the notebook to access files stored in your Google Drive.

- `GDRIVE_MOUNT_POINT` specifies the local directory in Colab where your Google Drive will be mounted.
This acts as the root directory for accessing any files within your Google Drive from the notebook.

- `PANELCLEANER_IN_GDRIVE` specifies the path within your Google Drive where the PanelCleaner project is located.
This path is used to access or store any files related to the PanelCleaner project directly from Google Drive.


In [None]:
# Mount Google Drive when running in Colab (checked by the "Mount Google Drive" cell).
MOUNT_DRIVE = True
# Directory under /content where Google Drive is mounted.
GDRIVE_MOUNT_POINT = 'drive'
# Location of the PanelCleaner project, relative to the Drive mount point.
PANELCLEANER_IN_GDRIVE = 'MyDrive/Shared/PanelCleaner'

# install (Colab)


In [None]:
import fastcore.all as FC
import os
import re
import sys
from pathlib import Path

from rich import print as cprint
from rich.text import Text

def info(msg: str):
    """Print `msg` between two runs of ten underscores.

    The span from offset 0 to 6 of the message is styled "bold red"
    (the start/end offsets passed to `Text.stylize`).
    """
    banner = Text(msg)
    banner.stylize("bold red", 0, 6)
    rule = "_" * 10
    cprint(rule, banner, rule)


Mount Google Drive

In [None]:
# Where Drive is (or will be) mounted; the install cell builds on this path too.
mnt_point = Path(f"/content/{GDRIVE_MOUNT_POINT}")
# Mount only in Colab, only when requested, and only if the mount point does not exist yet.
if FC.IN_COLAB and MOUNT_DRIVE and not mnt_point.exists():
    info("Mounting Google Drive")
    from google.colab import drive
    drive.mount(str(mnt_point), force_remount=True)


### Install **PanelCleaner**

> We will attempt to use the version of **PanelCleaner** stored in your Google Drive. If it's not available, we'll install it from GitHub.

Note that we specifically require the `testbed` branch of the **PanelCleaner** repository, not the main trunk. This branch contains necessary configurations and experimental features that are crucial for the tests conducted in this notebook.

In [None]:
if FC.IN_COLAB:
    pc_path = mnt_point/PANELCLEANER_IN_GDRIVE
    tb_path = pc_path/'pcleaner/_testbed'
    if tb_path.exists():
        info('Installing PanelCleaner from your Google Drive')
    else:
        info('Installing PanelCleaner from GitHub')
        !git clone -b testbed https://github.com/civvic/PanelCleaner.git
        tb_path = Path('PanelCleaner/pcleaner/_testbed')
    assert tb_path.exists(), "PanelCleaner not found"
    os.chdir(tb_path)
    sys.path.append(f"{pc_path}")
    sys.path.append(f"{tb_path}")
    !pip install -q -r requirements-colab.txt


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

----
# Test ngrok and web server
> OCRExperimentContext with web server for Colab.


# Prologue

In [None]:
import os
from pathlib import Path
from typing import cast

import fastcore.all as FC
import fastcore.xtras  # patch `Path` with some utils (like `ls()` to list folder contents)
import ipywidgets as W
from fastcore.test import *  # type: ignore
from IPython.display import display
from IPython.display import HTML
from PIL import Image
from rich.console import Console


In [None]:
import testbed.web_server as web_server
from testbed.experiments import CropMethod
from testbed.experiments import ExperimentsVisor
from testbed.experiments import ExperimentContext
from testbed.experiments import OCRExperimentContext


# Helpers

In [None]:
# pretty print by default
# %load_ext rich

In [None]:
# Fixed-width rich console forced into Jupyter rendering mode.
console = Console(
    width=104,
    tab_size=4,
    force_jupyter=True,
)
# From here on `cprint` writes through this console (rebinding the name imported earlier).
cprint = console.print

----
# EXP_DIR

In [None]:
# In Colab the experiment folder is the relative path 'experiment'; elsewhere
# the value comes from `ExperimentContext.EXP_DIR`.
EXP_DIR = 'experiment' if FC.IN_COLAB else ExperimentContext.EXP_DIR
# Fail early, and say which directory was expected and where it was looked up.
assert Path(EXP_DIR).exists(), f"Experiment directory not found: {Path(EXP_DIR).resolve()}"
cprint(Path(EXP_DIR).resolve())


## USE_PIL

The experiments can generate hundreds of images, and maintaining the **PIL** images in memory is not efficient. All the generated images are cached and visualized on demand through a URL pointing to the local cache. This approach prevents the kernel from being overloaded with **PIL** images, with the front-end responsible for fetching the image and the backend web server (not the kernel) for serving the image in another process. This method is quick and efficient. As an added bonus, the saved notebook remains lean and fit; it doesn't store the Base64 versions of all the output cell images.

Unfortunately, this approach does not work as is in **Colab**. Google Colab runs on an older Ubuntu 18.04 VM, so all the usual networking challenges with Docker, or whatever VMs Google is using, apply. Google also goes to great lengths to avoid exposing its internal architecture. We have two options:
- Let the Jupyter kernel serve the images itself, which is slow and memory-consuming.
- Use a tunnel to map localhost (server) to whatever IP and port the front-end (the browser you're currently using) is running on. We can use **ngrok** for this, but *ngrok* is a commercial service that has been abused and now requires confirmation the first time the tunnel connects, which can be inconvenient for the user. It also requires the user to open a free account and obtain an auth token.

You choose.

If the notebook is running in Colab and ngrok has been successfully installed and the tunnel has been created, the default setting is USE_PIL=False. You can set the environment variable USE_PIL=True to force the use of PIL images, but note that in certain circumstances, Colab will complain because the free tiers are usually memory constrained.


In [None]:
# Defaults: no forced PIL images, no tunnel, and no web server started yet.
os.environ.update({'USE_PIL': 'False', 'USE_TUNNEL': 'False'})
SERVER = None

# Setup ngrok (Colab)
> Mostly for Colab, but can be forced for local development by setting `os.environ['USE_TUNNEL'] = 'True'`.


In [None]:
# Request the tunnel; the setup cell below only acts on it when USE_PIL is 'false' as well.
os.environ['USE_TUNNEL'] = 'True'


In [None]:
# Both flags are compared case-insensitively; USE_TUNNEL is only read when
# USE_PIL is 'false'.
if os.environ['USE_PIL'].lower() == 'false':
    if os.environ['USE_TUNNEL'].lower() == 'true':
        # Hand the server class and the experiment directory to `setup_ngrok`.
        SERVER = web_server.setup_ngrok(web_server.WebServerBottle, Path(EXP_DIR))


··········


Bottle v0.13-dev server starting up (using WSGIRefServer())...
Listening on http://localhost:42841/
Hit Ctrl-C to quit.



In [None]:
# Shared experiment context for the rest of the notebook; `SERVER` may be None.
CONTEXT = OCRExperimentContext(
    'Tesseract',
    EXP_DIR,
    server=SERVER,
    load=True,
)
CONTEXT.show()


No config file found, creating a new one.
Downloading OpenCV model for CPU...


100%|██████████| 94.7M/94.7M [00:01<00:00, 86.3MB/s]
127.0.0.1 - - [31/May/2024 11:54:52] "GET /images/pcleaner.png HTTP/1.1" 404 760


Current Configuration:

Locale: System default
Default Profile: Built-in
Saved Profiles:
(No saved profiles)

Profile Editor: System default
Cache Directory: System default
Default Torch Model Path: Not downloaded
Default CV2 Model Path: /root/.cache/pcleaner/model/comictextdetector.pt.onnx
GUI Theme: System default

--------------------

Config file located at: /root/.config/pcleaner/pcleanerrc
System default cache directory: /root/.cache/pcleaner


In [None]:
# Zero-padded index and file name of every image known to the context.
[f"{idx:02}: {img_path.name}" for idx, img_path in enumerate(CONTEXT.image_paths)]

['00: Action_Comics_1960-01-00_(262).JPG',
 '01: Adolf_Cap_01_008.jpg',
 '02: Barnaby_v1-028.png',
 '03: Barnaby_v1-029.png',
 '04: Buck_Danny_-_12_-_Avions_Sans_Pilotes_-_013.jpg',
 '05: Cannon-292.jpg',
 '06: Contrato_con_Dios_028.jpg',
 '07: Erase_una_vez_en_Francia_02_88.jpg',
 '08: FOX_CHILLINTALES_T17_012.jpg',
 '09: Furari_-_Jiro_Taniguchi_selma_056.jpg',
 '10: Galactus_12.jpg',
 '11: INOUE_KYOUMEN_002.png',
 '12: MCCALL_ROBINHOOD_T31_010.jpg',
 '13: MCCAY_LITTLENEMO_090.jpg',
 '14: Mary_Perkins_On_Stage_v2006_1_-_P00068.jpg',
 '15: PIKE_BOYLOVEGIRLS_T41_012.jpg',
 '16: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1.png',
 '17: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1_K.png',
 '18: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_2.png',
 '19: Spirou_Et_Fantasio_Integrale_06_1958_1959_0025_0024.jpg',
 '20: Strange_Tales_172005.jpg',
 '21: Strange_Tales_172021.jpg',
 '22: Tarzan_014-21.JPG',
 '23: Tintin_21_Les_Bijoux_de_la_Castafiore_page_39.jp

In [None]:
context: OCRExperimentContext
# Smoke test: only runs when a server was started.
if SERVER is not None:
    assert SERVER.unc_share is not None
    assert SERVER.running
    context = OCRExperimentContext('Tesseract', EXP_DIR, server=SERVER)
    # NOTE(review): `result` is not used below; presumably this call produces the
    # cached crop referenced by `img_path` — confirm.
    result = context.result('Tesseract-crop-post', 20, 0, CropMethod.DEFAULT, False)
    public_url: Path = SERVER.unc_share
    img_path = 'cache/Strange_Tales_172005/_crop/Strange_Tales_172005_0_Default.png'
    # Build the tag once: print it as text, then render it.
    img_tag = f'<img src="{public_url}/{img_path}"/>'
    cprint(img_tag)
    display(HTML(img_tag))


We'll use this little helper function throughout the experiments to get the local path, a server URL, or a PIL image.


In [None]:
# Remember the current flags so they can be restored after the checks.
_BCK = CONTEXT.force_PIL, CONTEXT.use_tunnel
DI = CONTEXT.display_image

p = CONTEXT.path_from_idx('Adolf_Cap_01_008.jpg')
cprint(f"{p=}")


def _check_display_image(force_PIL: bool, use_tunnel: bool) -> None:
    """Set both flags on `CONTEXT`, check what `DI(p)` returns, and print it.

    Expected result per flag combination:
    - `force_PIL` set: a `PIL.Image.Image`, regardless of `use_tunnel`;
    - tunnel only: `SERVER.unc_share / p` (requires a running server);
    - neither: `CONTEXT.final(p)`.

    Failures are reported with `cprint` instead of being raised, so the
    remaining combinations still run (e.g. when no server is available).
    """
    CONTEXT.force_PIL, CONTEXT.use_tunnel = force_PIL, use_tunnel
    try:
        if force_PIL:
            test_eq(isinstance(DI(p), Image.Image), True)
            shown = type(DI(p))
        elif use_tunnel:
            assert SERVER and SERVER.unc_share
            test_eq(DI(p), SERVER.unc_share/p)
            shown = repr(DI(p))
        else:
            test_eq(DI(p), CONTEXT.final(p))
            shown = repr(DI(p))
        cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel} => DI: {shown}")
    except Exception as e: cprint('Error:', e)


try:
    # Same order as (force_PIL, use_tunnel): FF, FT, TF, TT.
    for _flags in [(False, False), (False, True), (True, False), (True, True)]:
        _check_display_image(*_flags)
finally:
    # Put the original flags back even if the checks are interrupted.
    CONTEXT.force_PIL, CONTEXT.use_tunnel = _BCK


In [None]:
cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel}")

result_img = CONTEXT.cache_dir / 'Strange_Tales_172005/_crop/Strange_Tales_172005_0_Default.png'
cprint(f"{result_img=}")

result_final_img = CONTEXT.final(result_img)
cprint(f"{result_final_img = }")

display_image = CONTEXT.DI(result_img)
if isinstance(display_image, Image.Image):
    # A PIL image cannot serve as an <img> `src` (its repr would be embedded);
    # print its type and let Jupyter render the image object directly.
    cprint(f"{type(display_image) = }")
    display(display_image)
else:
    # Path or URL: print it and reference it from an <img> tag.
    cprint(f"{display_image = }")
    display(HTML(f'<img src="{display_image}"/>'))


# EEAaO


In [None]:
# Build the experiments visor for one page and show it.
tesseract_experiment = ExperimentsVisor(
    CONTEXT,
    image_idx='Strange_Tales_172005.jpg',
)
tesseract_experiment.display()


VBox(children=(HTML(value="<style id='stl-139619740843744'>\n    .wrapper-spinner {\n        overflow: hidden;…

# -
----


In [None]:
# Epilogue: stop the web server (if one was started) and clear the tunnel flag.
if SERVER is not None:
    SERVER.stop()
    SERVER = None
    os.environ.update(USE_TUNNEL='False')




Thread did not terminate, proceeding with forceful shutdown.
