In [2]:
#| hide
# %reload_ext autoreload
# %autoreload 0


# install (Colab)

In [3]:
# try: 
#     import fastcore as FC
# except ImportError: 
#     !pip install -q fastcore
# try:
#     import rich
# except ImportError:
#     !pip install -q rich


In [4]:
# !pip uninstall Pillow

In [5]:
# !pip install "pillow<10.1.0,>=8.3.2"

> **Note: we're using the `testbed` branch of PanelCleaner.**

In [6]:
# !pip install -q git+https://github.com/civvic/PanelCleaner.git@testbed-colab

# Test ngrok and web server
> OCRExperimentContext with a web server for Colab.


# Prologue

In [7]:
import os
from pathlib import Path
from typing import cast

import fastcore.all as FC
import fastcore.xtras  # patch `Path` with some utils (like `ls()` to list folder contents)
import ipywidgets as W
from fastcore.test import *  # type: ignore
from IPython.display import display
from IPython.display import HTML
from PIL import Image
from rich.console import Console

import pcleaner._testbed.testbed.web_server as web_server
from pcleaner._testbed.testbed.experiments import CropMethod
from pcleaner._testbed.testbed.experiments import OCRExperimentContext
from pcleaner._testbed.testbed.experiments import ExperimentsVisor



# Helpers

In [7]:
# pretty print by default
# %load_ext rich

In [8]:
# One shared rich console for the whole notebook: 104 columns wide, tabs
# rendered as 4 spaces, Jupyter rendering forced on.
console = Console(
    force_jupyter=True,
    tab_size=4,
    width=104,
)
# Short alias used by the cells below in place of `print`.
cprint = console.print

## USE_PIL

The experiments can generate hundreds of images, and maintaining the **PIL** images in memory is not efficient. All the generated images are cached and visualized on demand through a URL pointing to the local cache. This approach prevents the kernel from being overloaded with **PIL** images, with the front-end responsible for fetching the image and the backend web server (not the kernel) for serving the image in another process. This method is quick and efficient. As an added bonus, the saved notebook remains lean and fit; it doesn't store the Base64 versions of all the output cell images.

Unfortunately, this approach does not work as is in **Colab**. Google Colab runs on an older Ubuntu 18.04 VM, so all the usual networking challenges with Docker, or whatever VMs Google is using, apply. Google also goes to great lengths to avoid exposing its internal architecture. We have two options:
- Let the Jupyter kernel serve the images itself, which is slow and memory-consuming.
- Use a tunnel to map localhost (server) to whatever IP and port the front-end (the browser you're currently using) is running on. We can use **ngrok** for this, but *ngrok* is a commercial service that has been abused and now requires confirmation the first time the tunnel connects, which can be inconvenient for the user. It also requires the user to open a free account and obtain an auth token.

You choose.

If the notebook is running in Colab and ngrok has been successfully installed and the tunnel has been created, the default setting is USE_PIL=False. You can set the environment variable USE_PIL=True to force the use of PIL images, but note that in certain circumstances, Colab will complain because the free tiers are usually memory constrained.


In [9]:
# Start from a known baseline: both switches are the string 'False' (they are
# compared case-insensitively as strings later on) and no server exists yet.
os.environ.update({'USE_PIL': 'False', 'USE_TUNNEL': 'False'})
SERVER = None

----
# EXP_DIR

In [10]:
# Experiments directory as declared on the OCRExperimentContext class; it is
# later handed to the web server setup. The bare name echoes the value as the
# cell output.
EXP_DIR = OCRExperimentContext.EXP_DIR
EXP_DIR


Path('../experiment')

# Setup ngrok (Colab)
> Mostly for Colab, but can be forced for local development by setting `os.environ['USE_TUNNEL'] = 'True'`.


In [11]:
# Turn the tunnel on: the server setup cell only runs when USE_TUNNEL is 'true'
# (case-insensitive) and USE_PIL is 'false'.
os.environ['USE_TUNNEL'] = 'True'


In [12]:
# Start the tunnelled web server only when PIL mode is off *and* the tunnel is
# requested; both flags are compared as lower-cased strings.
if os.environ['USE_PIL'].lower() == 'false':
    if os.environ['USE_TUNNEL'].lower() == 'true':
        # Serve the experiments directory through the Bottle-based server class.
        SERVER = web_server.setup_ngrok(web_server.WebServerBottle, Path(EXP_DIR))


Bottle v0.13-dev server starting up (using WSGIRefServer())...
Listening on http://localhost:56636/
Hit Ctrl-C to quit.



127.0.0.1 - - [21/May/2024 15:25:42] "GET /images/pcleaner.png HTTP/1.1" 200 17709
127.0.0.1 - - [21/May/2024 15:25:51] "GET /images/cache/Strange_Tales_172005/.crop/Strange_Tales_172005_0_Default.png HTTP/1.1" 200 137784
127.0.0.1 - - [21/May/2024 15:26:06] "GET /images/cache/Strange_Tales_172005/.crop/Strange_Tales_172005_0_Initial%20box.png HTTP/1.1" 200 132011
127.0.0.1 - - [21/May/2024 15:26:30] "GET /images/cache/Strange_Tales_172005/.crop/Strange_Tales_172005_1_Initial%20box.png HTTP/1.1" 200 102223
127.0.0.1 - - [21/May/2024 15:26:33] "GET /images/cache/Strange_Tales_172005/.crop/Strange_Tales_172005_2_Initial%20box.png HTTP/1.1" 200 41398
127.0.0.1 - - [21/May/2024 15:26:47] "GET /images/cache/Strange_Tales_172005/.crop/Strange_Tales_172005_3_Initial%20box.png HTTP/1.1" 200 21597
t=2024-05-21T15:26:47+0200 lvl=warn msg="failed to open private leg" id=17810d8962eb privaddr=localhost:56636 err="dial tcp 127.0.0.1:56636: connect: connection reset by peer"
127.0.0.1 - - [21/May/20

In [13]:
# Create the 'Tesseract' experiment context, passing along whatever server was
# set up earlier (SERVER may still be None).
CONTEXT = OCRExperimentContext('Tesseract', server=SERVER)
# Echo the three attributes that decide how images get displayed.
cprint(f"{CONTEXT.force_PIL=}, {CONTEXT.use_tunnel=}, {CONTEXT.server=}")



In [14]:
context: OCRExperimentContext
if SERVER is not None:
    # The server must expose a share URL and report itself as running before
    # we try to fetch anything through it.
    assert SERVER.unc_share is not None
    assert SERVER.running
    context = OCRExperimentContext('Tesseract')
    result = context.result('Tesseract-crop-post', 20, 0, CropMethod.DEFAULT, False)
    public_url: Path = SERVER.unc_share
    img_path = 'cache/Strange_Tales_172005/.crop/Strange_Tales_172005_0_Default.png'
    # Build the tag once; print it for inspection, then let the front-end render it.
    img_tag = f'<img src="{public_url}/{img_path}"/>'
    cprint(img_tag)
    display(HTML(img_tag))


We'll use this little helper function throughout the experiments to get the local path, a server URL, or a PIL image.


In [15]:
# Exercise `CONTEXT.display_image` under every (force_PIL, use_tunnel) combination.
#
# Expectations encoded by the checks below:
#   force_PIL=False, use_tunnel=False -> same value as CONTEXT.final(p)
#   force_PIL=False, use_tunnel=True  -> SERVER.unc_share / p
#   force_PIL=True  (either tunnel)   -> a PIL Image
#
# Each check is best-effort: a failure is printed and the next combination still runs.

# Remember the current flags so they can be put back afterwards.
_BCK = CONTEXT.force_PIL, CONTEXT.use_tunnel
DI = CONTEXT.display_image

p = CONTEXT.path_from_idx('Adolf_Cap_01_008.jpg')
cprint(f"{p=}")

# The checks mutate flags on the shared CONTEXT. Restoring them in `finally`
# guarantees later cells see the original settings even if this cell is
# interrupted (e.g. KeyboardInterrupt is not caught by `except Exception`).
try:
    CONTEXT.force_PIL, CONTEXT.use_tunnel = False, False
    try:
        test_eq(DI(p), CONTEXT.final(p))
        cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel} => DI: {DI(p)!r}")
    except Exception as e: cprint('Error:', e)

    CONTEXT.force_PIL, CONTEXT.use_tunnel = False, True
    try:
        # Without a message the failure would print as a bare "Error:".
        assert SERVER and SERVER.unc_share, "no tunnel server available (SERVER or SERVER.unc_share is unset)"
        prefix = SERVER.unc_share
        test_eq(DI(p), prefix/p)
        cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel} => DI: {DI(p)!r}")
    except Exception as e: cprint('Error:', e)

    CONTEXT.force_PIL, CONTEXT.use_tunnel = True, False
    try:
        test_eq(isinstance(DI(p), Image.Image), True)
        cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel} => DI: {type(DI(p))}")
    except Exception as e: cprint('Error:', e)

    CONTEXT.force_PIL, CONTEXT.use_tunnel = True, True
    try:
        test_eq(isinstance(DI(p), Image.Image), True)
        cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel} => DI: {type(DI(p))}")
    except Exception as e: cprint('Error:', e)
finally:
    CONTEXT.force_PIL, CONTEXT.use_tunnel = _BCK


In [16]:
# Show one cached crop using whatever display mode CONTEXT is currently in.
cprint(f"force_PIL: {CONTEXT.force_PIL}, use_tunnel: {CONTEXT.use_tunnel}")

result_img = CONTEXT.cache_dir / 'Strange_Tales_172005/.crop/Strange_Tales_172005_0_Default.png'
cprint(f"{result_img=}")

result_final_img = CONTEXT.final(result_img)
cprint(f"{result_final_img = }")

# NOTE(review): `CONTEXT.DI` is presumably an alias of `CONTEXT.display_image`,
# i.e. it yields a PIL image when force_PIL is set and a path/URL otherwise — confirm.
display_image = CONTEXT.DI(result_img)
if CONTEXT.force_PIL:
    cprint(f"{type(display_image) = }")
    # An image object cannot be used as an <img src>; let IPython render it directly.
    display(display_image)
else:
    cprint(f"{display_image = }")
    display(HTML(f'<img src="{display_image}"/>'))


# EEAaO


## Visualize all

In [17]:
# Build the experiments visor for a single page, selected by file name, on top
# of the shared CONTEXT, then render it (the captured output is an ipywidgets VBox).
tesseract_experiment = ExperimentsVisor(CONTEXT, image_idx='Strange_Tales_172005.jpg')
tesseract_experiment.display()


VBox(children=(HBox(children=(HBox(children=(HBox(children=(Dropdown(layout=Layout(width='fit-content'), optio…

Output()

# -
----


In [18]:
# Tear down the web server, but only if one was actually started.
if SERVER is not None:
    SERVER.stop()
    # Drop the handle so `SERVER is not None` guards skip from here on.
    SERVER = None
    # Put the tunnel flag back to the value it was given at the top of the notebook.
    os.environ['USE_TUNNEL'] = 'False'


Thread did not terminate, proceeding with forceful shutdown.
