# install (Colab)

In [1]:
# try: 
#     import fastcore as FC
# except ImportError: 
#     !pip install -q fastcore
# try:
#     import rich
# except ImportError:
#     !pip install -q rich


In [2]:
# !pip install -q git+https://github.com/civvic/PanelCleaner.git@testbed

# Testing `Tesseract` OCR for Comics
> Accuracy Enhancements for OCR in `PanelCleaner`


# Prologue

In [3]:
from __future__ import annotations

from pathlib import Path
from typing import cast

import pcleaner.config as cfg
import torch
from rich.console import Console


In [5]:
from experiments import *
from helpers import *
from ocr_metric import *


In [5]:
import fastcore.xtras  # patch Path with some utils
from fastcore.test import *  # type: ignore


# Helpers

In [6]:
# pretty print by default
# %load_ext rich

In [7]:
#| exporti
# Notebook-wide rich console: 104 columns wide, 4-space tabs, Jupyter rendering forced on.
# `cprint` is the short alias used for printing in the cells below.
console = Console(force_jupyter=True, tab_size=4, width=104)
cprint = console.print


## Tesseract installation

In [8]:
# Capture the `tesseract --version` banner as a list of output lines and display it
# (also serves as a quick check that the Tesseract CLI is reachable from the kernel).
out = !tesseract --version
out


['tesseract 5.3.4',
 ' leptonica-1.84.1',
 '  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 3.0.0) : libpng 1.6.43 : libtiff 4.6.0 : zlib 1.2.11 : libwebp 1.4.0 : libopenjp2 2.5.2',
 ' Found NEON',
 ' Found libarchive 3.7.2 zlib/1.2.11 liblzma/5.4.6 bz2lib/1.0.8 liblz4/1.9.4 libzstd/1.5.6',
 ' Found libcurl/8.4.0 SecureTransport (LibreSSL/3.3.6) zlib/1.2.11 nghttp2/1.51.0']

### Install the `jpn_vert` Tesseract language



```bash
cd model
# A relative symlink target is resolved from the link's own directory, not from the
# current directory, so pass an absolute path to avoid creating a dangling link.
sudo ln -s "$(pwd)/jpn_vert_tessdata_best.traineddata" /usr/share/tesseract-ocr/5/tessdata/jpn_vert.traineddata
```

In [9]:
out = !tesseract --list-langs
tessdata = Path(out[0].split('"')[1])
tessdata, [', '.join(sub) for sub in [out[i:i + 15] for i in range(1, len(out), 15)]]


(Path('/opt/homebrew/share/tessdata'),
 ['afr, amh, ara, asm, aze, aze_cyrl, bel, ben, bod, bos, bre, bul, cat, ceb, ces',
  'chi_sim, chi_sim_vert, chi_tra, chi_tra_vert, chr, cos, cym, dan, deu, div, dzo, ell, eng, enm, epo',
  'equ, est, eus, fao, fas, fil, fin, fra, frk, frm, fry, gla, gle, glg, grc',
  'guj, hat, heb, hin, hrv, hun, hye, iku, ind, isl, ita, ita_old, jav, jpn, jpn_vert',
  'kan, kat, kat_old, kaz, khm, kir, kmr, kor, kor_vert, lao, lat, lav, lit, ltz, mal',
  'mar, mkd, mlt, mon, mri, msa, mya, nep, nld, nor, oci, ori, osd, pan, pol',
  'por, pus, que, ron, rus, san, script/Arabic, script/Armenian, script/Bengali, script/Canadian_Aboriginal, script/Cherokee, script/Cyrillic, script/Devanagari, script/Ethiopic, script/Fraktur',
  'script/Georgian, script/Greek, script/Gujarati, script/Gurmukhi, script/HanS, script/HanS_vert, script/HanT, script/HanT_vert, script/Hangul, script/Hangul_vert, script/Hebrew, script/Japanese, script/Japanese_vert, script/Kannada, script/

In [10]:
# List the tessdata directory (`ls` is presumably one of the Path helpers patched in by fastcore.xtras)
# and print the resolved paths of entries whose name contains 'eng', followed by those containing 'jpn'.
langs = tessdata.ls()
cprint([entry.resolve() for tag in ('eng', 'jpn') for entry in langs if tag in entry.name])



----
# Tesseract experiments

# PanelCleaner Configuration
> Adapt the current `PanelCleaner` `Config` to this notebook.


In [11]:
# Load the PanelCleaner configuration and point its cache at the current directory.
config = cfg.load_config()
config.cache_dir = Path(".")

# Read the cleaner cache directory back from the config (after overriding `cache_dir` above).
cache_dir = config.get_cleaner_cache_dir()

profile = config.current_profile
preprocessor_conf = profile.preprocessor
# Modify the profile to OCR all boxes.
# Make sure OCR is enabled.
preprocessor_conf.ocr_enabled = True
# Make the max size effectively infinite (10**10), so no boxes are skipped in the OCR process.
preprocessor_conf.ocr_max_size = 10**10
# Make the suspicious-box min size effectively infinite (10**10), so all boxes with "unknown" language are skipped.
preprocessor_conf.suspicious_box_min_size = 10**10
# Set the OCR blacklist pattern to match everything, so all text gets reported in the analytics.
preprocessor_conf.ocr_blacklist_pattern = ".*"


# Test images
> `IMAGE_PATHS` is a list of image file paths that are used as input for testing the OCR methods.

In [12]:
# Directory holding the test images.
media_path = Path("media/")

# Every regular file directly under `media_path` whose extension (compared
# case-insensitively) is .jpg, .png or .jpeg, in sorted order.
IMAGE_PATHS = sorted(
    path
    for path in media_path.glob("*")
    if path.is_file() and path.suffix.lower() in (".jpg", ".png", ".jpeg")
)

# Display an index -> file name listing of the collected images.
[f"{idx:02}: {path.name}" for idx, path in enumerate(IMAGE_PATHS)]


['00: Action_Comics_1960-01-00_(262).JPG',
 '01: Adolf_Cap_01_008.jpg',
 '02: Barnaby_v1-028.png',
 '03: Barnaby_v1-029.png',
 '04: Buck_Danny_-_12_-_Avions_Sans_Pilotes_-_013.jpg',
 '05: Cannon-292.jpg',
 '06: Contrato_con_Dios_028.jpg',
 '07: Erase_una_vez_en_Francia_02_88.jpg',
 '08: FOX_CHILLINTALES_T17_012.jpg',
 '09: Furari_-_Jiro_Taniguchi_selma_056.jpg',
 '10: Galactus_12.jpg',
 '11: INOUE_KYOUMEN_002.png',
 '12: MCCALL_ROBINHOOD_T31_010.jpg',
 '13: MCCAY_LITTLENEMO_090.jpg',
 '14: Mary_Perkins_On_Stage_v2006_1_-_P00068.jpg',
 '15: PIKE_BOYLOVEGIRLS_T41_012.jpg',
 '16: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1.png',
 '17: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_1_K.png',
 '18: Sal_Buscema_Spaceknights_&_Superheroes_Ocular_Edition_1_2.png',
 '19: Spirou_Et_Fantasio_Integrale_06_1958_1959_0025_0024.jpg',
 '20: Strange_Tales_172005.jpg',
 '21: Strange_Tales_172021.jpg',
 '22: Tarzan_014-21.JPG',
 '23: Tintin_21_Les_Bijoux_de_la_Castafiore_page_39.jp

# CONTEXT
> `CONTEXT` is an `OCRExperimentContext` object that contains the configuration and the list of image paths.


You can get the configuration with `OCRExperimentContext.get_config()`.


In [13]:
# Experiment context over the test images (the first argument is passed as None).
CONTEXT = OCRExperimentContext(None, IMAGE_PATHS)

# Ask the config for the model path, telling it whether a CUDA or MPS backend is available.
gpu = torch.cuda.is_available() or torch.backends.mps.is_available()
model_path = CONTEXT.config.get_model_path(gpu)

# A `.pt` model path selects an accelerator device name ("mps" when available,
# otherwise "cuda"); any other suffix selects "cpu".
if model_path.suffix != ".pt":
    DEVICE = "cpu"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cuda"

# Show the context's configuration, followed by the values derived in this notebook.
CONTEXT.config.show()
cprint(
    f"{'cache_dir':>15}: {repr(cache_dir)}\n"
    f"{'model_path':>15}: {repr(model_path)}\n"
    f"{'device':>15}: {repr(DEVICE)}")


Current Configuration:

Locale: System default
Default Profile: Built-in
Saved Profiles:
- victess: /Users/vic/dev/repo/DL-mac/cleaned/victess.conf
- vicmang: /Users/vic/dev/repo/DL-mac/cleaned/vicmang.conf

Profile Editor: cursor
Cache Directory: .
Default Torch Model Path: /Users/vic/Library/Caches/pcleaner/model/comictextdetector.pt
Default CV2 Model Path: /Users/vic/Library/Caches/pcleaner/model/comictextdetector.pt.onnx
GUI Theme: System default

--------------------

Config file located at: /Users/vic/Library/Application Support/pcleaner/pcleanerconfig.ini
System default cache directory: /Users/vic/Library/Caches/pcleaner


# Base image
> Change `BASE_IMAGE_IDX` to select a different base image to use in the examples below.

In [14]:
# Resolve the base image through the context; the commented-out lines below are
# alternative selections that can be swapped in.
BASE_IMAGE_IDX: ImgIdT = cast(ImgIdT, CONTEXT.normalize_idx("Strange_Tales_172005.jpg"))
# BASE_IMAGE_IDX = CONTEXT.normalize_idx("0033")
# BASE_IMAGE_IDX = CONTEXT.normalize_idx("INOUE_KYOUMEN_002")
# BASE_IMAGE_IDX = CONTEXT.normalize_idx("Action_Comics_1960-01-00_(262)")

# Fail early if no index was returned or the selected file is missing on disk.
assert BASE_IMAGE_IDX is not None
img_path = Path(CONTEXT.image_paths[BASE_IMAGE_IDX])
assert img_path.exists()


# Empty cache
> Clear the image cache, which is used extensively throughout the examples below.


You will be warned before the cache is emptied.

In [15]:
# CONTEXT.empty_cache_warn()

In [16]:
# CONTEXT.empty_cache_warn(BASE_IMAGE_IDX)

# Base image
> Change `BASE_IMAGE_IDX` to select a different base image to use in the examples below.


In [17]:
# Select the base image for the examples below.
BASE_IMAGE_IDX: ImgIdT = cast(ImgIdT, CONTEXT.normalize_idx("Strange_Tales_172005.jpg"))
# `cast` performs no runtime check, so guard against a missing index explicitly
# (same guard as the earlier base-image cell) before asking the context for the path.
assert BASE_IMAGE_IDX is not None, "normalize_idx returned None for the base image"
assert CONTEXT.path_from_idx(BASE_IMAGE_IDX).exists()


# Visualize images


In [18]:
# Build the image viewer for CONTEXT, starting on the base image;
# the trailing expression displays the widget as the cell output.
img_visor = ImageContextVisor(CONTEXT, BASE_IMAGE_IDX)
img_visor


Output(layout=Layout(height='0px'))

HBox(children=(HBox(children=(Dropdown(index=20, layout=Layout(width='fit-content'), options={'Action_Comics_1…

Output()

# Tesseract experiments


In [19]:
# tesseract_experiment = ExperimentsVisor(CONTEXT)
tesseract_experiment = ExperimentsVisor(CONTEXT, BASE_IMAGE_IDX)

# Check the visor's initial widget state. The expected image index comes from
# BASE_IMAGE_IDX instead of a hard-coded number, so the check keeps passing when a
# different base image is selected or the contents of the media folder change.
test_eq(tesseract_experiment.all_values, {
    'image_selector': {'image_idx': BASE_IMAGE_IDX},
    'content_selector': {'display_option': DisplayOptions.RESULTS},
    'result_visor': {
        'all_boxes': False,
        'box_idx': 0,
        'all_methods': False,
        'method': CropMethod.INITIAL_BOX,
    },
    'model_selector': {'model': OCRModel.TESSERACT},
    'self': {}
})

# Display the experiment widget as the cell output.
tesseract_experiment


HBox(children=(HBox(children=(HBox(children=(Dropdown(layout=Layout(width='fit-content'), options={'Tesseract'…

Output()