In [5]:
from IPython.display import JSON
from pathlib import PosixPath
from tqdm import tqdm_notebook

import PIL.ImageDraw
from fastai.vision import *

# Folder holding the source page images (*.png), their binarized versions
# (*.bin.png) and the region annotations (*.json).
INPUT = PosixPath("../0bags-crop-sauvola/")
# Output folders for the downscaled page images and the matching label masks.
IMGS_SCALED = PosixPath("./scaled")
MASKS_SCALED = PosixPath("./masks_scaled/")
# Integer factor by which image width/height and region coordinates are divided.
SCALE_FACTOR = 4

# Make sure both output folders exist before anything is written into them.
for out_dir in (IMGS_SCALED, MASKS_SCALED):
    out_dir.mkdir(parents=True, exist_ok=True)

## Look at the input files

In [6]:
# Peek at the first five entries of the input folder to see which file kinds it contains.
INPUT.ls()[:5]

[PosixPath('../0bags-crop-sauvola/fontane_brandenburg02_1863_OCR-D-IMG-CROP2_0019.json'),
 PosixPath('../0bags-crop-sauvola/ranke_paepste03_1836_OCR-D-IMG-CROP2_0018.png'),
 PosixPath('../0bags-crop-sauvola/ruempler_gartenbau_1882_OCR-D-IMG-CROP2_0014.json'),
 PosixPath('../0bags-crop-sauvola/steinen_naturvoelker_1894_OCR-D-IMG-CROP2_0005.png'),
 PosixPath('../0bags-crop-sauvola/goerres_volksbuecher_1807_OCR-D-IMG-CROP2_0012.json')]

In [7]:
# List the input folder once and classify the entries by suffix.  The stem
# pattern `[^.]+` excludes dots, so "*.bin.png" never matches the plain
# "*.png" pattern.
input_entries = [f.relative_to(INPUT) for f in INPUT.ls()]
imgfiles = sorted(f for f in input_entries if re.match(r'[^.]+\.png$', str(f)))
binfiles = sorted(f for f in input_entries if re.match(r'[^.]+\.bin\.png$', str(f)))
annfiles = sorted(f for f in input_entries if re.match(r'[^.]+\.json$', str(f)))
assert len(imgfiles) == len(binfiles)
assert len(imgfiles) == len(annfiles)

# Equal counts alone do not guarantee that position i of each list refers to
# the same page; the DataFrame below pairs the lists by position, so verify
# that the stems line up as well.
img_stems = [str(f)[:-len(".png")] for f in imgfiles]
assert img_stems == [str(f)[:-len(".bin.png")] for f in binfiles], "image/binarized file names do not align"
assert img_stems == [str(f)[:-len(".json")] for f in annfiles], "image/annotation file names do not align"

pd.set_option('max_colwidth', 80)
# One row per page: original image, binarized image and annotation file name.
df = pd.DataFrame({'img': imgfiles, 'bin': binfiles, 'ann': annfiles})
df.head()

Unnamed: 0,img,bin,ann
0,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.json
1,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.json
2,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.json
3,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.bin.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.json
4,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.bin.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.json


In [8]:
def _load_annotation(ann_name):
    """Parse the annotation file `ann_name` (relative to INPUT) and return its JSON content."""
    # The context manager closes the file right away instead of leaving one
    # open handle per row for the garbage collector.
    with open(INPUT / ann_name, "r") as ann_fh:
        return json.load(ann_fh)

# Add the parsed annotation of every page as a new column.
df["ann_json"] = df.ann.apply(_load_annotation)
df.head()

Unnamed: 0,img,bin,ann,ann_json
0,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.json,"{'angle': None, 'regions': [{'coords': [[890, 245], [890, 62], [504, 62], [5..."
1,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0003.json,"{'angle': None, 'regions': [{'coords': [[1110, 1661], [1110, 1223], [566, 12..."
2,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.bin.png,arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0004.json,"{'angle': None, 'regions': [{'coords': [[1065, 1116], [1065, 922], [319, 922..."
3,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.bin.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0001.json,"{'regions': [{'type': 'text', 'coords': [[574, 126], [574, 52], [449, 52], [..."
4,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.bin.png,arnim_wunderhorn03_1808_OCR-D-IMG-CROP2_0002.json,"{'regions': [{'type': 'graphic', 'coords': [[1282, 2253], [1282, 11], [0, 11..."


In [9]:
# Show the full annotation of the first page as an example of the JSON layout.
# The file is opened in a `with` block so the handle is closed deterministically.
with open(INPUT / annfiles[0], "r") as ann_fh:
    ann_json_example = json.load(ann_fh)
ann_json_example

{'angle': None,
 'regions': [{'coords': [[890, 245], [890, 62], [504, 62], [504, 245]],
   'type': 'graphic'},
  {'coords': [[1397, 755], [1397, 298], [104, 298], [104, 755]],
   'type': 'text'},
  {'coords': [[1391, 1288], [1391, 801], [94, 801], [94, 1288]],
   'type': 'text'},
  {'coords': [[362, 1528], [362, 1775], [1036, 1775], [1036, 1528]],
   'type': 'text'},
  {'coords': [[1184, 2281], [1184, 2031], [274, 2031], [274, 2281]],
   'type': 'text'},
  {'coords': [[870, 2454], [870, 2291], [640, 2291], [640, 2454]],
   'type': 'noise'},
  {'coords': [[831, 1939], [831, 1845], [584, 1845], [584, 1939]],
   'type': 'noise'},
  {'coords': [[924, 2019], [924, 1958], [434, 1958], [434, 2019]],
   'type': 'separator'},
  {'coords': [[794, 1477], [794, 1404], [618, 1404], [618, 1477]],
   'type': 'separator'}]}

In [10]:
# Collect every region type that occurs in any annotation ...
region_types = {
    region["type"]
    for annotation in df.ann_json
    for region in annotation.get("regions", [])
}
# ... and number them alphabetically starting at 1; code 0 is reserved for
# "void", i.e. pixels that belong to no annotated region.
segtypes = {name: code for code, name in enumerate(sorted(region_types), start=1)}
segtypes["void"] = 0
segtypes

{'graphic': 1,
 'maths': 2,
 'noise': 3,
 'separator': 4,
 'table': 5,
 'text': 6,
 'void': 0}

In [11]:
# Anything accepted where a path is expected: a plain string or a PosixPath.
PathLike = Union[str, PosixPath]

def pathify(p: PathLike) -> PosixPath:
    """Return `p` unchanged if it is exactly a PosixPath, otherwise wrap it in one."""
    if type(p) is PosixPath:
        return p
    return PosixPath(p)

In [12]:
def resize_image_(
    imgpath: PosixPath,
    orig_folder: PathLike = INPUT,
    dest_folder: PathLike = IMGS_SCALED,
    scale = SCALE_FACTOR,
):
    """Write a downscaled copy of one image.

    Opens `orig_folder / imgpath`, requests a size whose last two dimensions
    are integer-divided by `scale` (the first dimension is kept), and saves
    the result to `dest_folder / imgpath`.  Returns nothing.
    """
    img = open_image(orig_folder / imgpath)
    first_dim, second_dim, third_dim = img.shape[0], img.shape[1], img.shape[2]
    # The return value of resize() is not used; this relies on it modifying
    # `img` itself -- presumably fastai's in-place resize, TODO confirm.
    img.resize((first_dim, second_dim // scale, third_dim // scale))
    img.save(dest_folder / imgpath)


In [13]:
# Write a downscaled copy of every page image, using resize_image_'s default
# folders and scale factor; tqdm_notebook shows a progress bar.
for f in tqdm_notebook(imgfiles):
    resize_image_(f)

HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))




In [14]:
def regions(imgfile: PathLike) -> List[Dict]:
    """Return the annotated regions belonging to a page image.

    The annotation file is looked up in INPUT under the image's file name with
    the trailing ".png" replaced by ".json"; only the file name of `imgfile`
    is used, its folder is ignored.

    Returns the list stored under the "regions" key, or an empty list if the
    key is missing.
    """
    imgfile = pathify(imgfile)
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    annfile = INPUT / re.sub(r'\.png$', '.json', imgfile.name)
    # Close the file deterministically instead of leaking the handle.
    with open(annfile, "r") as ann_fh:
        ann = json.load(ann_fh)
    return ann.get("regions", [])

def create_mask_(
    imgfile: PathLike,
    classes: Dict[str, int], 
    img_folder: PathLike = INPUT, 
    mask_folder: PathLike = MASKS_SCALED,
    # border: str = "ArtificialBorder", border_width: int = 5,  # XXX(js): no idea about a good border_width
    downscale: int = SCALE_FACTOR,
):
    """Render the annotated regions of one page into a downscaled label mask.

    imgfile:     page image; only its file name is used.
    classes:     maps a region type to the pixel value drawn for it.
    img_folder:  folder the page image is read from (only to get its size).
    mask_folder: folder the mask is written to, under the image's file name.
    downscale:   integer divisor applied to the image size and all coordinates.

    The mask is a single-channel 8-bit image initialised to 0.  Regions are
    drawn in annotation order, so a later region overwrites an earlier one
    where they overlap.  Note that the regions are loaded via `regions()`,
    which reads the annotation from INPUT regardless of `img_folder`.

    Raises AssertionError if a region lacks "coords"/"type" or if the mask
    contains a value above the largest class code, and KeyError if a region
    type is not in `classes`.
    """
    imgfile = pathify(imgfile)
    # Accept plain strings for the folders too: `str / str` is not defined.
    img_folder = pathify(img_folder)
    mask_folder = pathify(mask_folder)

    # Only the size is needed, so close the image file right after reading it.
    with PIL.Image.open(img_folder / imgfile.name) as page:
        res = page.size
    img = PIL.Image.new(
        mode='L',  # only one 8bit channel (we'll encode the segmentation classes each as one byte with a different nr for each class)
        size=(res[0] // downscale, res[1] // downscale), 
        color=0
    )
    # One drawing context is enough for all regions of this mask.
    draw = PIL.ImageDraw.Draw(img)
    regs = regions(imgfile)
    for r in regs:
        assert "coords" in r
        assert "type" in r
        coords = [(c[0] // downscale, c[1] // downscale) for c in r["coords"]]
        draw.polygon(coords, fill=classes[r["type"]])
        
    # Also draw artificial borders but after the filled polygon to not get hidden by any overlapping stuff
    # for r in regs:
    #    coords = r["coords"]
    #    coords += [coords[0], coords[1]]  # necessary to close the lining (it's not autoclosed like for polygonals)
    #    PIL.ImageDraw.Draw(img).line(coords, fill=classes[border], width=border_width)
    
    # Sanity check: no pixel may carry a value above the largest class code.
    # The array view avoids building a Python list with one entry per pixel.
    assert np.asarray(img).max() <= max(classes.values())
    img.save(mask_folder / imgfile.name)

In [15]:
# Render one label mask per page image, using the class codes in `segtypes`
# and create_mask_'s default folders and scale factor.
for f in tqdm_notebook(imgfiles):
    create_mask_(f, segtypes)

HBox(children=(IntProgress(value=0, max=1051), HTML(value='')))




In [None]:
RESNET_SIZE = (224, 224)  # that's what resnet is trained for
# print("Normalize all image with resize to", RESNET_SIZE)
def get_y_fn(imgfile: PathLike) -> PosixPath:
    """Return the path of the label mask for `imgfile`.

    The mask lives in MASKS_SCALED under the same file name as the image.
    """
    # `imgfile` is annotated as PathLike, so also accept a plain string
    # (a str has no `.name`), in line with regions() and create_mask_().
    return MASKS_SCALED / pathify(imgfile).name

def valid_by_book(imgfile: PosixPath, split_pct: float = 0.2) -> bool:
    """Decide whether a page goes into the validation set, consistently per book.

    The decision depends only on the part of the file name before the first
    "OCR" (e.g. "arent_dichtercharaktere_1885_" for
    "arent_dichtercharaktere_1885_OCR-D-IMG-CROP2_0002.png"), so all pages
    sharing that prefix get the same answer.  Returns True when the value
    derived from the prefix falls below `split_pct`.
    """
    book_prefix = imgfile.name.partition("OCR")[0]
    # md5 rather than hash(): the digest of the name is the same on every platform and run.
    digest = hashlib.md5(book_prefix.encode("utf-8")).hexdigest()
    # Reduce the digest to a value in [0, 1) and compare it with the requested share.
    bucket = int(digest, 16) % 1e6
    return bucket / 1e6 < split_pct

def create_data(
    tfms: List[Transform] = None, 
    bs: int = 4,  # XXX(js): This batch size is a bit too small for batch normalization, but my GTX 1080 can't fit more atm
    sample_p: float = 1.0, split_pct: float = 0.2, split_by_book: bool = False,
    seed: int = None
) -> ImageDataBunch:
    """Build the segmentation data bunch from the images in IMGS_SCALED.

    tfms:          transforms passed to `.transform()`; None/empty means none.
    bs:            batch size.
    sample_p:      passed to `filter_by_rand` to subsample the images.
    split_pct:     share of the data used for validation (both split modes).
    split_by_book: if True, split with `valid_by_book` so that pages of one
                   book stay together; otherwise split at random.
    seed:          only used for the random split; the by-book split does not
                   use it.

    Labels are looked up with `get_y_fn`; the result is normalised with
    `imagenet_stats`.
    """
    if not tfms: tfms = []
    data = (SegmentationItemList
        .from_folder(IMGS_SCALED)
        .filter_by_rand(sample_p)
    )
    data = (
        # Forward split_pct: passing `valid_by_book` bare always used its own
        # default of 0.2 and silently ignored the caller's value.
        data.split_by_valid_func(lambda imgfile: valid_by_book(imgfile, split_pct))
        if split_by_book
        else data.split_by_rand_pct(valid_pct=split_pct, seed=seed)
    )
    # NOTE(review): `.transform()` is called twice below.  In fastai v1 the
    # second call appears to replace the settings of the first, in which case
    # `size=RESNET_SIZE` has no effect -- confirm which of the two is intended.
    data = (data
        .label_from_func(get_y_fn, classes=list(segtypes.values()))
        .transform(tfms, size=RESNET_SIZE, tfm_y=True)
        .transform(tfms, tfm_y=True)
        .databunch(bs=bs)
        .normalize(imagenet_stats)
    )
    return data

# First data bunch for inspection: random split (split_by_book is left at its
# default) seeded with 1, batch size 1.
data = create_data(seed=1, bs=1)
data

In [None]:
# Inspect the items held by the data bunch (presumably the image paths -- confirm).
data.items

In [None]:
# Display the first input sample.
data.x[0]

In [None]:
# Display the first target sample, for comparison with the input shown above it.
data.y[0]

In [None]:
# Visual sanity check of what the data bunch produces.
data.show_batch()

In [None]:
void_code = 0  # masks start out filled with zeros; this "void" label must not count towards the metric
def acc_page_seg(input, target):
    """Accuracy over all positions whose target label is not `void_code`.

    `input` holds one score per class along dim 1; `target` holds the labels
    with a singleton dim 1, which is squeezed away before comparing.
    """
    labels = target.squeeze(1)
    labelled = labels != void_code
    predicted = input.argmax(dim=1)
    hits = predicted[labelled] == labels[labelled]
    return hits.float().mean()

In [None]:
# Data bunch used for training: validation pages are chosen per book, so
# pages of one book never end up on both sides of the split.  This rebinds
# `data` from the inspection cells above.
# NOTE: `seed` is only used by create_data's random split and therefore has
# no effect here, where split_by_book=True.
tfms = []  # no transformations so far
data = create_data(tfms=tfms, split_by_book=True, seed=42, bs=1)  # XXX(js): tried several batch sizes and 8 seemed to work well -- NOTE(review): the call passes bs=1, not 8; confirm which is intended
data

In [None]:
# U-Net learner with a ResNet-34 backbone, reporting acc_page_seg (accuracy ignoring void pixels) as its metric.
learn = unet_learner(data, models.resnet34, metrics=acc_page_seg)

In [None]:
# First training round: 4 epochs with the default learning rate.
learn.fit_one_cycle(4)

In [None]:
# Look at some predictions after the first 4 epochs.
learn.show_results()

In [None]:
# Second training round: 20 more epochs, with max_lr given as the range 1e-6 .. 1e-4.
learn.fit_one_cycle(20, max_lr=slice(1e-6, 1e-4))

In [None]:
# Checkpoint the model after 4 + 20 = 24 epochs in total.
learn.save("unet-epochs24")

In [None]:
# Look at some predictions after 24 epochs.
learn.show_results()

In [None]:
# Third training round: 25 more epochs with the same learning-rate range.
# NOTE(review): no learn.save() follows in the visible cells, so the result of this round is not checkpointed.
learn.fit_one_cycle(25, max_lr=slice(1e-6, 1e-4))