# Dataset preparation

This is a small Python script that adapts the COCO-Text v2 data to the style and lexicon of MJSynth. The validation split of COCO-Text is randomly divided into a validation split and a test split. Note that words that are not in the lexicon are excluded during preparation! For every remaining annotation, the script cuts out the annotated bounding box and saves the crop in a folder, so that the data is ready for text recognition. This can take some time to execute.

In [7]:
import os
import coco_text
import regex as re
import random
random.seed(42)
from PIL import Image, ImageDraw
import numpy as np
# The COCO-Text v2 annotation file is required in the working directory.
# Download: https://bgshih.github.io/cocotext/#h2-download
annotation_json = 'cocotext.v2.json'
ct = coco_text.COCO_Text(annotation_json)
ct.info()

loading annotations into memory...
0:00:02.069921
creating index...
index created!


## Set user paths: 

In [8]:
# User configuration: adjust these paths to your local setup.

# Lexicon: text file with one word per line. The line position of a word is
# the label that the cells below write into the annotation files.
path_to_lexicon = r"C:\Users\tom\Documents\JKU\PracticalWorkOCR\PytorchCRNN\crnn-pytorch\data\lexicon.txt"

# Dataset: images are read from <dataset_directory>/images/<dataType>/.
dataset_directory = r'C://Users//tom//Documents//JKU//PracticalWorkOCR//DATASET12GB//'
dataType = 'train2014'

# Where to store the cropped data (crops go into the "cropped" subfolder):
savepath = r'C://Users//tom//Documents//JKU//PracticalWorkOCR//PytorchCRNN//crnn-pytorch//data'

# Save annotation files to. All three are derived from savepath so they cannot
# drift apart. The train file previously pointed at the *test* file, which made
# the train cell overwrite the test annotations.
path_annotation_val = savepath + r'//coco_annotations_val.txt'
path_annotation_test = savepath + r'//coco_annotations_test.txt'
path_annotation_train = savepath + r'//coco_annotations_train.txt'

In [9]:
# Read the whole lexicon and split it into one entry per line. The position of
# a word in this list is what the cells below use as its label.
with open(path_to_lexicon, "r") as lexicon_file:
    lexicon = lexicon_file.read().split("\n")

## Val/Test Set

In [12]:
# Build the validation and test splits from the COCO-Text "val" images.
#
# For every legible annotation whose normalised text is in the lexicon:
#   1. a random bit decides whether it goes to the validation or test split,
#   2. everything outside the annotation polygon is blacked out,
#   3. the bounding box is cropped and saved under <savepath>//cropped//,
#   4. "<relative crop path> <lexicon index>" is appended to the split's
#      annotation file.
# The annotation line is written only after the crop was saved successfully,
# so the annotation files never reference a crop that does not exist.
#
# NOTE(review): the crop file name is built from image id, word and lexicon
# index only. The same word annotated twice in one image maps to the same
# file name, so the later crop overwrites the earlier one.

crop_directory = f'{savepath}//cropped'
os.makedirs(crop_directory, exist_ok=True)

# Word -> index of its first occurrence (same result as lexicon.index, but a
# constant-time lookup instead of a linear scan per annotation).
lexicon_index = {}
for position, word in enumerate(lexicon):
    lexicon_index.setdefault(word, position)

imgIds = ct.getImgIds(imgIds=ct.val, catIds=[('legibility','legible')])
imgcount1 = 0  # crops written to the validation split
imgcount2 = 0  # crops written to the test split
skipped_crops = 0  # lexicon hits whose crop could not be produced

# Opening in 'w' mode truncates both annotation files once, up front.
with open(path_annotation_val, 'w') as val_file, open(path_annotation_test, 'w') as test_file:
    for i in imgIds:
        anns = ct.loadAnns(ct.getAnnIds(imgIds=i))
        source_image = None  # loaded lazily, at most once per image
        for j in anns:
            if not (j['legibility'] == 'legible' and j['utf8_string']):
                continue

            # Normalise to lowercase alphanumerics, as in the MJSynth lexicon.
            j_string = re.sub('[^A-Za-z0-9]+', '', j['utf8_string'].lower())
            if j_string == "":
                j_string = "specialtoken"

            found_at = lexicon_index.get(j_string)
            if found_at is None:
                continue  # word not in the lexicon -> excluded

            # Draw the split decision right after the lexicon hit, so the
            # sequence of random draws depends only on the annotations.
            goes_to_val = bool(random.getrandbits(1))

            crop_name = f'{i}_{j_string}_{found_at}.jpg'
            try:
                if source_image is None:
                    img = ct.loadImgs(i)[0]
                    image_path = os.path.join(dataset_directory, 'images/%s/%s'%(dataType,img['file_name']))
                    with Image.open(image_path) as rimg:
                        # Image.composite documents that both images must share
                        # a mode; the black background below is RGB.
                        source_image = rimg.convert("RGB")

                # Keep only the pixels inside the annotation polygon.
                mask = Image.new("L", source_image.size, 0)
                ImageDraw.Draw(mask).polygon(j['mask'], fill=255)
                background = Image.new(mode="RGB", size=source_image.size)
                im = Image.composite(source_image, background, mask)

                left, top, width, height = j['bbox'][:4]
                im_crop = im.crop((left, top, left + width, top + height))
                im_crop.save(f'{crop_directory}//{crop_name}')
            except ValueError:
                # Best effort, as before: skip annotations Pillow rejects, but
                # without leaving an annotation line behind.
                skipped_crops += 1
                continue

            annotation_line = f'./cropped/{crop_name} {found_at}\n'
            if goes_to_val:
                val_file.write(annotation_line)
                imgcount1 += 1
            else:
                test_file.write(annotation_line)
                imgcount2 += 1

print(f"Val: {imgcount1}")
print(f"Test: {imgcount2}")
if skipped_crops:
    print(f"Skipped (crop failed): {skipped_crops}")

Val: 4727
Test: 4642


## Train Set

In [13]:
# Build the training split from the COCO-Text "train" images.
#
# For every legible annotation whose normalised text is in the lexicon:
#   1. everything outside the annotation polygon is blacked out,
#   2. the bounding box is cropped and saved under <savepath>//cropped//,
#   3. "<relative crop path> <lexicon index>" is appended to the train
#      annotation file.
# The annotation line is written only after the crop was saved successfully,
# so the annotation file never references a crop that does not exist.
#
# NOTE(review): the crop file name is built from image id, word and lexicon
# index only. The same word annotated twice in one image maps to the same
# file name, so the later crop overwrites the earlier one.

if path_annotation_train in (path_annotation_val, path_annotation_test):
    # Opening the train file below truncates it; make a path clash visible.
    print("WARNING: path_annotation_train equals the val/test annotation path; "
          "that file will be overwritten with the train annotations.")

crop_directory = f'{savepath}//cropped'
os.makedirs(crop_directory, exist_ok=True)

# Word -> index of its first occurrence (same result as lexicon.index, but a
# constant-time lookup instead of a linear scan per annotation).
lexicon_index = {}
for position, word in enumerate(lexicon):
    lexicon_index.setdefault(word, position)

imgIds = ct.getImgIds(imgIds=ct.train, catIds=[('legibility','legible')])
imgcount1 = 0  # crops written to the train split
skipped_crops = 0  # lexicon hits whose crop could not be produced

# Opening in 'w' mode truncates the annotation file once, up front.
with open(path_annotation_train, 'w') as train_file:
    for i in imgIds:
        anns = ct.loadAnns(ct.getAnnIds(imgIds=i))
        source_image = None  # loaded lazily, at most once per image
        for j in anns:
            if not (j['legibility'] == 'legible' and j['utf8_string']):
                continue

            # Normalise to lowercase alphanumerics, as in the MJSynth lexicon.
            j_string = re.sub('[^A-Za-z0-9]+', '', j['utf8_string'].lower())
            if j_string == "":
                j_string = "specialtoken"

            found_at = lexicon_index.get(j_string)
            if found_at is None:
                continue  # word not in the lexicon -> excluded

            crop_name = f'{i}_{j_string}_{found_at}.jpg'
            try:
                if source_image is None:
                    img = ct.loadImgs(i)[0]
                    image_path = os.path.join(dataset_directory, 'images/%s/%s'%(dataType,img['file_name']))
                    with Image.open(image_path) as rimg:
                        # Image.composite documents that both images must share
                        # a mode; the black background below is RGB.
                        source_image = rimg.convert("RGB")

                # Keep only the pixels inside the annotation polygon.
                mask = Image.new("L", source_image.size, 0)
                ImageDraw.Draw(mask).polygon(j['mask'], fill=255)
                background = Image.new(mode="RGB", size=source_image.size)
                im = Image.composite(source_image, background, mask)

                left, top, width, height = j['bbox'][:4]
                im_crop = im.crop((left, top, left + width, top + height))
                im_crop.save(f'{crop_directory}//{crop_name}')
            except ValueError:
                # Best effort, as before: skip annotations Pillow rejects, but
                # without leaving an annotation line behind.
                skipped_crops += 1
                continue

            train_file.write(f'./cropped/{crop_name} {found_at}\n')
            imgcount1 += 1

print(f"Train: {imgcount1}")
if skipped_crops:
    print(f"Skipped (crop failed): {skipped_crops}")

Train: 42313
