### Imports

In [46]:
import h5py
from matplotlib import pyplot as plt
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from PIL import Image

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): looks like a leftover smoke test that only confirms the cell ran — consider removing.
print("Test")

Test


## Load data for later use

In [2]:
# Font names used in this notebook (presumably the classification labels — confirm against the dataset).
FONTS = ['Skylark', 'Ubuntu Mono', 'Sweet Puppy']

file_name = "font_recognition_train_set/SynthText.h5"

# The handle is intentionally left open: extract_data reads images and attributes from `db`.
db = h5py.File(file_name, "r")
IM_NAMES = list(db["data"].keys())
im_names = IM_NAMES  # Lowercase alias kept for cells that refer to `im_names`.

num_of_images = len(IM_NAMES)
print(f"Number of images: {num_of_images}")

Number of images: 760


## Crop a bounding box at its own angle - affine transformation

Tutorial that helped me:

https://docs.opencv.org/3.4/d4/d61/tutorial_warp_affine.html

In [3]:
def crop_affine(img, bb):
    """
    Crop image using affine transformation, around bounding box. Returns cropped image.

    The bounding box is stretched so that it fills the whole output, which has
    the same width and height as `img`.

    Parameters
    ----------
    img : ndarray
        Source image. It is only read, never modified.
    bb : array-like, indexed as bb[axis][corner]
        bb[0] holds the x coordinates and bb[1] the y coordinates of the four
        corners, ordered top-left, top-right, bottom-right, bottom-left.

    Returns
    -------
    ndarray
        A new image with the same width and height as `img`.
    """
    height, width = img.shape[:2]

    # An affine map is fully determined by three point pairs, so the
    # bottom-right corner (index 2) is not needed: top-left, top-right and
    # bottom-left are mapped onto the corresponding corners of the output.
    srcTri = np.array([[bb[0][k], bb[1][k]] for k in (0, 1, 3)], dtype=np.float32)
    dstTri = np.array([[0, 0], [width, 0], [0, height]], dtype=np.float32)

    warp_mat = cv2.getAffineTransform(srcTri, dstTri)

    # warpAffine writes into a freshly allocated destination and leaves its
    # source untouched, so no defensive copy of `img` is required.
    return cv2.warpAffine(img, warp_mat, (width, height))

## Normalize function

![](images/normalize_formula.png)

In [4]:
def normalize(img, low=0, high=1):
    """
    Linearly rescale `img` so that its minimum maps to `low` and its maximum to `high`.

    The input may be in any value range; the mapping is done in one vectorized
    `np.interp` call, and the result has the same shape as `img`.
    """
    source_range = (np.min(img), np.max(img))
    target_range = (low, high)
    return np.interp(img, source_range, target_range)

In [42]:
def extract_data(img_name):
    """
    Process the image and return the processed result.

    The image and its 'font', 'txt' and 'charBB' attributes are read from the
    module-level `db` handle, and every character is cropped with `crop_affine`.

    Parameters
    ----------
    img_name : str
        Key of the image inside db['data'].

    Returns
    -------
    dict
        A json-like dict in the following structure (as an example):

        {
            "name": "test.png",
            "words": [
                {
                    "word": "the",
                    "font": "Ubuntu Mono",
                    "chars": [
                        {
                            "char": "t",
                            "font": "Ubuntu Mono",
                            "crop": <ndarray>
                        }, ...
                    ]
                },
                {
                    "word": "shlomi",
                    "font": "Skylark",
                    "chars": [
                        {
                            "char": "s",
                            "font": "Skylark",
                            "crop": <ndarray>
                        }, ...
                    ]
                }, ...
            ]
        }

    Raises
    ------
    AssertionError
        If a character's font differs from the font of the first character of
        its word.
    """
    entry = db['data'][img_name]
    img = entry[:]
    font = entry.attrs['font']      # Indexed per character (across all words of the image).
    txt = entry.attrs['txt']        # Iterated as one bytes entry per word.
    charBB = entry.attrs['charBB']  # Bounding boxes of chars, one per index of the last axis.

    words = []
    char_index_accumulator = 0  # Running char index across all words of the image.

    # Process word
    for word in txt:
        # The font of the word is taken from its first character.
        word_font = font[char_index_accumulator].decode()  # Convert bytes to string
        chars = []

        # Process chars (iterating bytes yields integer code points)
        for char_code in word:
            char_font = font[char_index_accumulator].decode()
            char_bb = charBB[:, :, char_index_accumulator]

            # Double check that the pre-processed image is indeed 1 font per word,
            # and each char is same font as word.
            assert char_font == word_font, (
                f"Font mismatch inside a word of {img_name}: {char_font!r} != {word_font!r}"
            )

            chars.append({
                "char": chr(char_code),
                "font": char_font,
                "crop": crop_affine(img, char_bb)
            })

            char_index_accumulator += 1

        words.append({
            "word": word.decode(),
            "font": word_font,
            "chars": chars
        })

    # Return result
    return {
        "name": img_name,
        "words": words,
    }


In [64]:
%%time
x_train = [] #Images
y_train = [] #Labels
i = 0
for img_name in im_names:
    res = extract_data(img_name)
    for word in res["words"]:
        for char in word["chars"]:
            char_font = char["font"]
            char_crop = char["crop"]
            x_train.append(char_crop)
            y_train.append(char_font)
            if i == 10:
                break
            i += 1

x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size=0.2, random_state=12345)



Wall time: 17.1 s


In [65]:
# Number of samples in the validation split returned by train_test_split.
len(x_validate)

598