In [83]:
import cv2
import numpy as np
import imutils

import os
from matplotlib import pyplot as plt
import typing as t
import math

Working folders and input image listing

In [84]:
# Root folder and its input/output sub-folders (every path keeps its trailing '/')
working_p = "./"
in_p = f"{working_p}in/"
out_p = f"{working_p}out/"

# running counter embedded in the names of the output images
ordn = 0

# names of all entries found in the input folder
files = list(os.listdir(in_p))

imgs = []

Utility functions

In [85]:
def save_img(img, file: str, s) -> str:
    """
    Saves the image with the formatted name:
        <out_path><file_name>-<ordn>-<suffix>[-<suffix>...].<file_ext>
    Note: <out_path> is the global variable `out_p`, '/' terminated
    Note: <ordn> is the global counter `ordn`; it is incremented on every call
    Note: By default extension is "png" (used when `file` contains no '.')
    Note: The destination folder is created when it does not exist yet

    @param img: the image to save
    @param file: the name of the file to save
    @param s: the suffix to append; a string is used as is, a list/tuple adds
              one suffix per element, anything else is converted with str()

    @return: the name of the file saved
    """
    global ordn

    # split the extension off at the last dot only
    file_name = file
    file_ext = "png"
    if "." in file:
        file_name, file_ext = file.rsplit(".", 1)

    # normalize s to a list of string suffixes
    if isinstance(s, str):
        suffix = [s]
    elif isinstance(s, (list, tuple)):
        # str() each element so that numeric suffixes do not break the join
        suffix = [str(part) for part in s]
    else:
        suffix = [str(s)]

    file_out = out_p + file_name + "-" + str(ordn) + "-" + "-".join(suffix) + "." + file_ext
    ordn += 1

    # make sure the destination folder exists before handing the path to OpenCV
    os.makedirs(os.path.dirname(file_out) or ".", exist_ok=True)

    cv2.imwrite(file_out, img)

    return file_out


Helper functions for bitmap detection

In [86]:
def do_fft(img):
    """
    Computes the centered 2D Fourier transform of an image.

    @param img: a square gray image

    @return: a tuple (fshift, magnitude_spectrum) where fshift is the 2D FFT with the
             zero frequency shifted to the center and magnitude_spectrum is
             20 * log(|fshift|)
    """
    spectrum = np.fft.fftshift(np.fft.fft2(img))
    log_magnitude = 20 * np.log(np.abs(spectrum))
    return (spectrum, log_magnitude)

def do_ifft(fshift):
    """
    Inverse of do_fft: undoes the centering shift, applies the inverse 2D FFT and
    returns the magnitude of the result.

    @param fshift: a centered frequency domain image

    @return: the absolute value of the inverse transform (real valued array)
    """
    return np.abs(np.fft.ifft2(np.fft.ifftshift(fshift)))

def do_keep_disk(fshift, sr, br):
    """
    Keeps only a ring (disk with a hole) of frequencies from fshift (frequency domain
    image). The ring is centered in the image; everything outside it is zeroed.

    The dimensions are taken from fshift itself, so the function does not depend on
    any notebook-level variable.

    @param fshift: the frequency domain image (of square shape)
    @param sr: the small radius of the disk to keep (in percent of the number of rows, maximum is 50% of the image size)
    @param br: the big radius of the disk to keep (in percent of the number of rows, maximum is 50% of the image size)

    @return: the frequency domain image with only the disk part kept
    """

    rows, cols = fshift.shape[:2]
    crow, ccol = int(rows / 2), int(cols / 2)

    big_r = int(rows * br / 100)
    small_r = int(rows * sr / 100)
    # radius halfway between the two radii; the ring is drawn as a circle of this
    # radius whose line thickness equals the distance between the radii
    med_r = abs(int(big_r - (big_r - small_r) / 2))

    # 0/1 mask: 1 on the ring, 0 elsewhere
    ring_mask = np.zeros((rows, cols), np.uint8)
    cv2.circle(ring_mask, (ccol, crow), med_r, color=1, thickness=abs(big_r - small_r))

    fshift_modified = fshift * ring_mask

    return fshift_modified

def get_bitmap_of_possible_text(img, sr: int, br: int, scale: int, file = "file.png", DEBUG: bool = False):
    """
    It takes in a square gray image and processes it to hopefully remain with a bitmap of where text is present.
    The processing steps are:
      1. forward FFT of the image (do_fft)
      2. keep only a ring of frequencies (do_keep_disk)
      3. multiply the kept frequencies by `scale`
      4. inverse FFT (do_ifft) and conversion to 8 bit (cv2.convertScaleAbs)
      5. morphological black-hat with a 3x3 kernel
      6. OTSU global thresholding

    @param img: a square gray image
    @param sr: the small radius of the disk to keep (in percent, maximum is 50% of the image size)
    @param br: the big radius of the disk to keep (in percent, maximum is 50% of the image size)
    @param scale: how much to scale the remaining frequencies after the disk is kept
    @param file: the file name used to name the debug images
    @param DEBUG: when True the intermediate images are saved with save_img and the OTSU threshold is printed

    @return: a bitmap of where text is present
    """

    (fshift, magnitude_spectrum_original) = do_fft(img)

    # keep only disk from fshift
    fshift = do_keep_disk(fshift, sr, br)

    # increase the frequencies
    fshift *= scale

    # do inverse fft with the filtered frequencies
    img_filtered = do_ifft(fshift)
    img_filtered = cv2.convertScaleAbs(img_filtered)

    # use blackhat
    img_filtered = cv2.morphologyEx(img_filtered, cv2.MORPH_BLACKHAT, np.ones((3,3), np.uint8))

    # use OTSU global thresholding
    thresh, img_otsu = cv2.threshold(img_filtered, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    if DEBUG:
        save_img(magnitude_spectrum_original, file, "magnitude_spectrum-original")
        # the filtered spectrum is zero outside the kept ring, so add 1 before the
        # logarithm: zeros map to 0 instead of -inf (and no RuntimeWarning is raised)
        save_img(20*np.log(np.abs(fshift) + 1), file, "magnitude_spectrum-modified")
        save_img(img_filtered, file, "img_filtered")
        print(file + " OTSU T: " + str(thresh))
        save_img(img_otsu, file, "img_otsu")

    return img_otsu

def filter_for_text(bitmap, file = "file.png", DEBUG: bool = False):
    """
    Removes from a bitmap the connected regions whose bounding box does not look like text.

    The bitmap is dilated once, the external contours are extracted and the bounding
    box of a contour is blanked when it is:
      - too small (height < 25 or width < 10)
      - too wide (width > bitmap width / 1.3)
      - too tall and thin (height / width > 10)
      - too flat (height / width < 0.3)
    The remaining regions are then dilated 4 more times.

    Note: the width limit is computed from the bitmap itself, so the function does
          not depend on any notebook-level image variable.

    @param bitmap: a single channel 8 bit bitmap (the input array is not modified)
    @param file: the file name used to name the debug images
    @param DEBUG: when True images with the bounding boxes before/after filtering
                  and the final result are saved with save_img

    @return: the filtered bitmap
    """
    kernel = np.ones((3,3), np.uint8)
    max_width = bitmap.shape[1] / 1.3

    # dilate (creates a new array, the caller's bitmap stays untouched)
    result = cv2.dilate(bitmap, kernel, iterations=1)

    cnts = cv2.findContours(result.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)

    # color copy used only to visualize the bounding boxes
    color_img = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR) if DEBUG else None

    for c in cnts:
        (x, y, w, h) = cv2.boundingRect(c)
        if DEBUG:
            # draw bounding rectangle
            cv2.rectangle(color_img, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # TO_DO: check if these hold on a bigger dataset
        too_small = h < 25 or w < 10
        if too_small or w > max_width or h / w > 10 or h / w < 0.3:
            result[y:y+h, x:x+w] = 0

    # dilate once more
    result = cv2.dilate(result, kernel, iterations=4)

    if DEBUG:
        # bounding boxes of what survived the filter
        cnts = cv2.findContours(result.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = imutils.grab_contours(cnts)
        after_filter = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR)
        for c in cnts:
            (x, y, w, h) = cv2.boundingRect(c)
            # draw bounding rectangle
            cv2.rectangle(after_filter, (x, y), (x + w, y + h), (0, 255, 0), 2)

        save_img(color_img, file, "bounding_rectangles")
        save_img(after_filter, file, "boxes_after_a_filter")
        save_img(result, file, "filtered_result")

    return result

def get_bitmap(gray, file="file.png", DEBUG: bool = False):
    """
    Builds the text bitmap of a gray image in two stages: the raw bitmap from
    get_bitmap_of_possible_text (sr=7, br=18, scale=12) is passed through filter_for_text.

    @param gray: a gray image
    @param file: the file name forwarded to both stages
    @param DEBUG: the debug flag forwarded to both stages

    @return: a bitmap of where text is present
    """
    return filter_for_text(
        get_bitmap_of_possible_text(gray, sr=7, br=18, scale=12, file=file, DEBUG=DEBUG),
        file=file,
        DEBUG=DEBUG,
    )


Helper functions for image segmentation

In [101]:
def get_kernel(shape, lines, collumns) -> t.Tuple[int, int]:
    """
    It takes in the shape of an image and returns the size of a square kernel to traverse the image.

    The side of the kernel is the smaller of (height / lines) and (width / collumns),
    rounded down.

    @param shape: shape of an image to be split in <lines> and <collumns>; only the
                  first two entries (height, width) are used, so the shape of a
                  color image (height, width, channels) is accepted as well
    @param lines: the number of lines to be split
    @param collumns: the number of columns to be split

    @return: a square kernel (side, side) to traverse the image
    """
    height, width = shape[:2]

    lin_size = int(height / lines)
    col_size = int(width / collumns)

    kernel_size = min(lin_size, col_size)

    return (kernel_size, kernel_size)

def segment_image(img, kernel, overlap, file="file.png", DEBUG: bool = False):
    """
    It segments the image in smaller segments of size kernel that overlap by a percentage of overlap.

    The segments are laid out on a grid, line by line. A segment that would pass the
    bottom/right border is moved back inside the image, which increases its overlap
    with the previous segment.

    @param img: the image to be segmented (gray or color, only the first two dimensions are split)
    @param kernel: a square; only kernel[0] is used as the side of a segment
    @param overlap: the percentage of overlap (example: 30%)
    @param file: the file name used in the debug output
    @param DEBUG: when True the top-left corners (row, column) of the segments are printed

    @return: a list of segments (numpy slices of img, so they share memory with it)

    @raise ValueError: when the overlap leaves no room to advance (overlap size >= kernel size)
    """

    side = kernel[0] # kernel length
    overlap_px = int(side * overlap / 100) # overlap size
    stride = side - overlap_px # non-overlap size
    if stride <= 0:
        raise ValueError("overlap must leave a positive step between segments")

    height, width = img.shape[:2]

    # at least one segment per direction, even when the image is smaller than the kernel
    rows_n = max(1, math.ceil((height - overlap_px) / stride)) # number of segments on height
    cols_n = max(1, math.ceil((width  - overlap_px) / stride)) # number of segments on width

    corners = []
    segments = []

    for i in range(rows_n):
        # keep the segment in bounds even if for the last one we increase the overlap
        top = max(0, min(i * stride, height - side))
        for j in range(cols_n):
            left = max(0, min(j * stride, width - side))

            corners.append((top, left))
            segments.append(img[top:top + side, left:left + side])

    if DEBUG:
        print(file + " -> " + str(corners), end="\n\n")

    return segments

def get_segments(img, lines, collumns, overlap, file="file.png", DEBUG: bool = False):
    """
    Derives the square kernel for the image with get_kernel, prints it, and hands the
    image to segment_image.

    @param img: the image to be segmented
    @param lines: the number of lines passed to get_kernel
    @param collumns: the number of columns passed to get_kernel
    @param overlap: the percentage of overlap passed to segment_image
    @param file: the file name used in the printed output
    @param DEBUG: the debug flag forwarded to segment_image

    @return: whatever segment_image returns for the image
    """
    square_kernel = get_kernel(img.shape, lines, collumns)
    print(file + " -> " + str(square_kernel), end="\n")
    return segment_image(img, square_kernel, overlap=overlap, file=file, DEBUG=DEBUG)

Main

In [102]:
for file in files:
    # restart the numbering of the output images for every input file
    ordn = 0

    img = cv2.imread(in_p + file, cv2.IMREAD_COLOR)

    # cv2.imread does not raise on entries it cannot decode (sub-folders, non-image
    # files); it returns None, so skip those instead of crashing in cvtColor
    if img is None:
        print(file + " -> skipped (could not be read as an image)")
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    segments = get_segments(gray, lines=2, collumns=2, overlap=25, file=file, DEBUG=True)

    # bitmap = get_bitmap(gray, file=file, DEBUG=True)

    # print(save_img(img, file, ""))



1287.jpg -> (200, 200)
1287.jpg -> [(0, 0), (0, 150), (0, 200), (150, 0), (150, 150), (150, 200), (200, 0), (200, 150), (200, 200)]

1290.jpg -> (100, 100)
1290.jpg -> [(0, 0), (0, 75), (0, 100), (75, 0), (75, 75), (75, 100), (100, 0), (100, 75), (100, 100)]

1293.jpg -> (225, 225)
1293.jpg -> [(0, 0), (0, 169), (0, 225), (169, 0), (169, 169), (169, 225), (225, 0), (225, 169), (225, 225)]

1297.jpg -> (270, 270)
1297.jpg -> [(0, 0), (0, 203), (0, 270), (203, 0), (203, 203), (203, 270), (270, 0), (270, 203), (270, 270)]

1299.jpg -> (125, 125)
1299.jpg -> [(0, 0), (0, 94), (0, 125), (94, 0), (94, 94), (94, 125), (125, 0), (125, 94), (125, 125)]

1307.jpg -> (250, 250)
1307.jpg -> [(0, 0), (0, 188), (0, 250), (188, 0), (188, 188), (188, 250), (250, 0), (250, 188), (250, 250)]

1309.jpg -> (350, 350)
1309.jpg -> [(0, 0), (0, 263), (0, 350), (263, 0), (263, 263), (263, 350), (350, 0), (350, 263), (350, 350)]

DOT.jpg -> (45, 45)
DOT.jpg -> [(0, 0), (0, 34), (0, 45), (34, 0), (34, 34), (34,