In [46]:
import os
import cv2
import imutils
import xlsxwriter
import numpy as np
from math import sqrt
from scipy.stats import mode
from skimage.feature import canny
from itertools import combinations
from sklearn.cluster import DBSCAN
from difflib import SequenceMatcher
from PIL import Image, ImageDraw, ImageFont
from skimage.transform import hough_line, hough_line_peaks
from paddleocr import PaddleOCR, PPStructure, save_structure_res

### Initialize PaddleOCR

In [23]:
# Text detection/recognition engine shared by the cells below (used as the
# global `ocr`). Going by the option names, use_angle_cls=True enables the
# text-angle classifier and lang='ch' selects the Chinese models.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')
# PP-Structure engine, called as table_engine(img) in the
# "Recognizing table directly using API" section; show_log=True keeps its logging on.
table_engine = PPStructure(show_log=True)

[2022/07/24 19:01:23] ppocr DEBUG: Namespace(alpha=1.0, benchmark=False, beta=1.0, cls_batch_num=6, cls_image_shape='3, 48, 192', cls_model_dir='/root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer', cls_thresh=0.9, cpu_threads=10, crop_res_save_dir='./output', det=True, det_algorithm='DB', det_db_box_thresh=0.6, det_db_score_mode='fast', det_db_thresh=0.3, det_db_unclip_ratio=1.5, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_east_score_thresh=0.8, det_fce_box_type='poly', det_limit_side_len=960, det_limit_type='max', det_model_dir='/root/.paddleocr/whl/det/ch/ch_PP-OCRv3_det_infer', det_pse_box_thresh=0.85, det_pse_box_type='quad', det_pse_min_area=16, det_pse_scale=1, det_pse_thresh=0, det_sast_nms_thresh=0.2, det_sast_polygon=False, det_sast_score_thresh=0.5, draw_img_save_dir='./inference_results', drop_score=0.5, e2e_algorithm='PGNet', e2e_char_dict_path='./ppocr/utils/ic15_dict.txt', e2e_limit_side_len=768, e2e_limit_type='max', e2e_model_dir=None, e2e_pgnet_mode='f

In [32]:
# Folder that is scanned for input files (alternative folders are kept
# commented out below for quick switching).
input_path = './input/0858/'
#input_path = './binarize/binary/'
#input_path = './output/'
# Folder that receives the annotated images, .xlsx files and table results.
output_path = './output/0858/'
# Every entry of input_path, sorted by name. os.listdir does not filter by
# extension, so any non-image file in the folder ends up in this list too.
input_images_name = sorted(os.listdir(input_path))
print(input_images_name)

['ada.JPG', 'cgk.JPG', 'cgk_ae.JPG', 'ori.JPG', 'sha.JPG']


### Pre-processing & OCR functions

In [30]:
def skew_angle_hough_transform(image):
    """Estimate the skew of ``image`` from its most frequent Hough line angle.

    The image is reduced to Canny edges, a straight-line Hough transform is
    evaluated for angles from 75 to 104 degrees (1 degree steps), and the most
    common peak angle (rounded to 2 decimals, in radians) is converted to a
    deviation from 90 degrees.

    Returns the skew in degrees, in whatever form ``mode(...)[0]`` yields
    (the caller in this file indexes it with ``[0]``). The value is also
    printed.
    """
    # Edge map that feeds the Hough accumulator.
    edge_map = canny(image, sigma=1)

    # Only near-90-degree angles are searched.
    candidate_angles = np.deg2rad(np.arange(75, 105))
    accumulator, thetas, distances = hough_line(edge_map, theta=candidate_angles)

    # Of the (accumulator, angle, distance) peak triple only the angles matter.
    peak_angles = hough_line_peaks(accumulator, thetas, distances)[1]

    # Rounding merges nearly identical angles before voting for the mode.
    rounded_angles = np.around(peak_angles, decimals=2)
    most_common_angle = mode(rounded_angles)[0]

    # Express the result in degrees relative to 90 degrees (pi/2).
    skew_angle = np.rad2deg(most_common_angle - np.pi/2)
    print('Angle to correct:', skew_angle)
    return skew_angle

def skew_correct(image):
    """Rotate ``image`` by the skew estimated with ``skew_angle_hough_transform``.

    The estimator's return value is flattened before use because
    ``scipy.stats.mode`` yields a one-element array on some SciPy versions and
    a plain scalar on others; indexing a scalar with ``[0]`` would raise.

    Returns the rotated image, or ``image`` unchanged when no usable angle was
    found (no Hough peaks, or a NaN estimate).
    """
    angles = np.ravel(skew_angle_hough_transform(image))
    # Nothing to correct when the estimator produced no finite angle.
    if angles.size == 0 or not np.isfinite(angles[0]):
        return image
    return imutils.rotate(image, float(angles[0]))

# Draw bounding boxes and text
def draw_image(path, bounds, color='red', width=2):
    """Return the image at ``path`` with OCR boxes and their text drawn on it.

    Each entry of ``bounds`` is read as ``entry[0]`` = four corner points and
    ``entry[1][0]`` = the text to print. The outline uses ``color`` and
    ``width``; the text is drawn at the fourth corner in red with the
    'data/font/msyh.ttc' font at size 32.
    """
    label_font = ImageFont.truetype('data/font/msyh.ttc', 32)
    canvas = Image.open(path).convert('RGB')
    #canvas = canvas.rotate(270, expand=True) # uncomment this line if necessary
    pen = ImageDraw.Draw(canvas)
    for entry in bounds:
        corner_a, corner_b, corner_c, corner_d = entry[0]
        # Closed polygon: back to the first corner at the end.
        outline = [*corner_a, *corner_b, *corner_c, *corner_d, *corner_a]
        pen.line(outline, fill=color, width=width)
        label = entry[1][0]
        pen.text(corner_d, label, font=label_font, fill=(255, 0, 0))
    return canvas

In [5]:
def line_detection(path, scale=50):
    """Return a binary mask of the points where table rulings cross.

    The image is adaptively thresholded (on the inverted grayscale), then
    opened (erode + dilate) once with a wide 1-pixel-high kernel to keep
    horizontal rulings and once with a tall 1-pixel-wide kernel to keep
    vertical rulings. The two masks are AND-ed, so only their intersections
    survive.

    Args:
        path: image file to read.
        scale: divisor that sets the minimum ruling length as a fraction of
            the image size (kernel length = image dimension // scale). Must
            be positive. Defaults to 50.

    Returns:
        A single-channel uint8 mask with the same height/width as the image.

    Raises:
        FileNotFoundError: if the image cannot be read. ``cv2.imread`` returns
            ``None`` in that case instead of raising.
    """
    raw = cv2.imread(path, 1)
    if raw is None:
        raise FileNotFoundError('Cannot read image: ' + str(path))
    gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
    rows, cols = binary.shape

    # Kernel sizes are (width, height); clamp to 1 so small images do not
    # produce an invalid zero-length structuring element.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, cols // scale), 1))
    horizontal = cv2.erode(binary, horizontal_kernel, iterations=1)
    horizontal = cv2.dilate(horizontal, horizontal_kernel, iterations=1)

    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, rows // scale)))
    vertical = cv2.erode(binary, vertical_kernel, iterations=1)
    vertical = cv2.dilate(vertical, vertical_kernel, iterations=1)

    # AND keeps only the crossings; cv2.add(horizontal, vertical) would give
    # the full grid of rulings instead.
    return cv2.bitwise_and(horizontal, vertical)

def distance(p1, p2):
    """Return the Euclidean distance between the 2-D points ``p1`` and ``p2``."""
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    return sqrt(delta_x**2 + delta_y**2)

def draw_line(path, lines):
    """Return the image at ``path`` with one red segment drawn per entry of ``lines``.

    Each entry is read as ``entry[0]`` (start point) and ``entry[1]`` (end
    point); segments are 4 pixels wide.
    """
    canvas = Image.open(path)
    pen = ImageDraw.Draw(canvas)
    for segment in lines:
        start, end = segment[0], segment[1]
        pen.line([start, end], fill='red', width=4)
    return canvas

def ocr_sub(path, crop_lines):
    """Run OCR on rectangular crops of the image at ``path``.

    Each entry of ``crop_lines`` is read as two (x, y) points: ``entry[0]`` is
    the top-left corner and ``entry[1]`` the bottom-right corner of a crop.
    The crop is passed to the global ``ocr`` engine, and ``item[1][0]`` of
    every returned item is collected as text.

    Returns a list with one list of strings per crop; each list is also
    printed as it is produced.
    """
    page = cv2.imread(path)
    table = []
    for region in crop_lines:
        top_left, bottom_right = region[0], region[1]
        # NumPy images are indexed [row (y), column (x)].
        cell = page[top_left[1]:bottom_right[1], top_left[0]:bottom_right[0]]
        detections = ocr.ocr(cell)
        texts = [item[1][0] for item in detections]
        table.append(texts)
        print(texts)
    return table

def write_xlsx(path, table):
    """Write ``table`` to ``<output_path>/<image name>.xlsx``.

    ``table`` is a list of columns; each column is a list of cell values that
    are written top to bottom starting at row 0. A column that holds a single
    value is merged (centered) over the full height of the table.

    Args:
        path: path of the source image; its base name (text before the first
            '.') becomes the workbook name.
        table: list of lists of cell values. May be empty, in which case an
            empty worksheet is written.
    """
    img_name = os.path.basename(path).split('.')[0]
    # The context manager closes (and thereby saves) the workbook even if
    # writing a cell raises.
    with xlsxwriter.Workbook(output_path + img_name + '.xlsx') as workbook:
        worksheet = workbook.add_worksheet()
        merge_format = workbook.add_format({'align': 'center'})
        for column, col in enumerate(table):
            for row, item in enumerate(col):
                worksheet.write(row, column, item)

        # Rows are 0-based and merge_range bounds are inclusive, so the last
        # data row is max_col_len - 1.
        last_row = max((len(col) for col in table), default=0) - 1
        # A one-row table has nothing to merge (a range must span > 1 cell).
        if last_row > 0:
            for column, col in enumerate(table):
                if len(col) == 1:
                    worksheet.merge_range(0, column, last_row, column, col[0], merge_format)

### Pre-processing (optional)

In [19]:
# Sharpen one input image and save the result as 'sha.JPG'.
# The source is the third entry (index 2) of the sorted file listing.
image = cv2.imread(input_path + input_images_name[2])
# 3x3 sharpening kernel: centre weight 5, the four direct neighbours -1
# (weights sum to 1).
kernel1 = np.array([[0, -1, 0],
                   [-1, 5,-1],
                   [0, -1, 0]])
# Variant using all eight neighbours (weights sum to 2).
# NOTE(review): kernel2 is defined but not used in this cell.
kernel2 = np.array([[-1, -1, -1],
                   [-1, 10,-1],
                   [-1, -1, -1]])
# ddepth=-1 keeps the output depth equal to the source depth.
image_sharp = cv2.filter2D(src=image, ddepth=-1, kernel=kernel1)

# The result is written into input_path, so it shows up in later directory listings.
cv2.imwrite(input_path+'sha.JPG', image_sharp)

True

In [12]:
# Binarize one input image with a mean adaptive threshold and save it as 'b1.jpg'.
# NOTE(review): index 8 is hard-coded; the directory listing printed earlier
# shows only 5 entries, for which this lookup raises IndexError - confirm
# which folder this cell was run against.
image = cv2.imread(input_path + input_images_name[8])
img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Mean adaptive threshold with a 199-pixel neighbourhood and offset 20.
binary = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 199, 20)

# Erode then dilate the binary image with a 20x20 rectangular kernel.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
eroded = cv2.erode(binary, kernel, iterations=1)
# NOTE(review): `dilated` is computed but not used below.
dilated = cv2.dilate(eroded, kernel, iterations=1)
# The mask is the inverse of the eroded image, expanded to 3 channels so it
# can be combined with the colour image.
mask = cv2.bitwise_not(eroded)
mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

# Keep the original pixels where the mask is set and paint the rest white.
result = cv2.bitwise_and(image, mask)
result[mask==0] = 255

# NOTE(review): only `binary` is saved; `result` is not written anywhere in this cell.
cv2.imwrite(input_path+'b1.jpg', binary)

True

### Recognition Accuracy System

In [89]:
def similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def read_lines(path):
    """Return the lines of the text file at ``path`` without line terminators.

    The file is opened in text mode with the platform default encoding and is
    closed before returning.
    """
    with open(path, 'r') as file:
        return file.read().splitlines()

def write_lines(lines, path):
    """Write ``lines`` to ``path``, separated by newlines (no trailing newline).

    The file is opened in text mode with the platform default encoding. It is
    closed on return, so the content is flushed to disk rather than left to
    garbage collection.
    """
    with open(path, 'w') as file:
        file.write('\n'.join(lines))

def cal_score(result, truth):
    """Score recognized text lines against ground-truth lines.

    Each line of ``result`` is matched with the first still-unmatched line of
    ``truth`` whose ``similarity`` exceeds 0.7. Matching is one-to-one: a
    recognized line consumes at most one truth line, and a truth line is
    matched at most once.

    Args:
        result: list of recognized strings.
        truth: list of ground-truth strings.

    Returns:
        A 4-tuple ``(box_coverage, avg_box_accuracy, product, avg_char_accuracy)``:
        the fraction of truth lines matched, the mean similarity of matched
        pairs, the product of the two, and the similarity-weighted matched
        characters divided by the total truth characters. A ratio whose
        denominator is zero (nothing matched, empty ``truth``) is reported as 0.0.
    """
    matched_line = 0
    accuracy_sum = 0
    matched_char = 0
    truth_line_used = [False] * len(truth)
    for res_line in result:
        for i, tru_line in enumerate(truth):
            if truth_line_used[i]:
                continue
            score = similarity(res_line, tru_line)
            if score > 0.7:
                truth_line_used[i] = True
                matched_line += 1
                accuracy_sum += score
                matched_char += score * len(tru_line)
                # Stop here so this recognized line cannot also claim other
                # (e.g. duplicated) truth lines and inflate the coverage.
                break
    box_coverge = matched_line / len(truth) if truth else 0.0
    avg_box_accuracy = accuracy_sum / matched_line if matched_line else 0.0
    char_sum = sum(len(line) for line in truth)
    avg_char_accuracy = matched_char / char_sum if char_sum else 0.0

    return box_coverge, avg_box_accuracy, box_coverge*avg_box_accuracy, avg_char_accuracy

### Visualizing OCR results and recognition accuracy

In [90]:
# Ground-truth transcript shared by every image in this folder.
ground_truth = read_lines(input_path + '0858.txt')

# Score each image's saved transcript (<image name>.txt) against the ground truth.
for img_name in input_images_name:
    stem, ext = os.path.splitext(img_name)
    # The transcripts live in the same folder as the images; skip them so a
    # re-run of the directory listing does not score text files as images.
    if ext.lower() == '.txt':
        continue
    path = input_path + img_name
    # To regenerate the annotated image and the transcript, uncomment:
    #result = ocr.ocr(path, cls=True)
    #draw_image(path, result).save(output_path + img_name)
    #print(path)
    #result_text = [line[1][0] for line in result]
    # splitext handles extensions of any length ('.JPG', '.jpeg', ...).
    result_text = read_lines(input_path + stem + '.txt')
    #write_lines(result_text, input_path + stem + '.txt')
    print(path, cal_score(result_text, ground_truth))

./input/0858/ada.JPG (0.9230769230769231, 0.9682030655600988, 0.8937259066708605, 0.8411707784034108)
./input/0858/cgk.JPG (0.8461538461538461, 0.9623945164698232, 0.8143338216283119, 0.8676324395850087)
./input/0858/cgk_ae.JPG (0.8615384615384616, 0.9545207415787943, 0.8223563312063459, 0.9169844523379639)
./input/0858/ori.JPG (0.8769230769230769, 0.971759656144054, 0.852158467695555, 0.9022214365011935)
./input/0858/sha.JPG (0.8615384615384616, 0.9491619269907278, 0.8177395063304732, 0.8718571899046756)


### Recognizing table directly using API

In [8]:
# Run the PP-Structure engine on every listed file and save its results under
# output_path, in a subfolder named after the file (text before the first '.').
for img_name in input_images_name:
    path = input_path + img_name
    subfolder = os.path.basename(path).split('.')[0]
    # NOTE(review): cv2.imread returns None for files it cannot decode; the
    # listing is not filtered by extension - confirm the folder holds only images.
    img = cv2.imread(path)
    result = table_engine(img)
    save_structure_res(result, output_path, subfolder)
    print(path)

./input/ucl/06DACF2E-1C2B-4BEE-8404-C6688D40F0DB.jpeg
./input/ucl/4D8D63EE-347E-4BA9-A87B-1762537D4A11.jpeg
./input/ucl/66358E1A-F799-466C-B6C2-9F91C2A134EE.jpeg
./input/ucl/694EE917-1594-4700-BEEB-E6AAFF5E0F70.jpeg
./input/ucl/B0523B2E-187A-4B30-A14F-D011195BA687.jpeg
./input/ucl/BD0B4AE8-9630-44F3-A02F-8264A84F7C05.jpeg
./input/ucl/E11B732C-86CB-4E86-BFDD-4100966B31AB.jpeg


### Recognizing Type-3 (Fapiao) table

In [None]:
# Cell detection for ruled tables: find ruling intersections, thin them to one
# anchor per crossing, group anchors into horizontal lines, pair anchors of
# neighbouring lines into cell rectangles, then OCR each cell.
# Only the last listed file is processed.
for img_name in input_images_name[-1:]:
    path = input_path + img_name
    intersection = line_detection(path)
    ys, xs = np.where(intersection > 0)
    points = list(zip(xs, ys))
    # Without any crossing there is no table to reconstruct.
    if not points:
        print(path, 'no table-line intersections found, skipped')
        continue

    # Keep a point only if it is at least 50 px away from every anchor kept so
    # far, collapsing each blob of intersection pixels to a single anchor.
    anchors = [points[0]]
    for point in points[1:]:
        if all(distance(anchor, point) >= 50 for anchor in anchors):
            anchors.append(point)
    #print(anchors)

    # Group anchors into horizontal lines by clustering their y coordinates;
    # label -1 (DBSCAN noise, i.e. fewer than 5 anchors nearby) is dropped.
    ys = np.array([point[1] for point in anchors]).reshape(-1, 1)
    clustering = DBSCAN(eps=50, min_samples=5).fit(ys)
    #print(clustering.labels_)
    labels = list(clustering.labels_)
    clusters = {}
    for label, anchor in zip(labels, anchors):
        if label >= 0:
            clusters.setdefault(label, []).append(anchor)

    # Order the anchors of each line from left to right.
    for line, cluster in enumerate(clusters.values()):
        cluster.sort(key=lambda tup: tup[0])
        print('Line ' + str(line) + ':', cluster)
    #print(clusters)

    # For each pair of consecutive lines, look for an anchor pair on the upper
    # line and one on the lower line whose x positions agree within 25 px;
    # the upper-left and lower-right anchors then span one cell.
    # NOTE(review): clusters[i] / clusters[i+1] are treated as vertically
    # adjacent lines, i.e. labels are assumed to follow top-to-bottom order.
    crop_point = []
    for i in range(len(clusters)-1):
        comb1 = list(combinations(clusters[i], 2))
        comb2 = list(combinations(clusters[i+1], 2))
        found = (0, 0)
        for c1 in comb1:
            # Skip further pairs that start at the anchor just used.
            if found == c1[0]:
                continue
            for c2 in comb2:
                p1, p2, p3, p4 = c1[0], c1[1], c2[0], c2[1]
                if abs(p1[0]-p3[0])<25 and abs(p2[0]-p4[0])<25:
                    crop_point.append([p1, p4])
                    found = p1
                    break
    #print(crop_point)
    for pair in crop_point:
        print(pair)
    #print(len(crop_point))

    # Nothing to OCR or export when no cell rectangle was found.
    if not crop_point:
        print(path, 'no table cells found, skipped')
        continue

    table = ocr_sub(path, crop_point)
    write_xlsx(path, table)
    draw_line(path, crop_point).save(output_path + img_name)
    print(path)

### Visualizing table cell detection

In [None]:
# Save the mask returned by line_detection for the last listed file, under the
# same file name in output_path, so the detection can be inspected visually.
for img_name in input_images_name[-1:]:
    path = input_path + img_name
    # As line_detection is written in this notebook, it returns the AND of the
    # horizontal and vertical masks, i.e. the crossings rather than full borders.
    borders = line_detection(path)
    result = borders
    cv2.imwrite(output_path+img_name, result)
    print(path)