In [None]:
import os
import cv2
import imutils
import xlsxwriter
import numpy as np
from math import sqrt
from scipy.stats import mode
from skimage.feature import canny
from itertools import combinations
from sklearn.cluster import DBSCAN
from PIL import Image, ImageDraw, ImageFont
from skimage.transform import hough_line, hough_line_peaks
from paddleocr import PaddleOCR, PPStructure, save_structure_res

### Initialize PaddleOCR

In [None]:
# Text OCR engine, configured with the angle classifier enabled and the
# 'ch' language setting.
ocr = PaddleOCR(
    lang='ch',
    use_angle_cls=True,
)

# Table-structure engine, configured with layout=False and logging silenced.
table_engine = PPStructure(
    show_log=False,
    layout=False,
)

In [None]:
# Directory the images are read from; alternative locations are kept
# commented out so they can be switched in by hand.
input_path = './input/originals/'
#input_path = './binarize/binary/'
#input_path = './output/'
output_path = './output/'

# Later cells write into output_path without checking that it exists.
os.makedirs(output_path, exist_ok=True)

# Keep regular, non-hidden files only. Sub-directories and dot-files
# (e.g. .DS_Store, .ipynb_checkpoints) are not images, and the cells below
# select entries purely by position ([:1] and [-1:]).
input_images_name = sorted(
    name for name in os.listdir(input_path)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(input_path, name))
)
print(input_images_name)

In [None]:
def skew_angle_hough_transform(image):
    """Estimate the skew angle of ``image`` from its most frequent Hough line angle.

    Edges are extracted with ``canny`` and fed to a straight-line Hough
    transform that only tests angles from 75 to 104 degrees. The peak angles
    are rounded to two decimals (in radians) and the most frequent one, taken
    relative to pi/2, is the skew.

    Args:
        image: image array accepted by ``canny``.

    Returns:
        numpy.ndarray: one-element array holding the skew angle in degrees.
        ``[0.]`` is returned when the Hough transform yields no peaks.
    """
    # convert to edges
    edges = canny(image, sigma=1)
    # Classic straight-line Hough transform.
    tested_angles = np.deg2rad(np.arange(75, 105))
    h, theta, d = hough_line(edges, theta=tested_angles)

    # find line peaks and angles
    accum, angles, dists = hough_line_peaks(h, theta, d)
    angles = np.asarray(angles, dtype=float)

    if angles.size == 0:
        # No line found: there is nothing to measure, so report "no rotation"
        # instead of failing on an empty mode.
        skew_angle = np.zeros(1)
        print('Angle to correct:', skew_angle)
        return skew_angle

    # round the angles to 2 decimal places and find the most common angle.
    # np.unique returns sorted values, so ties resolve to the smallest angle.
    # Unlike scipy.stats.mode, the result shape does not depend on the
    # installed SciPy version.
    values, counts = np.unique(np.around(angles, decimals=2), return_counts=True)
    most_common_angle = values[np.argmax(counts)]

    # convert the angle to degree for rotation; always a 1-element array so
    # callers that index the result with [0] keep working.
    skew_angle = np.atleast_1d(np.rad2deg(most_common_angle - np.pi/2))
    print('Angle to correct:', skew_angle)
    return skew_angle

def skew_correct(image):
    """Rotate ``image`` by the angle reported by ``skew_angle_hough_transform``.

    Args:
        image: image array to deskew.

    Returns:
        The result of ``imutils.rotate(image, angle)``.
    """
    # The estimate may arrive as a scalar or as a one-element array (the shape
    # of scipy.stats.mode's result differs between SciPy releases), so flatten
    # it before taking the first value instead of indexing it directly.
    angle = float(np.ravel(skew_angle_hough_transform(image))[0])
    return imutils.rotate(image, angle)

In [None]:
def line_detection(path):
    """Return a mask of the points where horizontal and vertical lines cross.

    The image is converted to grayscale, inverted and adaptively thresholded.
    The binary image is then opened (erode + dilate) twice: once with a wide,
    1-pixel-high kernel, which keeps long horizontal runs, and once with a
    tall, 1-pixel-wide kernel, which keeps long vertical runs. The bitwise AND
    of both masks is returned.

    Args:
        path: path of the image file to read.

    Returns:
        numpy.ndarray: single-channel mask with the same height and width as
        the image; non-zero where both line masks are set.

    Raises:
        OSError: if ``cv2.imread`` cannot load ``path``.
    """
    raw = cv2.imread(path, 1)
    if raw is None:
        # cv2.imread reports failure by returning None; fail here with the
        # offending path rather than inside cvtColor with an opaque error.
        raise OSError('Cannot read image: ' + str(path))
    gray = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
    rows, cols = binary.shape

    # Kernel length is 1/scale of the image size. max(1, ...) keeps the
    # structuring element valid for images smaller than `scale` pixels.
    scale = 50

    # Wide, flat kernel -> horizontal lines survive the opening.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, cols // scale), 1))
    eroded = cv2.erode(binary, kernel, iterations=1)
    horizontal_lines = cv2.dilate(eroded, kernel, iterations=1)

    # Tall, thin kernel -> vertical lines survive the opening.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, rows // scale)))
    eroded = cv2.erode(binary, kernel, iterations=1)
    vertical_lines = cv2.dilate(eroded, kernel, iterations=1)

    # AND keeps only the crossings; cv2.add(horizontal_lines, vertical_lines)
    # would give the union of both line masks instead.
    result = cv2.bitwise_and(horizontal_lines, vertical_lines)
    return result

def distance(p1, p2):
    """Return the Euclidean distance between the 2-D points ``p1`` and ``p2``."""
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    squared = delta_x**2 + delta_y**2
    return sqrt(squared)

def draw_line(path, lines):
    """Open the image at ``path`` and draw every segment of ``lines`` on it.

    Each element of ``lines`` supplies a start point at index 0 and an end
    point at index 1; the segment between them is drawn in red, 4 pixels wide.
    The image object is returned (it is not saved here).
    """
    canvas = Image.open(path)
    pen = ImageDraw.Draw(canvas)
    for segment in lines:
        start, end = segment[0], segment[1]
        pen.line([start, end], fill='red', width=4)
    return canvas

def _ocr_text_lines(result):
    """Yield ``[box, (text, score)]`` entries from a PaddleOCR result.

    Handles both the flat layout (a list of text lines) and the nested layout
    (one list of text lines per image, where a slot may be ``None`` when
    nothing was recognised).
    """
    for entry in result or []:
        if not entry:
            continue
        is_text_line = (
            len(entry) == 2
            and isinstance(entry[1], (tuple, list))
            and len(entry[1]) == 2
            and isinstance(entry[1][0], str)
        )
        if is_text_line:
            yield entry
        else:
            # Nested layout: `entry` is the list of text lines of one image.
            for text_line in entry:
                if text_line:
                    yield text_line


def ocr_sub(path, crop_lines):
    """Run OCR on rectangular crops of an image and collect the text per crop.

    Args:
        path: path of the image file to read.
        crop_lines: iterable of ``[top_left, bottom_right]`` pairs, each point
            given as ``(x, y)``.

    Returns:
        list[list[str]]: for every crop, the recognised strings in the order
        the OCR engine reported them. A crop with no pixels or no recognised
        text contributes an empty list.

    Raises:
        OSError: if ``cv2.imread`` cannot load ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread returns None on failure instead of raising.
        raise OSError('Cannot read image: ' + str(path))
    table = []
    for line in crop_lines:
        # Rows are indexed by y, columns by x.
        sub = image[line[0][1]:line[1][1], line[0][0]:line[1][0]]
        text = []
        # A degenerate rectangle slices to an empty array; skip the engine.
        if sub.size > 0:
            for bound in _ocr_text_lines(ocr.ocr(sub)):
                text.append(bound[1][0])
        table.append(text)
        print(text)
    return table

def write_xlsx(path, table):
    """Write ``table`` to ``<output_path>/<image stem>.xlsx``, one list per column.

    Every inner list of ``table`` becomes one worksheet column, with its items
    written top to bottom starting at row 0. A column holding a single item is
    merged (centred) over the height of the tallest column.

    Args:
        path: path of the source image; only its base name without the final
            extension is used to name the workbook.
        table: list of columns, each a list of cell values.
    """
    # splitext drops only the last extension, so "scan.2021.png" keeps its
    # full stem instead of collapsing to "scan".
    img_name = os.path.splitext(os.path.basename(path))[0]
    # default=0 keeps an empty table from raising before the file is written.
    max_col_len = max((len(col) for col in table), default=0)

    # The context manager closes (and thereby saves) the workbook even if a
    # write fails part-way through.
    with xlsxwriter.Workbook(os.path.join(output_path, img_name + '.xlsx')) as workbook:
        worksheet = workbook.add_worksheet()
        merge_format = workbook.add_format({'align': 'center'})
        for column, col in enumerate(table):
            if len(col) == 1 and max_col_len > 1:
                # Rows are zero-based: the tallest column ends at row
                # max_col_len - 1, so the merge must end there too.
                worksheet.merge_range(0, column, max_col_len - 1, column, col[0], merge_format)
            else:
                for row, item in enumerate(col):
                    worksheet.write(row, column, item)

### Recognizing table directly using API

In [None]:
# Run the table-structure engine on the first listed image and store its
# result in a sub-folder of output_path named after the image.
for img_name in input_images_name[:1]:
    path = input_path + img_name
    # splitext removes only the final extension, so dotted file names keep
    # their full stem and do not collide with each other.
    subfolder = os.path.splitext(os.path.basename(path))[0]
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None on failure; stop with the offending path
        # instead of passing None on to the engine.
        raise OSError('Cannot read image: ' + path)
    result = table_engine(img)
    save_structure_res(result, output_path, subfolder)

### Recognizing type-3 (Fapiao) table

In [None]:
# Tunables for the cell search below.
ANCHOR_MIN_DISTANCE = 50  # intersection pixels closer than this share one anchor
ROW_EPS = 50              # DBSCAN eps applied to the anchors' y coordinates
ROW_MIN_SAMPLES = 5       # DBSCAN min_samples
COLUMN_TOLERANCE = 25     # max x offset for anchors of adjacent rows to align

for img_name in input_images_name[-1:]:
    path = input_path + img_name
    intersection = line_detection(path)
    ys, xs = np.where(intersection > 0)
    points = list(zip(xs, ys))
    if not points:
        # Nothing to anchor on; points[0] below would raise IndexError.
        print('No line intersections found in', path)
        continue

    # Thin the intersection pixels: a point becomes a new anchor only if it is
    # at least ANCHOR_MIN_DISTANCE away from every anchor kept so far.
    # all() stops at the first anchor that is too close.
    anchors = [points[0]]
    for point in points[1:]:
        if all(distance(anchor, point) >= ANCHOR_MIN_DISTANCE for anchor in anchors):
            anchors.append(point)
    #print(anchors)

    # Group anchors by their y coordinate; DBSCAN marks unclustered anchors
    # with label -1 and those are dropped.
    ys = np.array([point[1] for point in anchors]).reshape(-1, 1)
    clustering = DBSCAN(eps=ROW_EPS, min_samples=ROW_MIN_SAMPLES).fit(ys)
    #print(clustering.labels_)
    labels = list(clustering.labels_)
    clusters = {}
    for label, anchor in zip(labels, anchors):
        if label >= 0:
            clusters.setdefault(label, []).append(anchor)

    # Order each group left to right.
    for line, cluster in enumerate(clusters.values()):
        cluster.sort(key=lambda tup: tup[0])
        print('Line ' + str(line) + ':', cluster)
    #print(clusters)

    # For consecutive groups i and i+1 (presumably neighbouring table rows,
    # since anchors are visited top to bottom -- verify on new inputs), pair
    # two anchors of group i with two anchors of group i+1 whose x coordinates
    # line up. The crop rectangle runs from the first anchor of the upper pair
    # to the second anchor of the lower pair.
    crop_point = []
    for i in range(len(clusters)-1):
        comb1 = list(combinations(clusters[i], 2))
        comb2 = list(combinations(clusters[i+1], 2))
        # Start point of the most recent match. None (rather than a
        # coordinate such as (0, 0)) can never equal a real anchor.
        found = None
        for c1 in comb1:
            if found == c1[0]:
                # This start point already produced a rectangle.
                continue
            for c2 in comb2:
                p1, p2, p3, p4 = c1[0], c1[1], c2[0], c2[1]
                if abs(p1[0]-p3[0]) < COLUMN_TOLERANCE and abs(p2[0]-p4[0]) < COLUMN_TOLERANCE:
                    crop_point.append([p1, p4])
                    found = p1
                    break
    #print(crop_point)
    for pair in crop_point:
        print(pair)
    #print(len(crop_point))
    if not crop_point:
        # No rectangle means no table to OCR or export.
        print('No table cells found in', path)
        continue

    table = ocr_sub(path, crop_point)
    write_xlsx(path, table)
    draw_line(path, crop_point).save(output_path + img_name)
    print(path)

### Visualizing table cell detection

In [None]:
# Save the mask returned by line_detection for the last listed image under
# the image's own file name in output_path.
for img_name in input_images_name[-1:]:
    path = input_path + img_name
    borders = line_detection(path)
    result = borders
    target = output_path + img_name
    # cv2.imwrite signals failure (e.g. a missing output directory) through
    # its return value only, so check it rather than silently producing no file.
    if not cv2.imwrite(target, result):
        raise OSError('Could not write image: ' + target)
    print(path)