In [34]:
import sys
import math
import cv2
import numpy as np
import glob
import os
from PIL import Image, ImageEnhance
from prettyprinter import pprint

# os.environ['TESSERACT'] = "C:\\Program Files (x86)\\tesseract\\bin"
# os.environ['TESSDATA_PREFIX'] = "C:\\Program Files (x86)\\tesseract\\share\\tessdata" 

class Delimiter:
    def __init__(self, page=None, path=None) -> None:
        self.page = page 
        self.path = path 

    def is_vertical(self, line):
        return line[0]==line[2]

    def is_horizontal(self, line):
        return line[1]==line[3]
        
    def overlapping_filter(self, lines, sorting_index):
        filtered_lines = []
        lines = sorted(lines, key=lambda lines: lines[sorting_index])
        for i in range(len(lines)):
                l_curr = lines[i]
                if(i>0):
                    l_prev = lines[i-1]
                    if ( (l_curr[sorting_index] - l_prev[sorting_index]) > 5):
                        filtered_lines.append(l_curr)
                else:
                    filtered_lines.append(l_curr)    
        return filtered_lines
                
    def detect_lines(self, image, title='default_hover', rho = 1, theta = np.pi/180, threshold = 50, minLinLength = 290, maxLineGap = 6, display = False, write = False):
        # Check if image is loaded fine
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        if gray is None:
            print ('Error opening image!')
            return -1
        dst = cv2.Canny(gray, 50, 150, None, 3)
        cImage = np.copy(image)
        linesP = cv2.HoughLinesP(dst, rho , theta, threshold, None, minLinLength, maxLineGap)
        horizontal_lines = []
        vertical_lines = []
        coords = []
        if linesP is not None:
            #for i in range(40, nb_lines):
            for i in range(0, len(linesP)):
                l = linesP[i][0]
                if (self.is_vertical(l)):
                    vertical_lines.append(l)
                elif (self.is_horizontal(l)):
                    horizontal_lines.append(l)
            horizontal_lines = self.overlapping_filter(horizontal_lines, 1)
            vertical_lines = self.overlapping_filter(vertical_lines, 0)
        if (display):
            # coords = []
            for i, line in enumerate(horizontal_lines):
                coords.append([line[0], line[1], line[2], line[3]])
                # print("Horizontal --> ", line)
                cv2.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,0,0), 3, cv2.LINE_AA)
            for i, line in enumerate(vertical_lines):
                cv2.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,0,0), 3, cv2.LINE_AA)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
        if (write):
            # print("DONE")
            cv2.imwrite(title + ".png", cImage)
        return (horizontal_lines, vertical_lines, coords)

    def get_cropped_image(self, image, x, y, w, h):
        cropped_image = image[ y:y+h , x:x+w ]
        return cropped_image
        
    def get_ROI(self, image, horizontal, vertical, left_line_index, right_line_index, top_line_index, bottom_line_index, offset=4):
        x1 = vertical[left_line_index][2] + offset
        y1 = horizontal[top_line_index][3] + offset
        x2 = vertical[right_line_index][2] - offset
        y2 = horizontal[bottom_line_index][3] - offset
        
        w = x2 - x1
        h = y2 - y1
        
        cropped_image = self.get_cropped_image(image, x1, y1, w, h)
        return cropped_image, (x1, y1, w, h)
    
    def check_keyword(self, line):
        keywords = ['description', 'quantity', 'notes', 'rate', 'hours', 'activity', 'time']
        if 'quantity' in line.lower():
            if 'subtotal' in line.lower():
                return False
            if line.split()[0].lower().strip() == 'quantity total':
                return False
        c = 0
        for w in keywords:
            if w in line.lower():
                # print(w)
                c += 1
        if c>1:
            return True 
        return False 

    def get_table(self, image):
        # img_double_final = cv2.imread(image)
        ih, iw, _ = cv2.imread(image).shape
        default_file = self.page 
        fileName = os.path.split(self.page)[-1].split(".jpg")[0]
        # final_image = self.page 
        img_double_final = cv2.imread(default_file)
        ih, iw, _ = img_double_final.shape
        image_name = os.path.split(image)[-1]
        img_final_copy = np.copy(img_double_final)


        
        # # img = cv22.imread("C:\\Users\\Lenovo\\Downloads\\lines.PNG")
        # gray = cv2.cvtColor(img_final_copy, cv2.COLOR_BGR2GRAY)
        # lsd = cv2.createLineSegmentDetector(0)
        # dlines = lsd.detect(gray)
        # for dline in dlines[0]:
        #     x0 = int(round(dline[0][0]))
        #     y0 = int(round(dline[0][1]))
        #     x1 = int(round(dline[0][2]))
        #     y1 = int(round(dline[0][3]))
        #     cv2.line(img_final_copy, (x0, y0), (x1,y1), 0, 1, cv2.LINE_AA)
        #     # print line segment length
        #     a = (x0-x1) * (x0-x1)
        #     b = (y0-y1) * (y0-y1)
        #     c = a + b
        #     #print(math.sqrt(c))
        # cv2.imwrite('lines.png', img_final_copy)



        horizontal, vertical, coords = self.detect_lines(img_final_copy, display=True, write=False)
        max_x2 = -1
        min_x1 = 99999
        # find min x1 and max x2
        for i in range(len(coords)):
            if coords[i][2] > max_x2:
                max_x2 = coords[i][2]
            if coords[i][0] < min_x1:
                min_x1 = coords[i][0]
        # replace x1 and x2 values with min x1 and max x2 for all rows
        for i in range(len(coords)):
            coords[i][2] = max_x2
            coords[i][0] = min_x1

        pprint(coords)
        ## save coordinates of all horizontal lines

        # text_filename = image_name.split('.jpg')[0]+'_coords.txt'
        # text_file = "..\\horizontal_coordinates\\"+text_filename
        # with open(text_file, "w") as txt_file:
        #     for line in coords:
        #         txt_file.write(' '.join(str(v) for v in line) + "\n")

        ## save images with horizontal lines
        for i in range(len(coords)):
            cv2.line(img_double_final, (coords[i][0], coords[i][1]), (coords[i][2], coords[i][3]), (0,0,0), 3, cv2.LINE_AA)
            if i < len(coords)-1:
                row = [coords[i][0], coords[i][1], coords[i+1][2], coords[i+1][3]]
                roi = img_double_final[row[1]:row[3], row[0]:row[2]]
        command = 'tesseract {} testbox batch.nochop makebox'.format(self.page)
        os.system(command)
        # extract box coordinates
        with open('testbox.box', encoding="utf8") as fp:
            text = fp.read()
        # save character coordinates from box file
        ch_coords = []
        for b in text.splitlines():
            b = b.split(' ')
            ch_coords.append(b)
        # extract line coordinates
        # with open(text_file, encoding="utf8") as fp:
        #     text = fp.read()
        # # save line coords from text file
        # coords = []
        # for b in text.splitlines():
        #     b = b.split(' ')
        #     coords.append(b)
        # save row coordinates from line coordinates
        rows = []
        for i in range(len(coords)-1):
            roi = img_double_final[int(coords[i][1]):int(coords[i+1][3]), int(coords[i][0]):int(coords[i+1][2])]
            rows.append([coords[i][0], coords[i][1], coords[i+1][2], coords[i+1][3]])
        pprint(rows)
        table = []
        ## checking start and end of multiple tables to map row numbers 
        
        # print("\n Number --> ", s1)
        k = 0
        final_rows = []
        for row in rows:
            row_x1 = int(row[0])
            row_y1 = int(row[1])
            row_x2 = int(row[2])
            row_y2 = int(row[3])
            roi = img_double_final[row_y1:row_y2, row_x1:row_x2]
            cv2.imwrite("rows\\row_"+str(k)+".jpg", roi)
            k += 1
        #     print("\n Creating Image Box file...")
        #     imgbox_file = '..\\imagebox\\{}'.format(fileName+f'_t{t}_imagebox{k}')
        #     command = 'tesseract {} {} batch.nochop makebox'.format("..\\rows\\row_"+str(k)+".jpg", imgbox_file)
        #     os.system(command)
        #     if os.stat(imgbox_file+".box").st_size == 0:
        #         # print("hello")
        #         continue
        #     final_rows.append([row_x1, row_y1, row_x2, row_y2])
        #     k += 1
        #     discarded_chars = ['~', '|']
        #     table_row = []
        #     for i in range(len(ch_coords)):
        #             if ch_coords[i][0] in discarded_chars:
        #                 continue
        #             ch_x1 = int(ch_coords[i][1])
        #             ch_y1 = ih-int(ch_coords[i][2])
        #             ch_x2 = int(ch_coords[i][3])
        #             ch_y2 = ih-int(ch_coords[i][4])

        #             if ch_x1 >= row_x1 and ch_y1 >= row_y1 and ch_x2 <= row_x2 and ch_y2 <= row_y2:
        #                 table_row.append([ch_coords[i][0], ch_x1, ch_y1, ch_x2, ch_y2])
        #         table.append(table_row)
        # return table, img_double_final, start, end, final_rows

    # def check_border(self, image):
    #     table, img_double_final, start, end, final_rows = self.get_table(image)
    #     # print("\n TABLE --> ", table)
    #     try:
    #         if len(table)>2:
    #             x1, x2, y1, y2 = table[1][0][1], table[2][-1][3], table[1][0][2], table[2][-1][2]
    #         else:
    #             x1, x2, y1, y2 = table[0][0][1], table[1][-1][3], table[0][0][2], table[1][-1][2]
    #         # print(temp[1], temp[2])
    #         print(x1, x2, y1, y2)
    #         roi = img_double_final[y1:y2, x1:x2]
    #         cv2.imwrite("row.jpg", roi)
    #         h, v, c = self.detect_lines(cv2.imread("row.jpg"),display=True, write=False)
    #         print(h, v, c, "LEN OF HORI --> ", len(h), "IMG --> ", image)
    #         if c:
    #             if len(c)//2 != 1:
    #                 img = add_border.run(image)
    #                 return True, img 
    #         return False, None 
    #     except Exception as err:
    #         print("ERROR --> ", err)
    #         img = add_border.run(image)
    #         return True, img 
    #         # return False, None 
                
    def run(self, argv=[]):
        # res, img  = self.check_border(self.page)
        # print("RES --> ", res)
        # if res and img:
            # self.page = img 
        print("PAGE --> ", self.page)
        # exit()
        table, img_double_final, start, end, final_rows = self.get_table(self.page)
        fileName = os.path.split(self.page)[-1].split(".jpg")[0]
        textFilename = '..\\coords\\{}'.format(fileName+'_coords.txt')
        with open(textFilename, "w") as txt_file:
            for line in table:
                for l in line:
                    txt_file.write(' '.join(str(v) for v in l) + "\n")
        img = cv2.imread(self.page)
        image_name = os.path.split(self.page)[-1]
        k = 0
        filename = image_name.split('.')[0] + "_delimiter.jpg"
        delimiter_img = os.path.join(self.path, filename)
        threshold_value = 36
        for j in range(len(table)):
            sorted_row = sorted(table[j], key=lambda x: x[1])
            for i in range(len(sorted_row)-1):
                if sorted_row[i+1][1] - sorted_row[i][1] > threshold_value:
                    cv2.putText(img, "|", ((sorted_row[i][1]+sorted_row[i+1][1])//2, final_rows[k][1]+30), cv2.FONT_HERSHEY_SIMPLEX,  
                                1, (0, 0, 0), 3, cv2.LINE_AA)
                    cv2.imwrite(delimiter_img, img)
            k += 1
        return delimiter_img

if __name__ == "__main__":
    # obj = Delimiter("../preprocess/Maser_Invoice-Reg2-Conflict-June'16_page0_600_BGR2RGB.jpg")
    for img in glob.glob("C:\\air_ticket\\images2\\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg"):
        obj = Delimiter(img)
        img_final, final_rows, table, start = obj.run()
        # print("\n START --> ", start)
        # print("START 0 --> ", start[0])
        # print(table)
        # if start:
        temp = table[:]
        # print("\n LENGTH --> ", len(temp))
        # print("VALUES --> ", start[0], table[start[0]])
        # print(temp)
        # exit()
        # x1, x2, y1, y2 = final_rows[1][0], final_rows[1][2], final_rows[1][1], final_rows[2][3]
        x1, x2, y1, y2 = temp[-2][0][1], temp[-1][-1][3], temp[-2][0][2], temp[-1][-1][2]
        # print(temp[1], temp[2])
        print(x1, x2, y1, y2)
        roi = img_final[y1:y2, x1:x2]
        # cv2.imshow("roi", roi)
        # cv2.waitKey()
        # print(roi)
        cv2.imwrite("row.jpg", roi)
        h, v, c = obj.detect_lines(cv2.imread("row.jpg"),display=True, write=False)
        print(h, v, c, "LEN OF HORI --> ", len(h), "IMG --> ", img)
        print("START --> ", table[5])



PAGE -->  C:\air_ticket\images2\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg
[
    [990, 1471, 1401, 1471],
    [990, 1712, 1401, 1712],
    [990, 1746, 1401, 1746]
]
[[990, 1471, 1401, 1712], [990, 1712, 1401, 1746]]


TypeError: cannot unpack non-iterable NoneType object

In [14]:
coords = [
    [249, 174, 1432, 174],
    [249, 612, 1432, 612],
    [249, 664, 1432, 664],
    [249, 1061, 1432, 1061],
    [249, 1098, 1432, 1098],
    [249, 1194, 1432, 1194],
    [249, 1229, 1432, 1229],
    [249, 1263, 1432, 1263],
    [249, 1297, 1432, 1297],
    [249, 1469, 1432, 1469],
    [249, 1505, 1432, 1505]
]

img = cv2.imread('C:\\air_ticket\\images2\\113_Yatra_AAAIN233676709_27.09.2022_page0.jpg')

# for coord in coords:
roi = img[174:612, 249:1432]
cv2.imshow('roi', roi)
cv2.waitKey()

-1

In [36]:
for img in glob.glob("C:\\air_ticket\\images2\\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg"):
    obj = Delimiter(img)
    obj.get_table(img)

[
    [990, 1471, 1401, 1471],
    [990, 1712, 1401, 1712],
    [990, 1746, 1401, 1746]
]
[[990, 1471, 1401, 1712], [990, 1712, 1401, 1746]]
