In [1]:
import pytesseract as pt
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
from pdf2image import convert_from_path

In [2]:
def area(approx):
    """Return the area of the axis-aligned box that encloses a contour polygon.

    `approx` is indexed as ``approx[:, 0, 0]`` (x) and ``approx[:, 0, 1]`` (y),
    i.e. an (N, 1, 2) point array like the one ``cv2.approxPolyDP`` yields in
    this file. Note this is the bounding-box area, not the polygon's own area.
    """
    xs, ys = approx[:, 0, 0], approx[:, 0, 1]
    width = xs.max() - xs.min()
    height = ys.max() - ys.min()
    return width * height

def start_point(approx):
    """Return ``(smallest x, largest y)`` over the points of an (N, 1, 2) contour array."""
    points = approx[:, 0, :]
    return (points[:, 0].min(), points[:, 1].max())

def end_point(approx):
    """Return ``(largest x, smallest y)`` over the points of an (N, 1, 2) contour array."""
    points = approx[:, 0, :]
    return (points[:, 0].max(), points[:, 1].min())

def get_points(approx):
    """Return the extents ``(min_x, max_x, min_y, max_y)`` of an (N, 1, 2) contour array."""
    points = approx[:, 0, :]
    # Column 0 holds x, column 1 holds y; reduce both columns in one pass each.
    lowest = points.min(axis=0)
    highest = points.max(axis=0)
    return (lowest[0], highest[0], lowest[1], highest[1])

def get_all_points(image, min_area=5000, epsilon_ratio=0.05):
    """Find large four-cornered outlines in an image and return their extents.

    The image is converted to grayscale, blurred, run through Canny edge
    detection, and the external contours of the edge map are simplified with
    ``cv2.approxPolyDP``. A contour is kept when its simplified polygon has
    exactly four vertices and its bounding-box area exceeds `min_area`.

    Args:
        image: 3-channel image array (converted with ``cv2.COLOR_BGR2GRAY``).
        min_area: Bounding-box area (in pixels) a candidate must exceed.
            Defaults to the previously hard-coded 5000.
        epsilon_ratio: Fraction of the contour perimeter used as the
            ``approxPolyDP`` tolerance. Defaults to the previously hard-coded 0.05.

    Returns:
        List of ``(min_x, max_x, min_y, max_y)`` tuples, one per kept contour,
        in the order ``cv2.findContours`` reported them.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    coordinates = []
    for contour in contours:
        # Tolerance scales with the perimeter so large and small shapes are
        # simplified to the same relative precision.
        epsilon = epsilon_ratio * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        if len(approx) == 4 and area(approx) > min_area:
            coordinates.append(get_points(approx))
    return coordinates

def get_tables(image):
    """Crop every region reported by ``get_all_points`` out of `image`.

    Returns a list of array slices of `image`, one per detected region, in
    detection order.
    """
    return [
        image[top:bottom, left:right]
        for left, right, top, bottom in get_all_points(image)
    ]

def get_image_with_table(img):
    """Return a copy of `img` with every detected table region outlined.

    Detection is delegated to ``get_all_points`` so the preview always matches
    what ``get_tables`` would crop. The rectangles are drawn on a copy; the
    caller's array is left untouched (previously `img` itself was drawn on,
    because ``image = img`` only aliased it).

    Args:
        img: 3-channel image array.

    Returns:
        A new array of the same shape with a rectangle of colour
        ``(0, 255, 255)`` and thickness 5 around each detected region.
    """
    image = img.copy()
    # Detect first, draw afterwards, so drawn rectangles can never feed back
    # into the edge detection.
    for min_x, max_x, min_y, max_y in get_all_points(image):
        # Plain ints keep cv2.rectangle independent of the numpy scalar type.
        cv2.rectangle(
            image,
            (int(min_x), int(max_y)),
            (int(max_x), int(min_y)),
            (0, 255, 255),
            5,
        )
    return image

In [3]:
# Sorted relative paths of every entry found in the Punjabi books directory.
books_punjabi = sorted(
    f'data_books/punjabi/{entry}' for entry in os.listdir('data_books/punjabi/')
)

In [13]:
# Rasterize the book at index 4 of the sorted `books_punjabi` list at 300 DPI.
# NOTE(review): the index is hard-coded, so which file is picked depends on the
# current contents of the books directory.
images = convert_from_path(books_punjabi[4],dpi=300)

In [14]:
# Detect and crop the table regions of every page: tables[i] is the list of
# crops found on page i (possibly empty).
tables = list(map(lambda page: get_tables(np.array(page)), images))

In [None]:
# Load the two sample table images from the working directory.
table_small, table_large = (
    cv2.imread(path) for path in ('table-small-2.png', 'table-large-2.png')
)

In [None]:
# Show the `ass` image on a 15x15 figure.
# NOTE(review): `ass` is only assigned by a cell further down this notebook, so
# running the cells top to bottom on a fresh kernel raises NameError here.
plt.figure(figsize=(15,15))
plt.imshow(ass)

In [4]:
def get_cells_coordinates(image):
    """Find the bounding boxes of the regions delimited by a table's ruling lines.

    The image is binarised with Otsu's threshold and inverted, long vertical
    and horizontal strokes are isolated by eroding and then dilating with thin
    line-shaped kernels, and the two stroke images are blended into one line
    mask. The contours of that (re-inverted, re-thresholded) mask are measured
    with ``cv2.boundingRect``.

    Args:
        image: 3-channel image of a single table (converted with
            ``cv2.COLOR_BGR2GRAY``).

    Returns:
        Tuple of ``(x, y, w, h)`` boxes ordered by ascending ``y``. An empty
        tuple is returned when no contour is found (previously that case
        raised ValueError from unpacking an empty ``zip``).
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_OTSU)
    binary = cv2.bitwise_not(binary)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    # Both line kernels are sized from the image width (width // 150).
    line_length = binary.shape[1] // 150
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_length))
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_length, 1))

    # Erode-then-dilate keeps only strokes at least as long as the kernel.
    vertical_lines = cv2.erode(binary, vertical_kernel, iterations=5)
    vertical_lines = cv2.dilate(vertical_lines, vertical_kernel, iterations=5)

    horizontal_lines = cv2.erode(binary, hor_kernel, iterations=30)
    horizontal_lines = cv2.dilate(horizontal_lines, hor_kernel, iterations=30)

    line_mask = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    # Invert so the lines become dark, then erode to thicken them.
    line_mask = cv2.erode(~line_mask, kernel, iterations=3)
    _, line_mask = cv2.threshold(line_mask, 128, 255, cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(line_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    boxes = [cv2.boundingRect(contour) for contour in contours]
    # sorted() is stable, so boxes sharing a y keep their contour order.
    return tuple(sorted(boxes, key=lambda box: box[1]))

def get_text_csv(bounding_boxes,image):
    """OCR every cell of a table and return the text as comma-separated rows.

    The distinct x / y start coordinates of the boxes define the grid columns
    and rows. Each box is written into every grid slot between the index of
    its start coordinate and the index of its end coordinate, so a box that
    spans several slots fills all of them; boxes later in `bounding_boxes`
    overwrite earlier ones. Every slot is then OCR'd with Tesseract
    (``--psm 6``, language ``pan``) and newlines, form feeds and underscores
    are stripped from the result.

    Args:
        bounding_boxes: Sequence of ``(x, y, w, h)`` boxes, e.g. the result of
            ``get_cells_coordinates``.
        image: Image array the boxes refer to.

    Returns:
        One line per grid row; every field (including the last) is followed by
        a comma and every row by a newline. Slots no box covers become empty
        fields (previously they aborted the whole table with ValueError), and
        an empty `bounding_boxes` yields ``''`` (previously IndexError).

    NOTE(review): field text is not quoted or escaped, so a comma recognised
    inside a cell shifts the columns of that row.
    """
    if len(bounding_boxes) == 0:
        return ''

    boxes = np.array(bounding_boxes)
    xs, ys = boxes[:, 0], boxes[:, 1]
    # np.unique already returns its values in ascending order.
    rows_start_points = np.unique(ys).tolist()
    cols_start_points = np.unique(xs).tolist()
    rows_end_points = np.unique(ys + boxes[:, 3]).tolist()
    cols_end_points = np.unique(xs + boxes[:, 2]).tolist()

    # None marks a grid slot that no box has claimed (yet).
    table = [[None] * len(cols_start_points) for _ in rows_start_points]

    for box in bounding_boxes:
        col_start, row_start, w, h = box
        first_row = rows_start_points.index(row_start)
        last_row = rows_end_points.index(row_start + h)
        first_col = cols_start_points.index(col_start)
        last_col = cols_end_points.index(col_start + w)
        for i in range(first_row, last_row + 1):
            for j in range(first_col, last_col + 1):
                table[i][j] = box

    # A box spanning several slots is recognised once and the text reused.
    ocr_cache = {}
    lines = []
    for row in table:
        fields = []
        for cell in row:
            if cell is None:
                fields.append('')
                continue
            x, y, w, h = (int(value) for value in cell)
            key = (x, y, w, h)
            if key not in ocr_cache:
                raw_text = pt.image_to_string(image[y:y+h,x:x+w],config='--psm 6',
                                              lang='pan')
                ocr_cache[key] = raw_text.replace('\n','').replace('\x0c','').replace('_','')
            fields.append(ocr_cache[key])
        lines.append(''.join(field + ',' for field in fields) + '\n')
    return ''.join(lines)

In [None]:
# Extract the cell bounding boxes of the large sample table.
bounding_boxes = get_cells_coordinates(table_large)

In [5]:
def display_cell(image,bounding_box):
    """Show one cell of `image` and print the text Tesseract reads from it.

    `bounding_box` is ``(x, y, w, h)``. The crop is OCR'd with
    ``--psm 6 --oem 3`` and language ``pan``. Nothing is returned.
    """
    left, top, width, height = bounding_box
    cell = image[top:top + height, left:left + width]
    plt.imshow(cell)
    recognised = pt.image_to_string(cell, config='--psm 6 --oem 3', lang='pan')
    print(recognised)

In [None]:
# Save the first table crop of page index 210 to disk as a PNG.
# NOTE(review): both indices are hard-coded and assume that page has at least one crop.
plt.imsave('table-1cell.png',tables[210][0])

In [None]:
# Read the saved table crop back in with OpenCV.
ass = cv2.imread('table-1cell.png')

In [None]:
# Detect the cell boxes of the saved crop and OCR them into comma-separated rows.
# The extraction function defined in this notebook is `get_text_csv`; the
# previous call to `get_text_tsv` raised NameError.
bounding_boxes = get_cells_coordinates(ass)
ocr = get_text_csv(bounding_boxes,ass)

In [10]:
# OCR every table crop cell by cell and collect the comma-separated text,
# grouped under a `page-<index>` header per page.
all_text = ''
skipped = []  # (page index, error) for every crop that could not be processed
for idx,table in enumerate(tables):
    all_text = all_text + f'page-{idx}\n'
    for curr in table:
        try:
            bounding_boxes = get_cells_coordinates(curr)
            ocr = get_text_csv(bounding_boxes,curr)
        except Exception as exc:
            # Best effort: a crop that is not a well-formed table is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop this long-running loop.
            skipped.append((idx, repr(exc)))
            continue
        all_text = all_text + ocr + '\n'
    all_text = all_text + '\n\n\n\n'

if skipped:
    print(f"{len(skipped)} table crop(s) skipped; first on page-{skipped[0][0]}: {skipped[0][1]}")

In [12]:
# Write the collected table text to disk. The context manager closes the file
# even if the write fails, and UTF-8 is set explicitly so the Punjabi text does
# not depend on the platform's default encoding.
output_file = 'data_books/ocr_punjabi/pp_kharifs_tables.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(all_text)

In [18]:
# OCR every table crop as a single block of text (no cell segmentation),
# grouped under a `page-<index>` header per page.
all_text = ''
skipped = []  # (page index, error) for every crop that could not be processed
for idx,table in enumerate(tables):
    all_text = all_text + f'page-{idx}\n'
    for curr in table:
        try:
            ocr = pt.image_to_string(curr,config='--psm 6',lang='pan')
        except Exception as exc:
            # Best effort: a crop Tesseract cannot process is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop this long-running loop.
            skipped.append((idx, repr(exc)))
            continue
        all_text = all_text + ocr + '\n'
    all_text = all_text + '\n\n\n\n'

if skipped:
    print(f"{len(skipped)} table crop(s) skipped; first on page-{skipped[0][0]}: {skipped[0][1]}")

In [20]:
# Write the collected block-level OCR text to disk. The context manager closes
# the file even if the write fails, and UTF-8 is set explicitly so the Punjabi
# text does not depend on the platform's default encoding.
output_file = 'data_books/ocr_punjabi/pp_kharifs_notes.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(all_text)

In [21]:
# Clear all user-defined names from the kernel namespace (asks for confirmation first).
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
