In [22]:
import fitz
import cv2
import numpy as np
import os
import pytesseract

# Empty mapping created up front; none of the cells visible here read or
# write it.
pdf_dict = dict()
# Location of the PDF to process (the literal looks like a placeholder —
# replace it with a real path before running).
file = r"file path"

def get_contours(image, threshold=180, min_size=50, max_area=350000):
    """Return bounding boxes of contours that pass an empirical size filter.

    Parameters
    ----------
    image : numpy.ndarray
        Page image. For a 3-D array only channel 0 is used as the grey
        level; a 2-D array is used as-is.
    threshold : int, optional
        Grey level passed to ``cv2.threshold`` with ``THRESH_BINARY_INV``:
        pixels above it become 0, all others become 255.
    min_size : int, optional
        A box is kept only if both its width and height exceed this value.
    max_area : int, optional
        A box is kept only if ``width * height`` is below this value.

    Returns
    -------
    list of tuple
        ``(x, y, w, h)`` boxes, in the order OpenCV reported the contours.
    """
    # Drop colour information by taking a single channel; grayscale input
    # has no channel axis to index.
    gray_image = image if image.ndim == 2 else image[:, :, 0]

    # (1) inverse binary threshold: dark pixels become foreground (255).
    _, thresh_value = cv2.threshold(
        gray_image, threshold, 255, cv2.THRESH_BINARY_INV
    )

    # (2) one pass of 2x2 dilation so neighbouring foreground pixels merge.
    kernel = np.ones((2, 2), np.uint8)
    dilated_value = cv2.dilate(thresh_value, kernel, iterations=1)

    # (3) find contours. OpenCV 3 returns (image, contours, hierarchy) while
    # OpenCV 2/4 return (contours, hierarchy); the contour list is the
    # second-to-last item in both layouts.
    contours = cv2.findContours(
        dilated_value, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    # (4) keep only boxes that pass the size filter.
    coordinates = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if h > min_size and w > min_size and h * w < max_area:
            coordinates.append((x, y, w, h))
    return coordinates

def sort2(val):
    """Sort-key helper: return the element at index 1 of ``val`` (the y value of an ``(x, y, w, h)`` box)."""
    second = val[1]
    return second

def recognize_table(image, coordinates, row_tolerance=5, lang="rus"):
    """OCR each box of ``image`` and arrange the strings into table rows.

    Parameters
    ----------
    image : numpy.ndarray
        Page image; each box is cropped as ``image[y:y+h, x:x+w]``.
    coordinates : iterable of (x, y, w, h)
        Cell bounding boxes. The input is not modified.
    row_tolerance : int, optional
        A box starts a new row when its ``y`` exceeds the previous box's
        ``y`` (in top-to-bottom order) by more than this many pixels.
    lang : str, optional
        Language code forwarded to ``pytesseract.image_to_string``.

    Returns
    -------
    list of list of str
        One inner list per row (top to bottom), cells ordered left to
        right, with newlines in the OCR output replaced by spaces.
        Empty input yields an empty list.
    """
    # Sort a copy top-to-bottom so the caller's list keeps its order.
    boxes_by_y = sorted(coordinates, key=lambda box: box[1])

    # Group boxes into rows. Each row gets its own list (no aliasing with
    # the result), and the final row is kept because boxes are appended to
    # the row at the moment it is created.
    row_groups = []
    prev_y = None
    for box in boxes_by_y:
        y = box[1]
        if prev_y is None or y > prev_y + row_tolerance:
            row_groups.append([])
        row_groups[-1].append(box)
        prev_y = y

    recognized_table = []
    for group in row_groups:
        row = []
        # Boxes in one row may differ by a few pixels in y, so order them
        # by x explicitly to read the row left to right.
        for x, y, w, h in sorted(group, key=lambda box: box[0]):
            crop_img = image[y:y + h, x:x + w]
            recognized_string = pytesseract.image_to_string(crop_img, lang=lang)
            row.append(recognized_string.replace("\n", " "))
        recognized_table.append(row)
    return recognized_table

In [None]:
# Render every page of the PDF at `file` into a NumPy image array.
pages = []
# Open the document through a context manager so its handle is released
# even if rendering fails part-way through.
# NOTE(review): pageCount / loadPage / getPixmap are the legacy camelCase
# PyMuPDF names; newer releases spell them page_count / load_page /
# get_pixmap — confirm against the installed version.
with fitz.open(file) as doc:
    for n in range(doc.pageCount):
        page = doc.loadPage(n)
        pix = page.getPixmap()
        # View the raw sample bytes as a (height, width, channels) array.
        image = np.frombuffer(pix.samples,
            dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        # Reverse the first three channels (presumably RGB -> BGR for
        # OpenCV — confirm). The fancy index makes a copy, so the stored
        # array does not depend on the pixmap's buffer.
        image = np.ascontiguousarray(image[..., [2, 1, 0]])
        pages.append(image)

In [24]:
# For each rendered page: locate the candidate cell boxes, OCR them, and
# print the resulting table.
for page in pages:
    contours_coords = get_contours(page)
    page_table = recognize_table(page, contours_coords)
    print(page_table)