In [62]:
import fitz
from PIL import Image
from numpy import asarray
import numpy as np
import cv2
import os
import pytesseract

In [21]:
def pdf_to_img(filepath, filename, page_index=1):
    """Render one page of a PDF to a PNG file under the ``imgs`` folder.

    Parameters
    ----------
    filepath : str
        Path of the PDF document to open.
    filename : str
        Name of the PDF file. Its last four characters (expected to be the
        ``.pdf`` extension) are replaced by ``.png`` to build the output name.
    page_index : int, optional
        Index handed to ``load_page``. PyMuPDF page indices are 0-based, so
        the default of 1 (kept for backward compatibility) renders the
        *second* page of the document; pass 0 for the first page.

    Returns
    -------
    None
        The rendered page is written to ``imgs\\<name>.png``.
    """
    output = filename[:-4] + ".png"
    # The context manager closes the document even when rendering or saving
    # fails; previously the handle was left open.
    with fitz.open(filepath) as doc:
        page = doc.load_page(page_index)
        pix = page.get_pixmap()
        pix.save("imgs\\" + output)

In [24]:
def img_to_bw(filepath, filename, thresh=128):
    """Binarise an image and write it to the ``bw_imgs`` folder.

    The image is read as grayscale and thresholded with
    ``cv2.THRESH_BINARY`` using 255 as the maximum value.

    Parameters
    ----------
    filepath : str
        Path of the image to read.
    filename : str
        Name of the image file. Its last four characters (expected to be the
        extension) are replaced by ``_bw.png`` to build the output name.
    thresh : int, optional
        Threshold passed to ``cv2.threshold``. Defaults to 128.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read ``filepath`` (``cv2.imread`` returned ``None``).
    OSError
        If OpenCV reports that the output image was not written.
    """
    img_grey = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising.
    if img_grey is None:
        raise FileNotFoundError("could not read image: " + filepath)

    img_binary = cv2.threshold(img_grey, thresh, 255, cv2.THRESH_BINARY)[1]

    out_path = "bw_imgs\\" + filename[:-4] + '_bw.png'
    # cv2.imwrite returns False on failure (e.g. missing output folder).
    if not cv2.imwrite(out_path, img_binary):
        raise OSError("could not write image: " + out_path)

In [51]:
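# Returns the indices of the dark lines in a black-and-white image:
# y-values of horizontal lines, or x-values of vertical lines.
# Input is a b/w image, e.g. img_bw = Image.open("bw.png")
# line_axis: 0 = horizontal lines, 1 = vertical lines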
def identify_line(img_bw, threshold, line_axis=0):
    """Return the indices of rows or columns whose pixel sum is below a threshold.

    Parameters
    ----------
    img_bw : array-like
        Image data; converted with ``numpy.asarray``.
        Presumably a 2-D black-and-white image -- confirm against callers.
    threshold : number
        A row/column is reported when the sum of its values is strictly
        less than this.
    line_axis : int, optional
        0 sums along each row (horizontal lines); any other value sums along
        each row of the transposed array (vertical lines).

    Returns
    -------
    list
        Matching indices, in ascending order.
    """
    pixels = np.asarray(img_bw)

    # Orient the data so that every candidate line is a row.
    oriented = pixels if line_axis == 0 else pixels.T

    brightness = oriented.sum(axis=1)
    is_dark = brightness < threshold
    return list(np.nonzero(is_dark)[0])

In [57]:
# given two x or y values, crop the image between them, keeping the full extent of the other axis
# e.g. slice_img(img, 0, height // 2, line_axis=0) crops the image and returns its top half
# crops from one line to another (a -> b)
def slice_img(img_bw, a, b, line_axis=0, show=False):
    """Crop an image between two coordinates along one axis.

    Parameters
    ----------
    img_bw : image object
        Must expose ``size`` as ``(width, height)`` and a ``crop(box)`` method.
    a, b : int
        Start and end coordinate of the crop along the chosen axis.
    line_axis : int, optional
        0 crops between y = ``a`` and y = ``b`` across the full width; any
        other value crops between x = ``a`` and x = ``b`` across the full
        height.
    show : bool, optional
        When true, ``show()`` is called on the cropped image.

    Returns
    -------
    The object returned by ``img_bw.crop``.
    """
    full_width, full_height = img_bw.size

    # Box layout is (left, upper, right, lower).
    box = (0, a, full_width, b) if line_axis == 0 else (a, 0, b, full_height)

    region = img_bw.crop(box)
    if show:
        region.show()
    return region

In [49]:
# finds the horizontal and vertical lines in pdf file
def pdf_to_lines(inp_filename, sensitivity_h=50000, sensitivity_v=100000):
    """Find the horizontal and vertical lines in a PDF page.

    The PDF ``pdfs\\<inp_filename>`` is rendered to ``imgs\\<name>.png`` by
    ``pdf_to_img``, binarised to ``bw_imgs\\<name>_bw.png`` by ``img_to_bw``,
    and the binarised image is scanned with ``identify_line``.

    Parameters
    ----------
    inp_filename : str
        File name of the PDF inside the ``pdfs`` folder. Its last four
        characters (expected to be ``.pdf``) are stripped to build the
        intermediate image names.
    sensitivity_h : int, optional
        Threshold for horizontal lines: a row is reported when the sum of its
        pixel values is below this. Defaults to 50000.
    sensitivity_v : int, optional
        Threshold for vertical lines: a column is reported when the sum of
        its pixel values is below this. Defaults to 100000.

    Returns
    -------
    list
        ``[h_lines, v_lines]`` -- the row indices and column indices found.
    """
    stem = inp_filename[:-4]

    pdf_to_img("pdfs\\" + inp_filename, inp_filename)

    img_filename = stem + ".png"
    img_to_bw("imgs\\" + img_filename, img_filename)

    # For scale: in the binarised image a completely white row sums to
    # width * 255 and a completely white column to height * 255; a line is
    # any row/column whose sum falls below the sensitivity.
    # The context manager releases the file handle once both scans are done.
    with Image.open("bw_imgs\\" + stem + "_bw.png") as inp_bw_img:
        h_lines = identify_line(inp_bw_img, sensitivity_h, line_axis=0)
        v_lines = identify_line(inp_bw_img, sensitivity_v, line_axis=1)
    return [h_lines, v_lines]

In [54]:
# Detect the lines in the sample PDF: index 0 holds the horizontal lines,
# index 1 the vertical lines.
line_indices = pdf_to_lines("test_inp.pdf")
print(line_indices[0], line_indices[1], sep="\n")

[137, 193, 239, 353, 483, 511, 540, 568, 624, 681, 738, 766, 767]
[269, 447]


In [60]:
# Cut the strip between the first two detected horizontal lines out of the
# rendered page image and display it.
orig_img = Image.open("imgs\\test_inp.png")
h_slice = slice_img(
    orig_img,
    a=line_indices[0][0],
    b=line_indices[0][1],
    line_axis=0,
    show=True,
)
# v_slice = slice_img(h_slice, 0, line_indices[1][0], line_axis=1, show=True)

In [None]:
# OCR demo: read an image, pre-process it and run Tesseract on the result.

# Image to OCR. The original placeholder was an empty path, which cv2.imread
# cannot read; this points at the page image rendered earlier in the notebook.
img_path = "imgs\\test_inp.png"

# Pre-processing applied before OCR: "thresh" (Otsu binarisation) or "blur"
# (3x3 median blur). Any other value leaves the grayscale image untouched.
# `args` was previously referenced without ever being defined.
args = {"pre_processor": "thresh"}

images = cv2.imread(img_path)
# cv2.imread signals failure by returning None instead of raising.
if images is None:
    raise FileNotFoundError("could not read image: " + img_path)

# convert to grayscale image
gray = cv2.cvtColor(images, cv2.COLOR_BGR2GRAY)

# Apply the selected pre-processing. Both OpenCV calls return a new image,
# so the result has to be assigned back (it was previously discarded).
if args["pre_processor"] == "thresh":
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif args["pre_processor"] == "blur":
    gray = cv2.medianBlur(gray, 3)

# Hand the pixels to Tesseract in memory instead of round-tripping through a
# lossy temporary JPEG that also had to be cleaned up afterwards.
text = pytesseract.image_to_string(Image.fromarray(gray))
print(text)

# show the input and the pre-processed image; waits for a key press and then
# closes the windows so they do not linger after the cell finishes
cv2.imshow("Image Input", images)
cv2.imshow("Output In Grayscale", gray)
cv2.waitKey(0)
cv2.destroyAllWindows()
