In [1]:
#libraries to import
import json
import os
import re
import sys

import cv2
import nltk
import numpy as np
import pytesseract
from nltk.corpus import words
from pdf2image import convert_from_path
from PIL import Image

In [None]:
# Fetch only the "words" corpus, which is the only NLTK resource this notebook reads.
# Calling nltk.download() without arguments opens the interactive downloader instead,
# which blocks a "Restart Kernel -> Run All".
nltk.download('words')

In [4]:
# We use the nltk "words" corpus as the reference vocabulary: post_process() looks every
# OCR token up in `dicto` to decide whether it is a real word or garbage.
dicto = words.words()

def post_process(string, vocabulary=None):
    """Blank out the lines of an OCR result that contain too many unknown words.

    The text is split into lines and every line into whitespace-separated tokens.
    Purely alphabetic tokens are looked up in the vocabulary; a token counts as
    garbage when neither the lower-cased token nor the token with its last
    character removed (a cheap way of accepting simple plurals) is a known word.
    A line holding two or more garbage tokens is replaced by an empty line, so the
    number of lines in the result is the same as in the input.

    :param string: text to clean, lines separated by "\\n"
    :param vocabulary: optional collection of known words; when omitted the
        module-level ``dicto`` word list is used
    :return: the text with garbage lines emptied
    :rtype: str
    """
    known_words = dicto if vocabulary is None else vocabulary
    # Membership tests on a list scan the whole list for every token; hashing the
    # vocabulary once per call makes each lookup constant time.
    if not isinstance(known_words, (set, frozenset)):
        known_words = frozenset(known_words)

    lines = string.split("\n")
    for index, line in enumerate(lines):
        garbage_count = 0
        for token in line.split():
            # Numbers, amounts and tokens with punctuation are never judged.
            if not token.isalpha():
                continue
            lowered = token.lower()
            if (lowered not in known_words
                    and token[:-1] not in known_words
                    # lower-cased plural check, so that e.g. "Tables" matches "table"
                    and lowered[:-1] not in known_words):
                garbage_count += 1
        # Empty only this line. A str.replace() on the whole text would also delete
        # the same character sequence wherever it appears inside other lines.
        if garbage_count >= 2:
            lines[index] = ""

    return "\n".join(lines)

In [210]:
#getting the list of all the pdfs
#only *.pdf entries are kept, because every entry is later handed to convert_from_path and its
#last four characters are stripped as the extension; sorting makes the processing order reproducible
list_pdfs = sorted(
    entry for entry in os.listdir('CanDev_Scanned_Documents-master')
    if entry.lower().endswith('.pdf')
)

In [213]:
#takes one pdf at a time from list_pdfs, saves each page as a png file (i.e. an eight page pdf gives 8 png files),
#extracts the horizontal lines of each page and compares their number to a threshold; if it is greater, the text
#of that page is extracted and stored as a JSON file.
#the idea behind this implementation is that pages with tables have a higher number of horizontal lines, so
#counting such lines helps us find the pages with tables.

#make sure the working/output folder exists before anything is written to it
os.makedirs('Shared', exist_ok=True)

for PDF_file in list_pdfs:
    #converts the pdf into one image per page
    pages = convert_from_path('CanDev_Scanned_Documents-master/'+PDF_file, 500, poppler_path=r'poppler\poppler-0.68.0\bin')

    #storing the pages one at a time as png files (page_1.png, page_2.png, ...)
    for page_number, page in enumerate(pages, start=1):
        page.save("Shared/page_"+str(page_number)+".png", 'PNG')

    #taking images one by one and performing operations
    for page_number in range(1, len(pages) + 1):
        filename = "Shared/page_"+str(page_number)+".png"

        #reads the image using OpenCV and turns it into an inverted binary numpy array
        src = cv2.imread(filename, cv2.IMREAD_COLOR)
        gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \
                                        cv2.THRESH_BINARY, 15, -2)
        horizontal = np.copy(bw)
        cols = horizontal.shape[1]
        horizontal_size = cols // 30

        # Create structure element for extracting horizontal lines through morphology operations
        horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))

        # Apply morphology operations (erode then dilate keeps only long horizontal runs)
        horizontal = cv2.erode(horizontal, horizontalStructure)
        horizontal = cv2.dilate(horizontal, horizontalStructure)

        #detecting the horizontal lines that reach the accumulator threshold
        lines = cv2.HoughLinesP(horizontal,rho = 1,theta = 1*np.pi/180,threshold = 1500,minLineLength = 10,maxLineGap = 10)

        #lines is not an np array when no lines were detected; such pages are disregarded, and so are
        #pages whose number of horizontal lines does not exceed the threshold value
        if not isinstance(lines, np.ndarray) or len(lines) <= 25:
            continue

        #getting the text out of the image since it is considered to contain a table;
        #the image file is closed again right away so that the png can be deleted afterwards
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image, config='--psm 6'))
        text = text.replace('|', '')

        #performing some post_processing steps to get rid of garbage words
        text = post_process(text)

        #splitting the string based on new lines
        lst = text.split('\n')

        #removing the first and last items from the list as they mainly contain the page number
        lst = lst[1:-1]

        #considering the first two lines as title of the table
        #(joining a slice instead of indexing keeps very short pages from raising an IndexError)
        title = "".join(lst[:2])

        #the rest of the lines will be considered for the contents of the table
        lst = lst[2:]

        #creating a dictionary and storing the table rows as key value pairs (row#, content of the row);
        #lines of three characters or less are skipped
        a_dict = {}
        a_dict["Title"] = title
        row_count = 0
        for row_text in lst:
            if len(row_text)>3:
                row_count+=1
                key = "row "+str(row_count)
                a_dict[key] = row_text

        #saving the dictionary as a JSON file; it is serialised exactly once, because dumping an already
        #dumped string would store one big quoted string instead of a JSON object
        with open('Shared/'+PDF_file[0:-4] +'_' +filename[7:-4] + '.json', 'w') as f:
            json.dump(a_dict, f, indent = 4)

    #removing the png files as we just need the content from the tables
    for file in os.listdir('Shared'):
        if file.endswith('.png'):
            os.remove('Shared/'+file)

In [None]:
#rotates an image around its centre and enlarges the canvas so that the whole rotated image fits;
#used for pages that are reported as being turned by 90 degrees
def rotate_bound(image, angle):
    """Rotate an image by the given angle without cropping it.

    The angle is negated before it is handed to OpenCV, and the output size is
    recomputed from the rotation matrix so the rotated content fits completely.

    :param image: input image as a numpy array (height and width are read from its shape)
    :param angle: rotation angle in degrees
    :return: rotated image
    :rtype: numpy.ndarray
    """
    height, width = image.shape[:2]

    # centre of the input image, used as the pivot of the rotation
    center_x = width // 2
    center_y = height // 2

    # 2x3 affine matrix for the rotation (no scaling)
    rotation = cv2.getRotationMatrix2D((center_x, center_y), -angle, 1.0)
    abs_cos, abs_sin = np.abs(rotation[0, 0]), np.abs(rotation[0, 1])

    # size of the bounding box that contains the rotated image
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # shift the result so the old centre lands on the centre of the new canvas
    rotation[0, 2] += (new_width / 2) - center_x
    rotation[1, 2] += (new_height / 2) - center_y

    return cv2.warpAffine(image, rotation, (new_width, new_height))

In [None]:
#this is another approach to get the tables from the images. In this approach we count the number of numeric values
#in a page and check whether it crosses a threshold value. Our motivation here is that the images with tables have a
#higher number of numeric values like 12,300 or 12,300.00 or decimal or integer values. This approach also handles
#pages that are reported as rotated by 90 degrees: such images are rotated first, because we have checked that
#tesseract can not extract text correctly from them.

#pattern used to count numeric values; the alternatives are, in order:
#  [\d]+[.,\d]+   numbers with separators such as 12,300 or 12,300.00
#  [\d]*[.][\d]+  floats such as 0.123 or .123
#  [\d]+          integers such as 123
numeric_pattern = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'

#make sure the working/output folder exists before anything is written to it
os.makedirs('num_count', exist_ok=True)

for PDF_file in list_pdfs:
    #converts the pdf into one image per page
    pages = convert_from_path('CanDev_Scanned_Documents-master/'+PDF_file, 500, poppler_path=r'poppler\poppler-0.68.0\bin')

    #storing the pages one at a time as png files (page_1.png, page_2.png, ...)
    for page_number, page in enumerate(pages, start=1):
        page.save("num_count/page_"+str(page_number)+".png", 'PNG')

    for page_number in range(1, len(pages) + 1):
        filename = "num_count/page_"+str(page_number)+".png"

        image=cv2.imread(filename)

        #asking tesseract for the orientation of the page and reading the "Rotate: <n>" entry of its report;
        #if the report has no such entry the page is treated as not rotated instead of failing on None
        osd_report=pytesseract.image_to_osd(image)
        rotate_match=re.search(r'(?<=Rotate: )\d+', osd_report)
        angle=rotate_match.group(0) if rotate_match else '0'
        print('osd angle:',angle)
        if angle=='90':
            #if the angle is 90 degrees then the image is rotated
            skew_corrected_image=rotate_bound(image,float(angle))
        else:
            #NOTE(review): other reported angles (e.g. 180 or 270) are left untouched - confirm this is intended
            skew_corrected_image = image

        #getting the text from the image; it is used both for counting the numeric values and, if the page
        #qualifies, as the table content, so the OCR only has to run once per page
        text = str(pytesseract.image_to_string(skew_corrected_image, config='--psm 6'))

        #using re to get the numeric values from the string and counting the number of such occurrences
        numeric_count = sum(1 for _ in re.finditer(numeric_pattern, text))

        #only if the count is greater than 30 is the page considered to contain a table
        if numeric_count <= 30:
            continue

        text = text.replace('|', '')

        #performing some post_processing steps to get rid of garbage words
        text = post_process(text)

        #splitting the string based on new lines
        lst = text.split('\n')

        #removing the first and last items from the list as they mainly contain the page number
        lst = lst[1:-1]

        #considering the first two lines as title of the table
        #(joining a slice instead of indexing keeps very short pages from raising an IndexError)
        title = "".join(lst[:2])

        #the rest of the lines will be considered for the contents of the table
        lst = lst[2:]

        #creating a dictionary and storing the table rows as key value pairs (row#, content of the row);
        #lines of three characters or less are skipped
        a_dict = {}
        a_dict["Title"] = title
        row_count = 0
        for row_text in lst:
            if len(row_text)>3:
                row_count+=1
                key = "row "+str(row_count)
                a_dict[key] = row_text

        #saving the dictionary as a JSON file; it is serialised exactly once, because dumping an already
        #dumped string would store one big quoted string instead of a JSON object
        with open('num_count/'+PDF_file[0:-4] +'_' +filename[10:-4] + '.json', 'w') as f:
            json.dump(a_dict, f, indent = 4)

    #removing the png files as we just need the content from the tables
    for file in os.listdir('num_count'):
        if file.endswith('.png'):
            os.remove('num_count/'+file)