### Imports

In [1]:
import os
import PyPDF2
import glob
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from pytesseract import Output
from pytesseract import TesseractError
import numpy as np
import imutils
import shutil
import cv2
import re
import time

### Paths

In [2]:
#################################################
##                                             ##
##         Paths to folders containing:        ##
##                                             ##
##               1. Raw PDF Files              ##
##          2. Identified PDF & Images         ##
##         3. Unidentified PDF & Images        ##
##                                             ##
#################################################

# NOTE(review): these are absolute, machine-specific Windows paths; adjust
# them before running the notebook on another machine.
# Every path is written with a trailing separator.

# 1. Folder that is scanned for raw '*.pdf' files.
rawPDF = 'C:\\Users\\MUNEEB\\OCR\\Raw PDFs\\'
# 2. Destination for PDFs whose reference number was found, together with
#    the page image on which it was found.
scannedPDF1 = 'C:\\Users\\MUNEEB\\OCR\\1.1\\PDF and Images\\'
# 3. Destination for PDFs in which no reference number was found.
scannedPDF2 = 'C:\\Users\\MUNEEB\\OCR\\1.1\\Unidentified PDF\\'

### Start Time

In [3]:
# Wall-clock timestamp taken before any processing; it is subtracted from
# `end` further down to report the total run time.
start = time.time()

### Text Matching

In [4]:
#################################################
##                                             ##
##       This function takes text of img       ##
##       and returns reference Number if       ##
##       text matches pattern else False       ##
##                                             ##
#################################################

def TextMatch(text):
    """Search OCR text for a reference number that appears in a known context.

    A reference number is ten digits starting with ``00``. It only counts
    when it is part of one of the context patterns below; the pattern groups
    are tried in order and the first group that matches wins.

    Args:
        text: the text to search (a ``str``).

    Returns:
        The first ten-digit reference number inside the matched fragment,
        or ``False`` when none of the patterns matches.
    """

    # Pattern groups in priority order (raw strings).
    contextPatterns = (
        r'00\d{8}.{1,25}Quie|00\d{8}.{1,25}uier|00\d{8}.{1,30}iero',
        r'00\d{8}.{5,20}este|00\d{8}.{10,25}form|00\d{8}.{10,25}mular',
        r'encl.{1,80}00\d{8}|closed.{1,80}00\d{8}|refer.{5,15}num.{1,10}00\d{8}',
    )

    for contextPattern in contextPatterns:

        # DOTALL lets '.' span line breaks, so the number and its
        # context may sit on different lines of the OCR output.
        hit = re.compile(contextPattern, re.DOTALL).search(text)

        if hit:

            # Pull the bare number out of the matched fragment.
            return re.search(r"00\d{8}", hit.group(0)).group(0)

    return False

### Rotation Angle

In [5]:
#################################################
##                                             ##
##      This function takes img as input,      ##
##      reads it in OSD mode and returns       ##
##      the angle by which img should be       ##
##                   rotated.                  ##
##                                             ##
#################################################
    
def RotationAngle(rawIMG):
    """Ask Tesseract's OSD mode by how much the image should be rotated.

    Args:
        rawIMG: image handed unchanged to ``pytesseract.image_to_osd``.

    Returns:
        The ``'rotate'`` entry of the OSD result dictionary, or ``-1`` when
        Tesseract raises ``TesseractError``. Callers treat ``-1`` as
        "orientation unknown".
    """

    try:

        results = pytesseract.image_to_osd(rawIMG, output_type=Output.DICT)

    except TesseractError:

        # Sentinel instead of an exception so one undetectable page does
        # not abort the whole batch.
        return -1

    return results['rotate']

### Image Rotation

In [6]:
#################################################
##                                             ##
##   This function takes image and rotation    ##
##     angle as input and returns rotated      ##
##               image as output.              ##
##                                             ##
#################################################

def RotateImage(rawIMG, rotation):
    """Return ``rawIMG`` rotated by ``rotation``.

    Thin wrapper that delegates to ``imutils.rotate_bound``.

    Args:
        rawIMG: the image to rotate.
        rotation: the angle forwarded as ``angle`` to ``rotate_bound``.

    Returns:
        The rotated image produced by ``imutils.rotate_bound``.
    """

    return imutils.rotate_bound(rawIMG, angle=rotation)

### Extracting Reference Number from Image

In [7]:
def Reference(img, angle):
    """OCR an image at the given orientation and look for a reference number.

    Args:
        img: the image to read.
        angle: rotation applied before OCR; a falsy value (``0``) means the
            image is read as-is.

    Returns:
        The reference number found by ``TextMatch``, or ``False`` when the
        extracted text contains none.
    """

    # Only rotate when a non-zero angle was requested.
    candidate = RotateImage(img, angle) if angle else img

    # extract text from the (possibly rotated) image
    extracted = pytesseract.image_to_string(candidate)

    found = TextMatch(extracted)

    return found if found else False

### Image Selection

In [8]:
#################################################
##                                             ##
##   This function takes the page images of    ##
##   one PDF, saves the first image in which   ##
##   a ref number is found and returns True    ##
##   with that number. If no image matches,    ##
##              it returns False.              ##
##                                             ##
#################################################

def ImageSelection(path, imgList):
    """Find the first page image that carries a reference number and save it.

    Every image is OCR'ed at 0, 90, 180 and 270 degrees. On the first hit
    the image is saved as ``<reference>.png`` inside ``path`` and the
    search stops.

    Args:
        path: folder in which the matching page image is written.
        imgList: page images of one PDF, in page order. May be empty.

    Returns:
        ``(True, reference)`` when a reference number was found, otherwise
        ``(False, False)``.
    """

    # List of angles on which image will be OCR'ed

    angles = [0, 90, 180, 270]

    for img in imgList:

        for angle in angles:

            reference = Reference(img, angle)

            if not reference:

                continue

            # Gets the rotation angle of image

            rotation = RotationAngle(img)

            if rotation != -1:

                # Gets the rotated image with correct orientation

                newImage = RotateImage(img, rotation)

            else:

                # OSD could not determine the orientation. Fall back to the
                # orientation at which OCR actually read the reference, so
                # there is always an image to save.

                newImage = RotateImage(img, angle) if angle else img

            # os.path.join works whether or not `path` ends with a separator.
            # NOTE(review): cv2.imwrite's return value is not checked here.

            cv2.imwrite(os.path.join(path, reference + '.png'), newImage)

            return True, reference

    # Nothing matched, or imgList was empty.

    return False, False

### PDF to image conversion

In [9]:
def PDFtoImg(path1, path2, path3):
    """OCR every PDF in a folder and sort the files by whether a reference was found.

    Each ``*.pdf`` in ``path1`` is converted to one grayscale image per page
    and handed to ``ImageSelection``. A PDF with a reference number is copied
    to ``path2`` as ``<reference>.pdf``; any other PDF is copied to ``path3``
    under its original name. The source files are left in place.

    The process working directory is not changed.

    Args:
        path1: folder containing the raw PDF files.
        path2: destination for identified PDFs (and their page image).
        path3: destination for unidentified PDFs.

    Returns:
        ``(total, identified)``: the number of PDFs processed and the number
        in which a reference number was found.
    """

    # Total PDF count

    PDFcount1 = 0

    # Identified PDF count

    PDFcount2 = 0

    # Search path1 directly instead of chdir'ing into it. glob.escape keeps
    # characters such as '[' in the folder name literal; sorting makes the
    # processing order deterministic.

    pdfPaths = sorted(glob.glob(os.path.join(glob.escape(path1), '*.pdf')))

    for oldPath in pdfPaths:

        PDFcount1 += 1

        file = os.path.basename(oldPath)

        # Gets a list of PIL images of each pdf page

        images = convert_from_path(oldPath)

        # Converts each PIL Image to np.array first,
        # then from RGB to a gray scale cv2 array.

        imgList = [
            cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
            for img in images
        ]

        found, reference = ImageSelection(path2, imgList)

        if found:

            PDFcount2 += 1

            # Identified PDFs are renamed after their reference number.

            newPath = os.path.join(path2, reference + '.pdf')

        else:

            newPath = os.path.join(path3, file)

        shutil.copy(oldPath, newPath)

    return PDFcount1, PDFcount2

### Processing

In [10]:
# Run the whole pipeline: OCR every PDF in rawPDF, copy the identified ones
# to scannedPDF1 and the rest to scannedPDF2, and keep both counts.
total, identified = PDFtoImg(rawPDF, scannedPDF1, scannedPDF2)

### End Time

In [11]:
# Wall-clock timestamp taken once processing has finished.
end = time.time()

### Execution Time

In [12]:
# Elapsed seconds between the two timestamps, formatted to two decimals
# for the report below.
execution = "{:.2f}".format(end - start)

### Result: 

In [13]:
print(f"Execution time: {execution}s\n")

# Prints total and identified PDFs

print(f"Total PDF: {total}")
print(f"Identified PDF: {identified}")

# Percentage of PDFs in which a reference number was found. Guarded so an
# empty input folder reports 0.0 instead of raising ZeroDivisionError.

accuracy = identified / total * 100 if total else 0.0

print(f"\nAccuracy: {accuracy}")

Execution time: 330.67s

Total PDF: 40
Identified PDF: 31

Accuracy: 77.5
