## Process scanned PDFs with tesseract OCR.

### Prerequisites 
* Anaconda: run `conda env create` from the project directory to install the required packages.
* ImageMagick: https://imagemagick.org/script/download.php
* Possibly only runs on macOS.

In [None]:
# Check the Python version to verify that the environment is correct.
# from platform import python_version
# print(python_version())

import subprocess
import os
import sys
import glob
from io import BytesIO
from PIL import Image, ImageSequence
import pytesseract

# All paths are anchored at one base directory. Note that "__file__" is a string
# literal here, not the module attribute, so abspath() resolves it against the
# current working directory and the base is the directory the code is run from.
dir = os.path.split(os.path.abspath("__file__"))[0]

# Directory that receives the OCR txt output.
saveDir = os.path.join(dir, "Redovisningar/Scan/raw")

# Every PDF found directly inside the scan directory.
_pdfPattern = os.path.join(dir, "Redovisningar/Scan/*.pdf")
pdfPathList = glob.glob(_pdfPattern)

# Function that takes a filePath to a pdf and OCRs text to a txt file at saveDir.
def ocrPdf(filePath, saveDir):
    """OCR every page of the PDF at filePath into a txt file in saveDir.

    The PDF is converted to a TIF by the ImageMagick "convert" CLI, and each
    page of that TIF is passed to pytesseract. The output file is named after
    the PDF with its extension replaced by ".txt" and is written as UTF-8.

    Args:
        filePath: Path to the PDF to process.
        saveDir: Directory that receives the txt file; created if missing.

    Raises:
        AssertionError: If "convert" exits with a non-zero status. The
            captured stderr is passed as the exception argument.
    """
    # Use subprocess to access ImageMagick CLI and convert PDF to TIF.
    # "tif:-" sends the result to stdout, so no temporary file is needed.
    cmd = ["convert", "-density", "300", filePath, "-type", "TrueColor", "tif:-"]
    fconvert = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = fconvert.communicate()
    # Raise explicitly rather than with "assert" so the check is not stripped
    # when Python runs with -O. The exception type is kept for compatibility.
    if fconvert.returncode != 0:
        raise AssertionError(stderr)

    # Make sure the output directory exists before opening a file in it.
    os.makedirs(saveDir, exist_ok=True)

    # splitext() replaces the extension whatever its length is.
    txtName = os.path.splitext(os.path.basename(filePath))[0] + ".txt"
    txtPath = os.path.join(saveDir, txtName)

    # Wrap the TIF bytes in BytesIO so pillow can read them, then OCR page by
    # page. Both context managers are closed even if OCR fails part-way.
    with Image.open(BytesIO(stdout)) as image, \
            open(txtPath, "w", encoding="utf-8") as txtFile:
        for page in ImageSequence.Iterator(image):
            txtFile.write(pytesseract.image_to_string(page))


# Main stuff, go through list of PDF and call ocrPdf.
# enumerate() numbers the files from 1 for the progress message, and the
# total is computed once instead of on every iteration.
pdfCount = len(pdfPathList)
for fileNumber, pdfPath in enumerate(pdfPathList, start=1):
    print("Working on file {} of {}".format(fileNumber, pdfCount))
    ocrPdf(pdfPath, saveDir)

print("Done!")

Working on file 1 of 2
Working on file 2 of 2
