In [1]:
import os
import csv
from pathlib import Path
import json

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
import io

# Two bare string literals: they are expression statements only, nothing is assigned,
# so no name is bound here for later cells to use.
# NOTE(review): these look like a CSV list path and the PDF folder -- consider binding
# them to named constants. The calls further down use test_envs.csv, not this CSV; confirm
# which list is intended.
'/home/rian/ML_Scripts/PDF_OCR/digital_pdf_list.csv'
"/mnt/hgfs/Shared_VM/Envelopes/"

In [2]:
def create_file_list(path_to_list: str, path_to_docs: str)->list:
    """Create a list of file paths from a csv input list of files

    Only the first column of each csv row is used; it is taken as a file name
    relative to `path_to_docs`. Blank lines and rows whose first cell is empty
    are skipped, and surrounding whitespace around the file name is removed.

    INPUT:
    path_to_list: str: full path to csv file.
    path_to_docs: str: path to folder containing .pdf files for text extraction
        (with or without a trailing path separator)

    RETURNS: list: list of Path objects to documents
    """
    # Join with the Path "/" operator rather than string concatenation so a folder
    # given without a trailing slash still yields "<folder>/<file>".
    docs_dir = Path(path_to_docs)

    files = []
    with open(path_to_list, newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            if not row:
                continue
            name = row[0].strip()
            if name:
                files.append(name)

    paths = [docs_dir / pdf for pdf in files]
    print(f"list contains {len(files)} for text extraction")
    return paths

# Smoke check: build the path list from the test CSV, then echo it as the cell's
# last expression.
test_path = create_file_list('/home/rian/ML_Scripts/PDF_OCR/test_envs.csv', "/mnt/hgfs/Shared_VM/Envelopes/")
test_path

In [3]:
def extract_text(fname):
    """Extract text as strings, page-by-page from a digital pdf document

    NOTE(review): a second `extract_text` is defined later in this same cell and
    rebinds the name, so this page-by-page version is shadowed once the whole
    cell has run.

    INPUT:
    fname: str: path to pdf document

    RETURNS: dict: python dictionary mapping page number (starting at 1) to the
        text extracted from that page only
    """
    output = io.StringIO()
    manager = PDFResourceManager()
    codec = 'utf-8'
    converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    page_text = {}
    try:
        with open(fname, 'rb') as infile:
            for i, page in enumerate(PDFPage.get_pages(infile), start=1):
                interpreter.process_page(page)
                page_text[i] = output.getvalue()
                # Empty the shared buffer before the next page. seek(0) alone only
                # rewinds the cursor: a shorter following page would overwrite the
                # start of the buffer and keep the stale tail of the previous page.
                output.seek(0)
                output.truncate(0)
    finally:
        # Release the converter and buffer even when a page fails to process.
        converter.close()
        output.close()

    return page_text

# Try the page-by-page extractor defined above on one document, then look up the
# entry stored under key 5 of the returned dict.
test_text = extract_text('/mnt/hgfs/Shared_VM/Envelopes/13270.pdf')
test_text[5]

def extract_text(fname, pages=None):
    """Extract the text of a digital pdf document as one string

    NOTE(review): this redefines the `extract_text` declared earlier in the same
    cell (which returned a per-page dict); after the cell runs, this is the
    version every later call uses.

    INPUT:
    fname: str: path to pdf document
    pages: iterable of int, optional: page numbers to restrict extraction to.
        They are de-duplicated into a set and forwarded to PDFPage.get_pages.
        None or an empty iterable forwards an empty set.
        NOTE(review): pdfminer appears to treat these as zero-based page indices
        and an empty set as "all pages" -- confirm against the pdfminer docs.

    RETURNS: str: the text of all processed pages, concatenated in page order
    """
    pagenums = set(pages) if pages else set()

    output = io.StringIO()
    manager = PDFResourceManager()
    codec = 'utf-8'
    converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        try:
            # The context manager closes the pdf even if a page fails to process.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the outer finally closes it.
        return output.getvalue()
    finally:
        output.close()

# Re-run the same document through the redefined extract_text above, which returns
# the whole document as one string, and echo it.
test_text = extract_text('/mnt/hgfs/Shared_VM/Envelopes/13270.pdf')
test_text

# NOTE(review): test_text is a str at this point, so [5] selects a single character
# rather than page 5 as in the earlier dict-based check -- confirm this is still wanted.
test_text[5]

In [4]:
def process_multiple_pdfs(path_to_list: str, path_to_docs: str):
    """Extract text from every digital pdf named in a csv list.

    Each document is passed to `extract_text`; a document that raises is reported
    with a printed message and left out of the result, and processing carries on
    with the next one. No file is written by this function.

    INPUT:
    path_to_list: str: full path to csv file.
    path_to_docs: str: path to folder containing .pdf files for text extraction

    RETURNS: dict: {report_no: text}, where report_no is the pdf file stem and
        text is whatever `extract_text` returned for that document
    """
    extracted = {}

    for doc_path in create_file_list(path_to_list, path_to_docs):
        try:
            extracted[doc_path.stem] = extract_text(doc_path)
            print(f"text extracted from {doc_path.stem}")
        except Exception as err:
            # Best effort: one unreadable pdf must not abort the whole batch.
            print(f"Error {err} in file {doc_path.stem}")

    # JSON export of the result is currently disabled; callers get the dict only.
    return extracted

# Script-style entry point: runs the batch extraction on the test list when
# __name__ is "__main__". The returned dict is not kept here.
if __name__=="__main__":
    process_multiple_pdfs('/home/rian/ML_Scripts/PDF_OCR/test_envs.csv', "/mnt/hgfs/Shared_VM/Envelopes/")

In [5]:
# Run the batch extraction on the test list and keep the returned dict for the
# inspection cells that follow.
texts = process_multiple_pdfs('/home/rian/ML_Scripts/PDF_OCR/test_envs.csv', "/mnt/hgfs/Shared_VM/Envelopes/")

list contains 6 for text extraction
text extracted from 06557
text extracted from 06604
text extracted from 11141
text extracted from 11148
text extracted from 13267
text extracted from 13270


In [6]:
# Keys are the file stems of the pdfs that were extracted without raising.
texts.keys()

dict_keys(['06557', '06604', '11141', '11148', '13267', '13270'])

In [7]:
# Show the extracted text stored under the key '13270'.
texts['13270']

rms  part  of  the  larger  Nawa  Domain,  is  located  in  the  central \nnorthern Gawler, and is bound by the Karari Fault Zone to the south. The geology and temporal evolution \nof the region is poorly known due to a lack of outcrop. Sparse drilling from within FMG’s EL 6219 shows \nevidence  of  low  temperature  hematite-carbonate-amphibole-sulphide  alteration  and  brecciation, \ntherefore the style of mineralisation  targeted  within the Mabel  Creek Ridge is similar  to the hematite \ndominant IOCG systems on the Stuart Shelf (e.g. Olympic Dam, Carrapateena & Prominent Hill).  \n \nRegional  drill  hole  data  within  the  MCR  suggests  the  area  is  largely  comprised  of  ca.  1750-1730  Ma \nmetasedimentary rocks affected by ca. 1720-1690 Ma and ca. 1590-1550 Ma metamorphism (Payne et \nal.,  2006,  2008  &  Cutts  et  al.,  2011).  In  general,  basement  lithologies  intersected  in  drilling  include \ngranulite facies metabasites,  pelites,  psammites and iron rich me