### PDF to plaintext

Task: Demonstrate how to extract plain-text content from PDF files. This is useful for processing Climate Watch NDC PDFs (e.g. those that can be [downloaded from CAIT](http://cait.wri.org/indc/)). Examples of these PDFs are contained in this repo in the *NDF_pdfs* subfolder.

We will use [pdfminer](http://euske.github.io/pdfminer/index.html), a pure-Python library that runs on Python 2.7 only.

Note: I also investigated [PyPDF2](https://pythonhosted.org/PyPDF2/PdfFileReader.html), but found that it produced poorer results.

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pprint import pprint
from cStringIO import StringIO
from tqdm import tqdm

In [2]:
def clean_block(text):
    """Strip carriage returns, tabs and newlines from a block of text.

    Args:
        text: The string to clean.

    Returns:
        The input with every carriage-return, tab and newline character
        removed. All other characters, including spaces, are left as-is.
    """
    # str.replace is a no-op when the character is absent, so no membership
    # checks are needed before each removal.
    for control_char in ('\r', '\t', '\n'):
        text = text.replace(control_char, '')
    return text

def convert_pdf_to_txt(path, clean=False):
    """Extract the text of a PDF file with pdfminer and post-process it.

    Args:
        path: Path of the PDF file to read.
        clean: When true, the extracted text is passed through
            ``clean_block`` before the remaining post-processing steps.

    Returns:
        The extracted text, prefixed with an HTML meta tag declaring UTF-8,
        with a few typographic byte sequences replaced by ASCII equivalents,
        form feeds removed, and consecutively numbered page-number lines
        (starting at 1) stripped.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the file even if pdfminer raises
            # while parsing a page.
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0 are kept from the original;
                # presumably they select every page -- confirm against the
                # pdfminer documentation.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as before.
        text = retstr.getvalue()
    finally:
        retstr.close()

    if clean:
        text = clean_block(text)

    # Prepend a UTF-8 meta tag so the output renders correctly as HTML.
    utf_tag = "<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"
    text = ''.join([utf_tag, text])

    # Replace the typographic apostrophe with a plain one.
    text = text.replace("’", "'")
    # \xef\x82\xb7 and \xef\x82\x9f are the UTF-8 encodings of the private-use
    # code points U+F0B7 and U+F09F (presumably bullet glyphs in the source
    # fonts); replace them with an asterisk.
    text = text.replace("\xef\x82\xb7", "*")
    text = text.replace("\xef\x82\x9f", "*")
    # \xe2\x80\x93 is an en dash; replace it with a hyphen.
    text = text.replace("\xe2\x80\x93", "-")
    # \xe2\x80\x9c and \xe2\x80\x9d are curly double quotes; replace them
    # with a plain double quote.
    text = text.replace("\xe2\x80\x9c", "\"")
    text = text.replace("\xe2\x80\x9d", "\"")
    # Remove form feeds (\x0c).
    text = text.replace("\x0c", "")

    # Remove page numbers: a bare number on its own line after a blank line.
    # Stops at the first number not found. Be careful that this does not
    # affect tables containing the same pattern.
    pg = 1
    page_number = "\n\n{0} \n".format(pg)
    while page_number in text:
        text = text.replace(page_number, "")
        pg += 1
        page_number = "\n\n{0} \n".format(pg)
    return text

In [7]:
# Convert one example document; the result stays in `test` so that later
# cells can inspect it or write it to disk.
fname = "./pdfs/NDC/iNDC Per\xc2\xa3 castellano.pdf"
test = convert_pdf_to_txt(path=fname, clean=False)

In [9]:
#test

Save the plain text to a file as follows.

In [None]:
# Write the text held in `test` to a plain-text file, overwriting any
# existing file at that path.
with open("./pdfs/test.txt", mode="w") as text_file:
    text_file.write(test)

### Looping over files

To process multiple files, identify the target files and process them all using the code below.
Afterwards, you can add Markdown styling to the output and convert it to HTML using pandoc, e.g. `pandoc input.md -o output.html`.

In [3]:
import os

In [33]:
# Directory containing the PDFs to convert.
pdf_directory = './pdfs/INDC/'

# Collect the names (not full paths) of the entries ending in '.pdf'.
files = []
for filename in os.listdir(pdf_directory):
    if filename.endswith('.pdf'):
        files.append(filename)

In [34]:
# Converted files go into a 'text_files' subfolder of the PDF directory.
# os.path.join gives the right path whether or not `pdf_directory` ends
# with a slash.
output_directory = os.path.join(pdf_directory, 'text_files')
if not os.path.exists(output_directory):
    os.mkdir(output_directory)

for fname in tqdm(files):
    target = os.path.join(pdf_directory, fname)
    text = convert_pdf_to_txt(path=target, clean=False)
    # Swap only the trailing extension; str.replace would also rewrite any
    # other '.pdf' occurring inside the file name.
    output_file = os.path.splitext(fname)[0] + '.md'
    with open(os.path.join(output_directory, output_file), "w") as text_file:
        text_file.write(text)

100%|██████████| 206/206 [10:00<00:00,  2.12s/it]
