# Extracting Metadata From PDFs

In [1]:
from PyPDF2 import PdfFileReader
def get_info(path):
    """Print and return the document metadata of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF file to inspect.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject``, ``title`` and ``number_of_pages``. A text field is
        ``None`` when the document does not provide it.
    """
    fields = ('author', 'creator', 'producer', 'subject', 'title')
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        # Read the fields while the file is still open: PyPDF2 may resolve
        # values lazily from the underlying stream (NOTE(review): confirm).
        # getDocumentInfo() can return None for a PDF without an info
        # dictionary, so use getattr with a default instead of plain
        # attribute access.
        metadata = {name: getattr(info, name, None) for name in fields}
        metadata['number_of_pages'] = pdf.getNumPages()
    print(info)
    return metadata


if __name__ == '__main__':
    path = 'sample.pdf'
    get_info(path)

{'/Creator': 'Rave (http://www.nevrona.com/rave)', '/Producer': 'Nevrona Designs', '/CreationDate': 'D:20060301072826'}


# Extracting Text From PDFs

In [2]:
from PyPDF2 import PdfFileReader
def text_extractor(path, page_number=1):
    """Print one page's raw object, its type and its extracted text.

    Args:
        path: Filesystem path of the PDF file to read.
        page_number: Zero-based index of the page to extract. The default
            of 1 selects the SECOND page of the document.

    Returns:
        The text extracted from the selected page.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        # getPage() is zero-indexed: 0 is the first page, 1 the second.
        page = pdf.getPage(page_number)
        print(page)
        print('Page type: {}'.format(str(type(page))))
        text = page.extractText()
        print(text)
    return text


if __name__ == '__main__':
    path = 'sample.pdf'
    text_extractor(path)

{'/Type': '/Page', '/Parent': IndirectObject(3, 0), '/Resources': {'/Font': {'/F1': IndirectObject(9, 0)}, '/ProcSet': IndirectObject(8, 0)}, '/MediaBox': [0, 0, 612, 792], '/Contents': IndirectObject(7, 0)}
Page type: <class 'PyPDF2.pdf.PageObject'>
 Simple PDF File 2  ...continued from page 1. Yet more text. And more text. And more text.  And more text. And more text. And more text. And more text. And more  text. Oh, how boring typing this stuff. But not as boring as watching  paint dry. And more text. And more text. And more text. And more text.  Boring.  More, a little more text. The end, and just as well. 


# Splitting PDFs

In [3]:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path):
    """Write each page of the PDF at ``path`` to its own one-page PDF.

    Output files are created in the current working directory and named
    ``<input base name>_page_<n>.pdf``, where ``n`` starts at 1. A
    ``Created: ...`` line is printed for every file written.

    Args:
        path: Filesystem path of the PDF file to split.
    """
    base_name, _extension = os.path.splitext(os.path.basename(path))
    reader = PdfFileReader(path)
    page_count = reader.getNumPages()
    # Iterate over 1-based page numbers (used in the file name) and convert
    # back to the 0-based index expected by getPage().
    for number in range(1, page_count + 1):
        single_page_writer = PdfFileWriter()
        single_page_writer.addPage(reader.getPage(number - 1))
        target = '{}_page_{}.pdf'.format(base_name, number)
        with open(target, 'wb') as out_file:
            single_page_writer.write(out_file)
        print('Created: {}'.format(target))


if __name__ == '__main__':
    path = 'sample.pdf'
    pdf_splitter(path)

Created: sample_page_1.pdf
Created: sample_page_2.pdf


# Merging Multiple PDFs Together

In [4]:
import glob
from PyPDF2 import PdfFileWriter, PdfFileReader
def merger(output_path, input_paths):
    """Concatenate the pages of several PDFs into one output file.

    Pages are copied in the order the inputs are given, and in page order
    within each input.

    Args:
        output_path: Path of the merged PDF to write.
        input_paths: Iterable of paths of the PDFs to merge.
    """
    combined = PdfFileWriter()
    for source in input_paths:
        reader = PdfFileReader(source)
        page_indices = range(reader.getNumPages())
        for index in page_indices:
            source_page = reader.getPage(index)
            combined.addPage(source_page)
    with open(output_path, 'wb') as out_file:
        combined.write(out_file)


if __name__ == '__main__':
    # Sorted so the split pages are merged back in name order.
    paths = sorted(glob.glob('sample_*.pdf'))
    merger('pdf_merger.pdf', paths)

# Merging PDFs Using the PdfFileMerger Class

In [5]:
import glob
from PyPDF2 import PdfFileMerger
def merger(output_path, input_paths):
    """Merge several PDFs into one file using ``PdfFileMerger``.

    Args:
        output_path: Path of the merged PDF to write.
        input_paths: Iterable of paths of the PDFs to append, in order.
    """
    pdf_merger = PdfFileMerger()
    try:
        for path in input_paths:
            pdf_merger.append(path)
        with open(output_path, 'wb') as fileobj:
            pdf_merger.write(fileobj)
    finally:
        # The merger opens each appended path itself; close() releases
        # those input files even if appending or writing fails.
        pdf_merger.close()


if __name__ == '__main__':
    paths = glob.glob('sample_*.pdf')
    paths.sort()
    merger('pdf_merger2.pdf', paths)

# Rotating Pages

In [7]:
from PyPDF2 import PdfFileWriter, PdfFileReader
def rotator(path, output_path='pdf_rotator.pdf'):
    """Write a three-page PDF built from rotated pages of ``path``.

    The output contains page 1 rotated 90 degrees clockwise, page 2 rotated
    90 degrees counter-clockwise, and page 2 once more.

    Args:
        path: Filesystem path of the source PDF; it must have at least two
            pages.
        output_path: Path of the PDF to write. Defaults to
            ``'pdf_rotator.pdf'`` in the current working directory.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(path)
    # Page indices are zero-based: 0 is the first page, 1 the second.
    page1 = pdf_reader.getPage(0).rotateClockwise(90)
    pdf_writer.addPage(page1)
    page2 = pdf_reader.getPage(1).rotateCounterClockwise(90)
    pdf_writer.addPage(page2)
    # NOTE(review): getPage(1) appears to return the same page object that
    # was rotated just above, so this third page is presumably written
    # rotated as well. If an unrotated copy was intended, read it from a
    # separate PdfFileReader -- confirm the intent before changing.
    pdf_writer.addPage(pdf_reader.getPage(1))
    with open(output_path, 'wb') as fh:
        pdf_writer.write(fh)


if __name__ == '__main__':
    rotator('sample.pdf')

# Overlaying/Watermarking Pages

In [9]:
from PyPDF2 import PdfFileWriter, PdfFileReader
def watermark(input_pdf, output_pdf, watermark_pdf):
    """Overlay the first page of one PDF onto every page of another.

    Args:
        input_pdf: Path of the PDF whose pages receive the overlay.
        output_pdf: Path of the watermarked PDF to write.
        watermark_pdf: Path of the PDF whose first page is the overlay.
    """
    stamp_reader = PdfFileReader(watermark_pdf)
    stamp_page = stamp_reader.getPage(0)

    source_reader = PdfFileReader(input_pdf)
    result_writer = PdfFileWriter()

    page_total = source_reader.getNumPages()
    for index in range(page_total):
        current_page = source_reader.getPage(index)
        # mergePage() modifies current_page in place.
        current_page.mergePage(stamp_page)
        result_writer.addPage(current_page)

    with open(output_pdf, 'wb') as out_file:
        result_writer.write(out_file)


if __name__ == '__main__':
    watermark(
        input_pdf='sample.pdf',
        output_pdf='watermarked_sample.pdf',
        watermark_pdf='watermark.pdf',
    )

# PDF Encryption

In [11]:
from PyPDF2 import PdfFileWriter, PdfFileReader
def encrypt(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Path of the PDF to copy.
        output_pdf: Path of the encrypted PDF to write.
        password: User password set on the output; no separate owner
            password is supplied.
    """
    source_reader = PdfFileReader(input_pdf)
    protected_writer = PdfFileWriter()

    # Copy every page across before applying encryption to the writer.
    page_total = source_reader.getNumPages()
    for index in range(page_total):
        protected_writer.addPage(source_reader.getPage(index))

    protected_writer.encrypt(
        user_pwd=password,
        owner_pwd=None,
        use_128bit=True,
    )

    with open(output_pdf, 'wb') as out_file:
        protected_writer.write(out_file)


if __name__ == '__main__':
    encrypt(
        input_pdf='sample.pdf',
        output_pdf='encrypted.pdf',
        password='pdfpass',
    )