### PyPDF Tutorial

#### Installing
+ pip install pypdf2

#### Task
+ How to Extract text from PDF file using PyPDF2
+ How to merge and split PDF files
+ How to get the last page of PDF
+ How to Rotate PDF
+ How to extract images from pdf
+ How to extract tables from pdf
+ How to convert image to pdf
+ How to create pdf from text file
+ How to Encrypt a PDF file using PyPDF2
+ How to add a watermark to a PDF file

#### List of Useful PDF Pkgs
+ PyPDF2
+ fpdf
+ pdfminer
+ pdfplumber
+ img2pdf
+ tabula-py
+ camelot-py
+ pdflatex
+ reportlab
+ etc

In [1]:
# Load the packages
import PyPDF2 as pdf

In [2]:
# Check version
pdf.__version__

'3.0.1'

In [3]:
# Methods
dir(pdf)

['DocumentInformation',
 'PageObject',
 'PageRange',
 'PaperSize',
 'PasswordType',
 'PdfFileMerger',
 'PdfFileReader',
 'PdfFileWriter',
 'PdfMerger',
 'PdfReader',
 'PdfWriter',
 'Transformation',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_cmap',
 '_codecs',
 '_encryption',
 '_merger',
 '_page',
 '_protocols',
 '_reader',
 '_security',
 '_utils',
 '_version',
 '_writer',
 'constants',
 'errors',
 'filters',
 'generic',
 'pagerange',
 'papersizes',
 'parse_filename_page_ranges',
 'types',
 'xmp']

#### How to Read a PDF File in Python
With PyPDF2 you can open and read a pdf file.
#### NB: The `Pdf...File...` classes have been deprecated
+ e.g. use `PdfReader` instead of `PdfFileReader`

In [4]:
from PyPDF2 import PdfReader, PdfWriter

In [5]:
# Get Info of Open PDF File
file = open("Nativity_Example.pdf","rb")
reader = PdfReader(file)

In [6]:
reader.getDocumentInfo()

DeprecationError: getDocumentInfo is deprecated and was removed in PyPDF2 3.0.0. Use metadata instead.

In [7]:
# Get the Title of the PDF Document
# prev reader.getDocumentInfo
info = reader.metadata

In [8]:
info

{'/Title': 'Nativity_Example',
 '/Producer': 'Skia/PDF m110 Google Docs Renderer'}

In [9]:
# Get the title
info.title

'Nativity_Example'

In [10]:
# Author
info.author

In [11]:
# How to get number of pages
# reader.getNumPages() ## prev 
len(reader.pages)

3

In [12]:
# How to Extract the text
# prev reader.pages[0].extractText()
reader.pages[0].extract_text()

'The nativity of Jesus, nativity of Christ, birth of Jesus or birth of Christ is described in the biblical\ngospels of Luke and Matthew . The two accounts agree that Jesus was born in Bethlehem in\nJudaea, his mother Mary was engaged to a man named Joseph, who was descended from King\nDavid and was not his biological father , and that his birth was caused by divine intervention.\nMany modern scholars consider the birth narratives unhistorical because they are laced with\ntheology and present two dif ferent accounts which cannot be harmonised into a single coherent\nnarrative. But many others view the discussion of historicity as secondary , given that gospels\nwere primarily written as theological documents rather than chronological timelines.\nThe nativity is the basis for the Christian holiday of Christmas, and plays a major role in the\nChristian liturgical year . Many Christians traditionally display small manger scenes depicting the\nnativity in their homes, or attend nativity pla

In [13]:
### A Function To get Meta Data
def get_pdf_metadata(pdf_path):
    """Return the metadata of the PDF stored at ``pdf_path``.

    The file is opened in binary mode, wrapped in a ``PdfReader`` and closed
    again before the metadata object is handed back to the caller.
    """
    with open(pdf_path, "rb") as pdf_file:
        metadata = PdfReader(pdf_file).metadata
    return metadata

In [14]:
get_pdf_metadata("Nativity_Example.pdf")

{'/Title': 'Nativity_Example',
 '/Producer': 'Skia/PDF m110 Google Docs Renderer'}

In [15]:
### Function to Extract Text From PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` as one string.

    Each page is passed through ``extract_text()`` in document order and the
    per-page results are joined with a single space.
    """
    with open(pdf_path, "rb") as pdf_file:
        document = PdfReader(pdf_file)
        # Iterating ``pages`` replaces the old getNumPages() index loop.
        page_texts = [page.extract_text() for page in document.pages]
        return ' '.join(page_texts)
        
            

In [16]:
extract_text_from_pdf("Nativity_Example.pdf")

'The nativity of Jesus, nativity of Christ, birth of Jesus or birth of Christ is described in the biblical\ngospels of Luke and Matthew . The two accounts agree that Jesus was born in Bethlehem in\nJudaea, his mother Mary was engaged to a man named Joseph, who was descended from King\nDavid and was not his biological father , and that his birth was caused by divine intervention.\nMany modern scholars consider the birth narratives unhistorical because they are laced with\ntheology and present two dif ferent accounts which cannot be harmonised into a single coherent\nnarrative. But many others view the discussion of historicity as secondary , given that gospels\nwere primarily written as theological documents rather than chronological timelines.\nThe nativity is the basis for the Christian holiday of Christmas, and plays a major role in the\nChristian liturgical year . Many Christians traditionally display small manger scenes depicting the\nnativity in their homes, or attend nativity pla

#### How to Split PDFs
+ Split into multiple pdfs
+ Split off the last page
+ Get PDF up to a Particular Page

In [17]:
# Function to Split PDF into Multiple PDF Pages
import os
def split_pdf(pdf_path):
    """Write every page of ``pdf_path`` to its own single-page PDF.

    The output files are placed next to the input and named
    ``<input-without-extension>_page_<n>.pdf`` where ``n`` starts at 1.
    A confirmation line is printed for each file created.
    """
    with open(pdf_path, "rb") as source:
        document = PdfReader(source)
        stem = os.path.splitext(pdf_path)[0]
        for number, page in enumerate(document.pages, start=1):
            # A fresh writer per page keeps each output to exactly one page.
            single_page = PdfWriter()
            single_page.add_page(page)
            output_filename = f"{stem}_page_{number}.pdf"
            with open(output_filename, "wb") as target:
                single_page.write(target)

            print("created a pdf:{}".format(output_filename))
            

In [18]:
split_pdf("Nativity_Example.pdf")

created a pdf:Nativity_Example_page_1.pdf
created a pdf:Nativity_Example_page_2.pdf
created a pdf:Nativity_Example_page_3.pdf


In [19]:
# Split PDF Up to a Given Page
def get_pdf_upto(pdf_path, start_page: int = 0, stop_page: int = 0):
    """Copy pages ``start_page`` .. ``stop_page - 1`` of a PDF into a new file.

    Page numbers are zero-based indexes into ``reader.pages`` and
    ``stop_page`` is exclusive, exactly like ``range(start_page, stop_page)``.
    The result is written next to the input as
    ``<input-without-extension>_from_<start_page>_to_<stop_page>.pdf``.

    Args:
        pdf_path: Path of the source PDF.
        start_page: Index of the first page to copy.
        stop_page: Index one past the last page to copy.

    Raises:
        ValueError: If the requested range is empty
            (``start_page >= stop_page``), which includes the default
            arguments. Previously this surfaced as an ``UnboundLocalError``
            because the output name was only assigned inside the loop.
    """
    if start_page >= stop_page:
        raise ValueError(
            f"empty page range: start_page={start_page}, stop_page={stop_page}; "
            "stop_page must be greater than start_page"
        )

    # Build the output name once, independent of how many pages are copied.
    filename = os.path.splitext(pdf_path)[0]
    output_filename = f"{filename}_from_{start_page}_to_{stop_page}.pdf"

    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        writer = PdfWriter()
        for page_num in range(start_page, stop_page):
            writer.add_page(reader.pages[page_num])  # prev ::  addPage()
        # Write while the source is still open; the writer's pages come
        # from the open reader.
        with open(output_filename, "wb") as out:
            writer.write(out)
            

In [20]:
get_pdf_upto("Nativity_Example.pdf",0,2)

In [21]:
get_pdf_upto("Nativity_Example.pdf",1,2)

In [22]:
### How to get the last page of PDF
def get_last_pdf_page(pdf_path):
    """Save the final page of ``pdf_path`` as a separate one-page PDF.

    The output is written next to the input as
    ``<input-without-extension>_last_page.pdf`` and a confirmation message
    is printed afterwards.
    """
    with open(pdf_path, "rb") as source:
        document = PdfReader(source)
        last_page = document.pages[-1]

        single_page = PdfWriter()
        single_page.add_page(last_page)

        stem = os.path.splitext(pdf_path)[0]
        output_filename = f"{stem}_last_page.pdf"
        with open(output_filename, "wb") as target:
            single_page.write(target)
        print("created last page")

In [23]:
get_last_pdf_page("Nativity_Example.pdf")

created last page


#### Merging PDFs 
+ get a list of pdfs
+ PdfMerger (PdfFileMerger)

In [24]:
def fetch_all_pdf_files(parent_folder: str):
    """Return the paths of all PDF files found below ``parent_folder``.

    The folder is walked recursively with ``os.walk``. A file counts as a PDF
    when its name ends in ``.pdf`` regardless of letter case, so ``REPORT.PDF``
    is included. The result is sorted so that callers which merge the files
    get the same order on every run instead of a filesystem-dependent one.

    Args:
        parent_folder: Directory to search.

    Returns:
        A sorted list of paths, each built by joining the walked directory
        with the file name. The list is empty when nothing matches or the
        folder does not exist.
    """
    target_files = []
    for path, _subdirs, files in os.walk(parent_folder):
        target_files.extend(
            os.path.join(path, name)
            for name in files
            if name.lower().endswith(".pdf")
        )
    return sorted(target_files)

In [25]:
fetch_all_pdf_files("./Out")

[]

In [26]:
from PyPDF2 import PdfMerger

In [27]:
def merge_pdf(list_of_pdfs, output_filename="final_merged_file.pdf"):
    """Concatenate several PDFs into a single output file.

    Args:
        list_of_pdfs: Iterable of inputs accepted by ``PdfMerger.append``
            (the notebook passes file paths). They are appended in the
            order given.
        output_filename: Path of the merged PDF to create.

    The output file is opened only after every input has been appended, so a
    failing input no longer leaves an empty or truncated output behind. The
    merger is always closed afterwards so it does not keep its inputs open.
    """
    merger = PdfMerger()
    try:
        for pdf_file in list_of_pdfs:
            merger.append(pdf_file)
        with open(output_filename, "wb") as out:
            merger.write(out)
    finally:
        # Close the merger even when appending or writing fails.
        merger.close()

In [28]:
pdf_list = fetch_all_pdf_files("./Out")

In [29]:
merge_pdf(pdf_list)

### How to Rotate A PDF Page

In [30]:
def rotate_pdf(pdf_path, page_num: int, rotation: int = 90):
    """Extract one page of a PDF, rotate it and save it as a new PDF.

    Args:
        pdf_path: Path of the source PDF.
        page_num: Zero-based index into ``reader.pages`` of the page to
            rotate.
        rotation: Angle passed to ``PageObject.rotate``.
            NOTE(review): PyPDF2 is expected to accept only multiples
            of 90 here - confirm against the library documentation.

    The result is written next to the input as
    ``<input-without-extension>_<rotation>_rotated_page.pdf`` and a
    confirmation message is printed.
    """
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        writer = PdfWriter()
        writer.add_page(reader.pages[page_num])
        # The writer holds exactly one page, so it sits at index 0 whatever
        # ``page_num`` was. Indexing the writer with ``page_num`` raised
        # IndexError for every page except the first.
        writer.pages[0].rotate(rotation)
        filename = os.path.splitext(pdf_path)[0]
        output_filename = f"{filename}_{rotation}_rotated_page.pdf"
        with open(output_filename, "wb") as out:
            writer.write(out)
        print("rotated page")
        

In [31]:
rotate_pdf("Nativity_Example_last_page.pdf",0)

rotated page


In [32]:
rotate_pdf("Nativity_Example_last_page.pdf",0,180)

rotated page


#### Working with Images in PDF
+ How to extract images from pdf
+ How to convert images to pdf

In [33]:
from PyPDF2 import PdfReader

In [34]:
def extract_images_from_pdf(pdf_path, output_dir="."):
    """Save every image embedded in ``pdf_path`` as a separate file.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory that receives the image files. It is created
            when missing and defaults to the current working directory,
            which is where images were always written before.

    Returns:
        The list of file paths written, in extraction order.

    Each image keeps the name reported by PyPDF2. When a later image reuses
    a name already written during this call, the file is prefixed with
    ``page<N>_`` (1-based page number) instead of silently overwriting the
    earlier image.
    """
    import os  # local import keeps the block self-contained

    os.makedirs(output_dir, exist_ok=True)
    written_paths = []
    used_names = set()
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page_number, page in enumerate(reader.pages, start=1):
            for image in page.images:
                # Keep only the final path component so an unusual embedded
                # name cannot escape ``output_dir``.
                name = os.path.basename(image.name)
                if name in used_names:
                    name = f"page{page_number}_{name}"
                used_names.add(name)

                target = os.path.join(output_dir, name)
                with open(target, "wb") as out:
                    out.write(image.data)
                written_paths.append(target)
    return written_paths
                    
                
        

In [35]:
extract_images_from_pdf("ML_Wiki.pdf")

#### How to Convert Img to PDF

In [36]:
from PIL import Image

In [37]:
def convert_img2pdf(image_file):
    """Convert an image file into a single-page PDF stored next to it.

    Args:
        image_file: Path of the image to convert.

    The image is converted to RGB mode before saving, and the output path is
    the input path with its extension replaced by ``.pdf``. The source image
    is opened in a ``with`` block so its file handle is released once the
    PDF has been written; before, it was left open.
    """
    filename = f"{os.path.splitext(image_file)[0]}.pdf"
    with Image.open(image_file) as my_image:
        img = my_image.convert("RGB")
        img.save(filename)

In [38]:
convert_img2pdf("X8.png")