# **PDF PROCESSING WITH PYTHON**

# **PyPDF2**

In [None]:
# Importing the standard-library os module and the PyPDF2 library
import os

import PyPDF2

In [None]:
# Open the sample PDF files in binary mode.

# Paths to the two sample PDFs. Later cells derive their output file names
# from path1.
path1 = "/content/sample_data/pdf_sample1.pdf"
path2 = "/content/sample_data/pdf_sample2.pdf"

# Open both PDFs as binary file objects. The handles are not closed in this
# cell; pdf_sample1_obj is passed to the PDF reader created in the next cell.
pdf_sample1_obj = open(path1, "rb")
pdf_sample2_obj = open(path2, "rb")

In [None]:
# Create a PdfFileReader object from the PyPDF2 module by passing it the
# open PDF file object. Later cells reuse pdfReader_sample1 as their input.
pdfReader_sample1 = PyPDF2.PdfFileReader(pdf_sample1_obj)

# Print the type of pdfReader_sample1 to confirm which class was created.
print("type: ", type(pdfReader_sample1))

type:  <class 'PyPDF2._reader.PdfFileReader'>


In [None]:
# Read the number of pages in the PDF from the reader's numPages attribute
# and print it.
sample1_page_count = pdfReader_sample1.numPages
print("Number of Pages: ", sample1_page_count)

Number of Pages:  4


In [None]:
# Show the document information stored in the PDF.

# The bare expression on the last line is displayed as the cell's output.
print("PDF Sample Extra Details:")
pdfReader_sample1.documentInfo

PDF Sample Extra Details:


{'/Author': 'pankaj chauhan',
 '/CreationDate': "D:20220606170745+05'30'",
 '/Creator': 'Microsoft® Word 2016',
 '/ModDate': "D:20220606170745+05'30'",
 '/Producer': 'Microsoft® Word 2016'}

In [None]:
# Get a single page from the PDF reader object.

# The reader's getPage() method takes a page number (starting from index 0)
# and returns the corresponding page object.
# Here the first page is fetched and its type is printed.
pageObj_sample1 = pdfReader_sample1.getPage(0)
print("Type: ", type(pageObj_sample1))

Type:  <class 'PyPDF2._page.PageObject'>


In [None]:
# Extract the text of the page object with extractText() and print it.
page_texts_sample1 = pageObj_sample1.extractText()
print("Texts in sample pdf file: \n", page_texts_sample1)

Texts in sample pdf file: 
  
1
 
| 
P a g e
 
 
Chapter 1
 
GIT
 
asDistributed Version Control system
 
 
1.1.
GIT
 
GIT
 
isthe most popular 
Version Control System
 
nowadays becauseitis an open
-
source software 
whichis easy to handle and 
perform work on various projects. 
GIT Version Control System
 
(
GIT 
VERSION CONTROL SYSTEM
)is asystem that manages thedevelopment of an evolving object. In 
other words, it is asystem thatrecordsany changes made by the software developers.There are al
ot 
of uses for 
GIT VERSIONCONTROL SYSTEM
 
insoftware development that makes the 
development process easier and faster. 
GITVERSION CONTROL SYSTEM
 
is also known as 
Revision Control System.
 
 
In the software development process, itis normal for software developers to continuallymake 
changes in pieces ofcodesand otherfiles thatinvolve addition anddeletion of afeature. It isrealized 
that severalrevisions will be made before producing th
e final version. It is difficult to manage and 
or

In [None]:
# Saving PDF File.

def save_pdf(pdfWriter_obj, output_path):
  """Write a PyPDF2 writer (or merger) object to a file and report the path.

  Input: (object exposing write(file_obj), destination file path)
  Output: None. A confirmation line is printed once the file is written.

  The destination is opened in binary write mode, so an existing file at
  output_path is overwritten. Any exception raised by open() or by
  pdfWriter_obj.write() propagates to the caller.
  """
  # The with-statement closes the file handle even if write() raises, so a
  # failed save does not leave an open handle behind.
  with open(output_path, 'wb') as file_obj:
    pdfWriter_obj.write(file_obj)

  print("File saved succesfuly as: ", output_path)


In [None]:
# Rotating PDF Pages:

# We already have a PdfFileReader object, so we can iterate over all of its
# pages and append each page, after rotation, to a new PDF writer object.

# The angle of rotation should be a multiple of 90 degrees.

def rotate_pdf(pdfReader_obj, r_angle):
  """Collect every page of the reader, rotated clockwise, into a new writer.

  Input: (PyPDF2 Reader Object, rotation angle)
  Output: PyPDF2 Writer Object.
  """
  rotated_writer = PyPDF2.PdfFileWriter()

  total_pages = pdfReader_obj.numPages
  page_no = 0
  while page_no < total_pages:
    # The rotation is applied to the page object returned by the reader,
    # and that same object is then handed to the writer.
    # NOTE(review): the change may therefore persist for later cells that
    # reuse the same reader - confirm.
    current_page = pdfReader_obj.getPage(page_no)
    current_page.rotateClockwise(r_angle)
    rotated_writer.addPage(current_page)
    page_no += 1

  return rotated_writer


# Rotate every page of the sample PDF by 90 degrees and save the result
# next to the input file with a "_rotated" suffix.
if __name__ == "__main__":
  print("Rotating PDF file.")
  rotated_pdf = rotate_pdf(pdfReader_sample1, 90)
  
  # Derive the output name from path1 and write the rotated PDF to disk.
  output_path = path1.replace(".pdf", "_rotated.pdf")
  save_pdf(rotated_pdf, output_path)

Rotating PDF file.
File saved succesfuly as:  /content/sample_data/pdf_sample1_rotated.pdf


In [None]:
# Merging PDF Files:

def merge_pdf(pdf_to_merge):
  """Append the given PDFs, in order, to a new PyPDF2 merger object.

  Input: (list of pdf paths to merge)
  Output: PyPDF2  Merge Object.
  """
  merger = PyPDF2.PdfFileMerger()

  # Bind the append method once, then feed it each source in turn.
  append_source = merger.append
  for source_pdf in pdf_to_merge:
    append_source(source_pdf)

  return merger


# Merge the two sample PDFs and save the combined file.
if __name__ == "__main__":
  print("Merging PDF file.")
  
  # Two sample PDFs are merged here (pdf_sample1.pdf and pdf_sample2.pdf);
  # merge_pdf accepts a list, so more paths can be passed as well.
  merged_pdf = merge_pdf([path1, path2])

  # Derive the output name from path1 and write the merged PDF to disk.
  output_path = path1.replace(".pdf", "_merged.pdf")
  save_pdf(merged_pdf, output_path)


Merging PDF file.
File saved succesfuly as:  /content/sample_data/pdf_sample1_merged.pdf


In [None]:
# Splitting PDF File.
# ----------------------------------------------------------------

def split_pdf(pdfReader_obj, output_dir):
  """Write each page of the reader to its own single-page PDF file.

  Input: (PyPDF2 Reader object, output directory to save)
  Output: None.

  Files are named split_<index>.pdf, where <index> is the zero-based page
  number, and are placed inside output_dir. The directory is created if it
  does not exist yet. An empty output_dir means the current working
  directory.
  """
  # Create the target directory up front so the first open() cannot fail
  # merely because the folder is missing.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  for page in range(pdfReader_obj.getNumPages()):
    # A fresh writer per page, so every output file holds exactly one page.
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(pdfReader_obj.getPage(page))

    # os.path.join avoids doubled or missing separators in the file name.
    output_file_name = os.path.join(output_dir, "split_" + str(page) + ".pdf")

    with open(output_file_name, 'wb') as output_pdf:
      pdf_writer.write(output_pdf)


# Split the sample PDF into one file per page, written to the sample_data
# directory.
if __name__ == "__main__":
  print("Spliting PDF file.")
  split_pdf(pdfReader_sample1, "/content/sample_data")

Spliting PDF file.


In [None]:
# Encrypting PDF File:

def encrypt_pdf(pdfReader_obj, user_password="",
                owner_password="", use_128bit=False):
  """Copy every page of the reader into a new writer and encrypt it.

  Input: (PyPDF2 Reader object, user password,
          owner password, True if want to use 128 bit encryption
          else 40 bit is used.)
  Output: PyPDF2 Writer Object.

  An empty owner_password is treated as "not given", so the owner password
  falls back to the user password instead of being set to an empty string.
  Note that this wrapper defaults to use_128bit=False (40-bit), whereas
  PyPDF2's own encrypt() defaults to 128-bit.
  """
  pdf_writer = PyPDF2.PdfFileWriter()

  # Copy all pages from the reader into the writer before encrypting.
  for page in range(pdfReader_obj.getNumPages()):
    pdf_writer.addPage(pdfReader_obj.getPage(page))

  # .encrypt() takes the user password, the owner password, and whether
  # 128-bit (True) or 40-bit (False) encryption should be applied.
  # Passing None for an empty owner password lets PyPDF2 apply its
  # documented default (owner password = user password); forwarding ""
  # would leave the file with an empty owner password.
  pdf_writer.encrypt(user_pwd=user_password,
                     owner_pwd=owner_password or None,
                     use_128bit=use_128bit)

  return pdf_writer


# Encrypt the sample PDF with a user password and save the result.
if __name__ == "__main__":
  print("Encrypting PDF file.")

  # The pdf_sample1 reader is used as the input for encryption.
  # NOTE(review): the password below is a hard-coded sample value; real
  # credentials should not be stored in a notebook.
  user_password = "password@1234"

  # owner_password is passed as an empty string and 128-bit encryption is
  # requested explicitly.
  e_pdfWriter = encrypt_pdf(pdfReader_sample1,
                            user_password=user_password,
                            owner_password="",
                            use_128bit=True)

  # Derive the output name from path1 and write the encrypted PDF to disk.
  output_path = path1.replace(".pdf", "_encrypted.pdf")
  save_pdf(e_pdfWriter, output_path)



Encrypting PDF file.
File saved succesfuly as:  /content/sample_data/pdf_sample1_encrypted.pdf


In [None]:
# Decrypting PDF File.

def decrypt_pdf(pdfReader_obj, decrypt_password):
  """Copy every page of the reader into a new, unencrypted writer.

  Input: (PyPDF2 Reader object, decrypt password)
  Output: PyPDF2 Writer Object.

  If the reader reports that the file is encrypted, it is unlocked with
  decrypt_password first. Pages are copied in both cases, so an input that
  is not encrypted yields a writer holding all of its pages rather than an
  empty one.
  """
  pdfWriter_decrypt = PyPDF2.PdfFileWriter()

  # Unlock the reader only when the file is actually encrypted.
  if pdfReader_obj.isEncrypted:
    print("File is encrypted")
    pdfReader_obj.decrypt(decrypt_password)

  # Copy the pages outside the branch above so unencrypted files are
  # handled too. A separate index name avoids shadowing the loop variable
  # with the page object.
  for page_index in range(pdfReader_obj.numPages):
    pdfWriter_decrypt.addPage(pdfReader_obj.getPage(page_index))

  return pdfWriter_decrypt


# Decrypt the PDF that was encrypted in the previous cell and save the
# unlocked copy.
if __name__ == "__main__":
  print("Derypting PDF file.")

  # Password and path of the encrypted file produced by the encryption cell.
  user_password = "password@1234"
  inp_path = "/content/sample_data/pdf_sample1_encrypted.pdf"

  # Open the encrypted file in a with-block so its handle is closed when the
  # cell finishes instead of being left open. The save happens inside the
  # block so the source file is still open while the writer is written out.
  with open(inp_path, "rb") as encrypted_file:
    # create PyPDF2 Reader Object.
    encrypted_pdf = PyPDF2.PdfReader(encrypted_file)

    d_pdfWriter = decrypt_pdf(encrypted_pdf, user_password)

    # Derive the output name from path1 and write the decrypted PDF to disk.
    output_path = path1.replace(".pdf", "_decrypted.pdf")
    save_pdf(d_pdfWriter, output_path)

Derypting PDF file.
File is encrypted
File saved succesfuly as:  /content/sample_data/pdf_sample1_decrypted.pdf


In [None]:
# Cropping PDF Files
def crop_pdf(pdfReader_obj, trim_box=((25, 25), (225, 225)),
             crop_box=((50, 50), (200, 200))):
  """Set the trim box and crop box of every page and collect the pages.

  Input: (PyPDF2 Reader Object,
          trim_box as ((lower-left x, y), (upper-right x, y)),
          crop_box as ((lower-left x, y), (upper-right x, y)))
  Output: PyPDF2 Writer Object

  Both boxes are optional; the defaults are the coordinates this notebook
  used originally, so crop_pdf(reader) behaves as before.
  NOTE(review): coordinates are presumably in PDF points - confirm.
  """
  print("Cropping PDF File.")

  output_obj = PyPDF2.PdfFileWriter()

  trim_lower_left, trim_upper_right = trim_box
  crop_lower_left, crop_upper_right = crop_box

  for i in range(pdfReader_obj.numPages):
    page = pdfReader_obj.getPage(i)

    # The boxes are set on the page object returned by the reader, and that
    # same object is then added to the writer.
    page.trimBox.lowerLeft = trim_lower_left
    page.trimBox.upperRight = trim_upper_right
    page.cropBox.lowerLeft = crop_lower_left
    page.cropBox.upperRight = crop_upper_right

    output_obj.addPage(page)

  return output_obj


# Crop the sample PDF and save the result with a "_cropped" suffix.
if __name__ == "__main__":
  print("Cropping PDF file.")

  # Crop every page of the sample reader; the result is a writer object.
  cropped_pdfWriter_obj = crop_pdf(pdfReader_sample1)

  # Derive the output name from path1 and write the cropped PDF to disk.
  output_path = path1.replace(".pdf", "_cropped.pdf")
  save_pdf(cropped_pdfWriter_obj, output_path)
  
  print("PDF file is cropped.")

Cropping PDF file.
Cropping PDF File.
File saved succesfuly as:  /content/sample_data/pdf_sample1_cropped.pdf
PDF file is cropped.


In [None]:
# Scaling PDF Files.

def scale_pdf(pdfReader_obj, scale_factor):
  """Scale every page of the reader and collect the pages in a new writer.

  Input: (PyPDF2 Reader Obj, scale factor)
  Output: PyPDF Writer Obj
  """
  scaled_writer = PyPDF2.PdfFileWriter()

  page_numbers = range(pdfReader_obj.numPages)
  for page_no in page_numbers:
    # scaleBy() is called on the page object returned by the reader before
    # that object is added to the writer.
    scaled_page = pdfReader_obj.getPage(page_no)
    scaled_page.scaleBy(scale_factor)
    scaled_writer.addPage(scaled_page)

  return scaled_writer


# Scale the sample PDF and save the result with a "_scaled" suffix.
if __name__ == "__main__":
  print("Scaling PDF file.")

  # Scale every page of the sample reader by the given factor.
  # NOTE(review): a scale factor of 1 presumably leaves the page size
  # unchanged - confirm this is the intended demo value.
  scale_pdfWriter_obj = scale_pdf(pdfReader_sample1, 1)
  print("PDF file is scaled.")

  # Derive the output name from path1 and write the scaled PDF to disk.
  output_path = path1.replace(".pdf", "_scaled.pdf")
  save_pdf(scale_pdfWriter_obj, output_path)




Scaling PDF file.
PDF file is scaled.
File saved succesfuly as:  /content/sample_data/pdf_sample1_scaled.pdf


In [None]:
# Removing Images

def remove_images(pdfReader_obj):
  """Copy all pages into a new writer, then strip the images from it.

  Input: (PyPDF2 Reader Obj)
  Output: PyPDF Writer Obj

  NOTE: a function with this same name is defined again in the next cell.
  """
  image_free_writer = PyPDF2.PdfFileWriter()

  total_pages = pdfReader_obj.numPages
  page_no = 0
  while page_no < total_pages:
    image_free_writer.addPage(pdfReader_obj.getPage(page_no))
    page_no += 1

  # Image removal is requested on the writer once all pages are added.
  image_free_writer.remove_images()

  return image_free_writer

# Remove the images from the sample PDF. The result is stored under its own
# name so the writer produced by the cropping cell (cropped_pdfWriter_obj)
# is not overwritten. This cell does not save the result to disk.
rmi_pdfWriter_obj = remove_images(pdfReader_sample1)
print("Images Removed from PDF File.")

Images Removed from PDF File.


In [None]:
# Removing Images

def remove_images(pdfReader_obj):
  """Build a writer from all pages of the reader and remove its images.

  Input: (PyPDF2 Reader Obj)
  Output: PyPDF Writer Obj

  NOTE: this redefines the remove_images function from the previous cell;
  once both cells have run, this later definition is the one in effect.
  """
  stripped_writer = PyPDF2.PdfFileWriter()

  # Add the pages one by one, in their original order.
  for page_index in range(pdfReader_obj.numPages):
    current_page = pdfReader_obj.getPage(page_index)
    stripped_writer.addPage(current_page)

  # Remove the images only after every page has been added.
  stripped_writer.remove_images()
  return stripped_writer

# Remove the images from the sample PDF; the resulting writer is kept in
# rmi_pdfWriter_obj. This cell does not save the result to disk.
rmi_pdfWriter_obj = remove_images(pdfReader_sample1)
print("Images Removed from PDF File.")

Images Removed from PDF File.


In [None]:
# Compressing Content.

def compress_pdf(pdfReader_obj):
  """Compress the content streams of every page and collect the pages.

  Input: (PyPDF2 Reader Obj)
  Output: PyPDF Writer Obj
  """
  compressed_writer = PyPDF2.PdfFileWriter()

  page_count = pdfReader_obj.numPages
  for page_no in range(page_count):
    # Compression is requested on the page object returned by the reader,
    # which is then added to the writer.
    current_page = pdfReader_obj.getPage(page_no)
    current_page.compress_content_streams()
    compressed_writer.addPage(current_page)

  return compressed_writer

# Compress the sample PDF; the resulting writer is kept in
# compressed_pdfWriter_obj. This cell does not save the result to disk.
compressed_pdfWriter_obj = compress_pdf(pdfReader_sample1)
print("PDF File compressed.")

PDF File compressed.


PyPDF2 offers many more functions for PDF processing than the ones shown here.
Please refer to the official PyPDF2 documentation for further details.

In [None]:
###################################################################################################