In [8]:
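# NOTE: do not run "pip install fitz" -- that PyPI package is unrelated; the `fitz` module imported below is provided by PyMuPDF.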
#!pip install PyMuPDF

In [10]:
# Import Libraries
from typing import Tuple
from io import BytesIO
import os
import argparse
import re
import fitz

import pandas as pd

### settings xlsx

In [6]:
# Russian for "PROMOCODE"; the same literal is passed as the search string in the demo cell.
PROMO = "ПРОМОКОД"

In [2]:
# Read the settings workbook and collect its "promocodes" column as a plain list.
df_promo = pd.read_excel("settings.xlsx")
promo_list = list(df_promo["promocodes"])

In [3]:
promo_list

['abc', 'test', 'gfgdfsgdsgergsrg']

### read pdf

In [29]:
def extract_info(input_file: str):
    """
    Print and return basic information about a PDF file.

    Opens ``input_file`` with PyMuPDF, records whether the document is
    encrypted and, when it is not, copies every entry of the document
    metadata into the result. The collected values are printed to stdout.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        A tuple ``(True, info)`` where ``info`` maps "File", "Encrypted"
        (the strings "True"/"False") and, for unencrypted files, each
        metadata key to its value.
    """
    # Open the PDF
    pdfDoc = fitz.open(input_file)
    try:
        # snake_case property name, matching page_count / get_text /
        # search_for used by the other functions in this file.
        encrypted = pdfDoc.is_encrypted
        output = {
            "File": input_file, "Encrypted": ("True" if encrypted else "False")
        }
        # If PDF is encrypted the file metadata cannot be extracted
        if not encrypted:
            # Guard against a missing metadata dictionary.
            output.update(pdfDoc.metadata or {})
    finally:
        # Always release the document, even if reading the metadata fails.
        pdfDoc.close()
    # To Display File Info
    print("## File Information ##################################################")
    print("\n".join(f"{key}:{value}" for key, value in output.items()))
    print("######################################################################")
    return True, output

def search_for_text(lines, search_str):
    """
    Lazily yield every regex match of ``search_str`` found in ``lines``.

    Each line is scanned on its own with ``re.findall`` (case-sensitive),
    so a line containing several hits contributes several results, in the
    order they occur.
    """
    for text_line in lines:
        # findall returns all hits of one line; hand them out one by one.
        yield from re.findall(search_str, text_line)

def redact_matching_data(page, matched_values, promo, style):
    """
    Replace every occurrence of the matched values on ``page`` with ``promo``.

    For each distinct matched value the page is searched, a redaction
    annotation carrying ``promo`` as replacement text (no fill colour) is
    added over every hit area, and finally all redactions are applied.

    Args:
        page: Page object offering ``search_for``, ``add_redact_annot`` and
            ``apply_redactions``.
        matched_values: Iterable of matched strings (may be a generator).
        promo: Replacement text written into each redacted area.
        style: Extra keyword arguments forwarded to ``add_redact_annot``
            (e.g. ``fontname``, ``fontsize``); ``None`` means none.

    Returns:
        The number of values received in ``matched_values`` (duplicates
        included).
    """
    extra_args = style or {}
    matches_found = 0
    already_redacted = set()
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # search_for() already returns every occurrence of a value on the
        # page, so handling a repeated value again would only stack
        # duplicate annotations on the same areas.
        if val in already_redacted:
            continue
        already_redacted.add(val)
        for area in page.search_for(val):
            page.add_redact_annot(area, text=promo, fill=False, **extra_args)

    # Apply the redaction
    page.apply_redactions()
    return matches_found


def frame_matching_data(page, matched_values):
    """
    Draw a red rectangle annotation around every occurrence of the matches.

    Uses the snake_case PyMuPDF method names (``search_for``,
    ``add_rect_annot``, ``set_colors``) like the rest of this file.

    Args:
        page: Page object to search and annotate.
        matched_values: Iterable of matched strings (may be a generator).

    Returns:
        The number of values received in ``matched_values`` (duplicates
        included).
    """
    # Red as an RGB triple with components in 0..1, i.e. the value that
    # fitz.utils.getColor('red') evaluates to.
    red = (1, 0, 0)
    matches_found = 0
    already_framed = set()
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # search_for() returns every occurrence of a value on the page, so
        # a repeated value would only add duplicate frames.
        if val in already_framed:
            continue
        already_framed.add(val)
        for area in page.search_for(val):
            if isinstance(area, fitz.Rect):
                # Draw a rectangle around matched values
                annot = page.add_rect_annot(area)
                annot.set_colors(stroke=red)
                annot.update()
    return matches_found



def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple=None, action: str='Highlight', promo='', style=None):
    """
    Process the pages of the PDF File

    Searches the text of each selected page for ``search_str`` (a regular
    expression) and applies ``action`` to the matches, then writes the
    resulting document to ``output_file``.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path the processed PDF is written to (may equal
            ``input_file``; the document is saved to memory first).
        search_str: Regular expression searched line by line.
        pages: Optional collection of 0-based page numbers to restrict
            processing to; entries may be ints or strings. Falsy means
            all pages.
        action: 'Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline' or
            'Strikeout'; anything else is handled as 'Highlight'.
        promo: Replacement text used by the 'Redact' action.
        style: Optional keyword arguments for the redaction text
            (e.g. fontname/fontsize).

    Pages without any match are left untouched.
    """
    # A fresh dict per call: no shared mutable default, and None is accepted.
    style = dict(style) if style else {}
    # Compare page numbers as strings so both ints and strings select pages.
    wanted_pages = {str(p) for p in pages} if pages else None
    # Save the generated PDF to memory buffer
    output_buffer = BytesIO()
    total_matches = 0
    # Open the PDF
    pdfDoc = fitz.open(input_file)
    try:
        # Iterate through pages
        for pg in range(pdfDoc.page_count):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            # Select the page
            page = pdfDoc[pg]
            # Split page by lines
            page_lines = page.get_text("text").split('\n')
            # Materialise the generator: a generator object is always
            # truthy, so it cannot be used to test for "no matches".
            matched_values = list(search_for_text(page_lines, search_str))
            if not matched_values:
                continue
            if action == 'Redact':
                matches_found = redact_matching_data(page, matched_values, promo, style)
            elif action == 'Frame':
                matches_found = frame_matching_data(page, matched_values)
            elif action in ('Highlight', 'Squiggly', 'Underline', 'Strikeout'):
                # highlight_matching_data is expected to be defined elsewhere.
                matches_found = highlight_matching_data(
                    page, matched_values, action)
            else:
                matches_found = highlight_matching_data(
                    page, matched_values, 'Highlight')
            total_matches += matches_found
        print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
        # Save to output
        pdfDoc.save(output_buffer)
    finally:
        # Release the document even when processing or saving fails.
        pdfDoc.close()
    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())



def process_file(**kwargs):
    """
    To process one single file
    Redact, Frame, Highlight... one PDF File
    Remove Highlights from a single PDF File

    Recognised keyword arguments:
        input_file: Path of the PDF to process.
        output_file: Destination path; defaults to ``input_file``.
        search_str: Regular expression to look for.
        pages: Optional page selection forwarded unchanged.
        action: Redact, Frame, Highlight, Squiggly, Underline, Strikeout
            or Remove.
        promo: Replacement text for the Redact action; defaults to ''.
        style: Keyword arguments for the redaction text; defaults to {}.
    """
    input_file = kwargs.get('input_file')
    output_file = kwargs.get('output_file')
    if output_file is None:
        output_file = input_file

    pages = kwargs.get('pages')

    # Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
    action = kwargs.get('action')

    if action == "Remove":
        # Remove the Highlights except Redactions
        remove_highlght(input_file=input_file,
                        output_file=output_file, pages=pages)
        return

    search_str = kwargs.get('search_str')

    # Forwarding an explicit None would override process_data's defaults and
    # make the Redact path fail when it unpacks the style with **.
    promo = kwargs.get('promo')
    if promo is None:
        promo = ''
    style = kwargs.get('style')
    if style is None:
        style = {}

    process_data(input_file=input_file,
                 output_file=output_file,
                 search_str=search_str,
                 pages=pages,
                 action=action,
                 promo=promo,
                 style=style
                )


def is_valid_path(path):
    """
    Return ``path`` unchanged when it names an existing file or folder.

    Raises ValueError when ``path`` is empty/falsy or is neither an
    existing file nor an existing directory.
    """
    if not path:
        raise ValueError("Invalid Path")
    # Files and folders are both acceptable; anything else is rejected.
    if os.path.isfile(path) or os.path.isdir(path):
        return path
    raise ValueError(f"Invalid Path {path}")


In [31]:
# Demo run: print the template's file info, then redact the search string
# and write the replacement text into a new PDF.
template = "template.pdf"
output = "test.pdf"

# Text style forwarded to the redaction annotation.
Normal_style = {"fontname": "helv", "fontsize": 24}

extract_info(input_file=template)

process_file(
    input_file=template,
    output_file=output,
    search_str="ПРОМОКОД",
    # pages=[0, 1, 2],
    action="Redact",
    promo="test1123123",
    style=Normal_style,
)

## File Information ##################################################
File:template.pdf
Encrypted:False
format:PDF 1.3
title:My New Title Goes Here
author:
subject:
keywords:
creator:Adobe Illustrator 26.1 (Windows)
producer:Adobe PDF library 16.04
creationDate:D:20220920123717+05'00'
modDate:D:20220920133448+04'00'
trapped:
encryption:None
######################################################################
1 Match(es) Found of Search String ПРОМОКОД In Input File: template.pdf


In [None]:
# Entry cell driven by parsed arguments. parse_args() is not defined in the
# visible part of this file; its result is indexed like a mapping below
# (presumably a dict of command-line options -- TODO confirm).
args = parse_args()
# If File Path
# (nothing is done when input_path is not an existing regular file)
if os.path.isfile(args['input_path']):
    # Extracting File Info
    extract_info(input_file=args['input_path'])
    # Process a file
    # NOTE(review): neither promo nor style is forwarded here, so a
    # 'Redact' action gets no replacement text or text style from this call.
    process_file(
        input_file=args['input_path'], output_file=args['output_file'],
        search_str=args['search_str'] if 'search_str' in (args.keys()) else None,
        pages=args['pages'], action=args['action']
    )