In [1]:
import fitz
import re
import os 
import pandas as pd
import numpy as np
import warnings
from tkinter import *
import sys
from tkinter import filedialog
from PIL import Image

In [2]:
# NOTE(review): with no category/module filter this hides every warning for the
# rest of the kernel session, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

# DEFINE FUNCTION

## COMBINE FUNCTION

In [3]:
def Combine_document(files_list:list,
                     output_filename:str,
                     save_location:str = r'C:\Users\trilnd\Desktop\QBO\File PDF\Checks_to_combine\Combine\Output'):
    """
    Merge the given files, in order, into one PDF named ``combine_<output_filename>.pdf``.

    Any entry whose extension is not ``.pdf`` is opened as an image, converted to RGB
    and written as ``<same path without extension>.pdf`` next to the source file
    (an existing file with that name is overwritten); that PDF is then merged instead.

    Parameters
    ----------
    files_list : list or tuple of str
        Paths chosen by the user. The sequence itself is never modified.
    output_filename : str
        Label embedded in the name of the combined file.
    save_location : str
        Existing folder that receives the combined PDF.

    Returns
    -------
    dict
        On success: ``Total_file``, ``combine_filename``, ``page_num`` and
        ``Save_location``. On any failure: a single ``Error`` key holding the
        exception instance (nothing is raised to the caller).
    """
    info_dict = dict()
    try:
        # Fail early with a clear message instead of a late error from the PDF library
        if not files_list:
            raise ValueError('files_list is empty: nothing to combine')

        # Collect the paths to merge in a new list so the caller's sequence is left
        # untouched (and so tuples, as returned by askopenfilenames, are accepted)
        pdf_paths = []
        for file in files_list:
            stem, ext = os.path.splitext(file)
            if ext.lower() == '.pdf':
                pdf_paths.append(file)
                continue

            # Non-PDF input: treat it as an image and write a PDF beside it
            path_pdf_file = f'{stem}.pdf'
            with Image.open(file) as img:
                img.convert(mode='RGB').save(path_pdf_file,save_all=True)
            print(path_pdf_file)
            pdf_paths.append(path_pdf_file)

        combine_file_name = f'combine_{output_filename}.pdf'
        location = os.path.join(save_location,combine_file_name)

        # Append every source PDF to a new empty document, closing each source
        # as soon as its pages have been copied
        blank_doc = fitz.open()
        try:
            for file in pdf_paths:
                document = fitz.open(filename=file)
                try:
                    blank_doc.insert_pdf(docsrc=document)
                finally:
                    document.close()

            blank_doc.save(filename=location)
            # Read the page count before the document is closed
            total_pages = blank_doc.page_count
        finally:
            blank_doc.close()

        # Summary of the merge for the caller
        info_dict['Total_file'] = len(pdf_paths)
        info_dict['combine_filename'] = combine_file_name
        info_dict['page_num'] = total_pages
        info_dict['Save_location'] = save_location
        return info_dict

    except Exception as e:
        # Errors are reported through the result dict rather than raised
        info_dict['Error'] = e
        return info_dict

## Testing

In [4]:
# Create a Tk root only to host the dialog and keep its empty window hidden
root = Tk()
root.withdraw()

# Start browsing from the current user's Desktop (expanduser works on any OS)
desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')
files_list = list(filedialog.askopenfilenames(parent=root,
                                              initialdir=desktop_path,
                                              title='Choose a file'))

# Tear the root down instead of calling mainloop(), which would block this cell
# until the window is closed by hand
root.destroy()

# One selected file name per line, for display
path = '\n'.join(os.path.basename(file) for file in files_list)

In [5]:
# File names (directories stripped) of the current selection
[os.path.basename(file) for file in files_list]

['1880.pdf', '1892.pdf', '1895.pdf', '1899.pdf', '1900.pdf']

In [13]:
# Sanity check: each selected path refers to an existing regular file
[os.path.isfile(file) for file in files_list]

[True, True, True, True, True]

In [7]:
# Selected file names, one per line (string built in the file-selection cell)
print(path)

1880.pdf
1892.pdf
1895.pdf
1899.pdf
1900.pdf


In [10]:
# Merge the selected files into combine_test_jupyterlab.pdf; the explicit
# save_location overrides the function's default output folder.
result = Combine_document(files_list=files_list,
                          output_filename='test_jupyterlab',
                          save_location=r'C:\Users\trilnd\Desktop\QBO\File PDF\Split_Combine_PDF\Combine\Output')

In [11]:
# Summary dict returned by Combine_document (it holds only an 'Error' key on failure)
result

{'Total_file': 5,
 'combine_filename': 'combine_test_jupyterlab.pdf',
 'page_num': 5,
 'Save_location': 'C:\\Users\\trilnd\\Desktop\\QBO\\File PDF\\Split_Combine_PDF\\Combine\\Output'}

## SPLIT FUNCTION

In [3]:
def Split_document(source_doc, 
                   start_page:int, 
                   end_page:int, 
                   save_location):
    """
    Copy a page range of a PDF into a new file named ``split_<source name>.pdf``.

    Parameters
    ----------
    source_doc : str
        Path of the PDF to split,
        Ex: C:/Users/trilnd/Desktop/QBO/File PDF/Bank Statement/BofA.pdf
    start_page : int
        First page to extract, counted from 1.
    end_page : int
        Last page to extract, counted from 1.
    save_location : str
        Existing folder that receives the split PDF,
        Ex: C:/Users/trilnd/Desktop/QBO/File PDF/Bank Statement

    Returns
    -------
    dict
        On success: ``pdf_name``, ``page_num`` (page count of the source),
        ``split_from``, ``split_to``, ``Save_location`` and ``pdf_split_name``.
        On any failure: a single ``Error`` key holding the exception instance
        (nothing is raised to the caller).
    """
    info_dict = dict()

    try:
        # Page numbers are shifted by -1 below; a value under 1 would become a
        # negative index, which insert_pdf treats as "use the default page"
        # rather than as an error, silently extracting the wrong range.
        if start_page < 1 or end_page < 1:
            raise ValueError('start_page and end_page are 1-based and must be >= 1')

        # Derive the output name from the source name without its extension
        base_name = os.path.splitext(os.path.basename(source_doc))[0]
        split_file_name = f'split_{base_name}.pdf'
        location = os.path.join(save_location,split_file_name)

        document = fitz.open(filename=source_doc)
        try:
            num_pages = document.page_count

            # Copy the requested pages into a new empty document and save it
            blank_doc = fitz.open()
            try:
                blank_doc.insert_pdf(docsrc=document,from_page=start_page-1,to_page=end_page-1)
                blank_doc.save(filename=location)
            finally:
                blank_doc.close()
        finally:
            document.close()

        # Summary of the split for the caller
        info_dict['pdf_name'] = base_name
        info_dict['page_num'] = num_pages
        info_dict['split_from'] = start_page
        info_dict['split_to'] = end_page
        info_dict['Save_location'] = save_location
        info_dict['pdf_split_name'] = split_file_name
    except Exception as e:
        # Errors are reported through the result dict rather than raised
        info_dict['Error'] = e
        return info_dict
    else:
        return info_dict

### TESTING

In [4]:
# Sample input PDF and output folder used by the Split_document test below.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
_dir = r'C:\Users\trilnd\Desktop\QBO\File PDF\Checks_to_combine\Split\Citi_checking_1.pdf'
split_folder = r'C:\Users\trilnd\Desktop\QBO\File PDF\Checks_to_combine\Split\Output'

In [5]:
# end_page=100 is beyond the 8 pages of this sample (see 'page_num' in the saved
# result below); per that output the call still completes without an 'Error' key.
result =Split_document(source_doc=_dir,
                       start_page=5,
                       end_page=100,
                       save_location=split_folder)

In [6]:
# Summary dict returned by Split_document (it holds only an 'Error' key on failure)
result

{'pdf_name': 'Citi_checking_1',
 'page_num': 8,
 'split_from': 5,
 'split_to': 100,
 'Save_location': 'C:\\Users\\trilnd\\Desktop\\QBO\\File PDF\\Checks_to_combine\\Split\\Output',
 'pdf_split_name': 'split_Citi_checking_1.pdf'}

# BREAKDOWN FUNCTION

## Breakdown combine

In [20]:
# Make sure the output folder exists. makedirs also creates any missing parent
# folders and, with exist_ok=True, is a no-op when the folder is already there.
os.makedirs(split_folder, exist_ok=True)

In [106]:
# Create a Tk root only to host the dialog and keep its empty window hidden
root = Tk()
root.withdraw()

# Start browsing from the current user's Desktop (expanduser works on any OS)
desktop_path = os.path.join(os.path.expanduser('~'), 'Desktop')
files_list = list(filedialog.askopenfilenames(parent=root,
                                              initialdir=desktop_path,
                                              title='Choose a file'))

# Tear the root down instead of calling mainloop(), which would block this cell
# until the window is closed by hand
root.destroy()

In [90]:
# Confirm that the split output folder exists
os.path.isdir(split_folder)

True

In [100]:
# Current selection.
# NOTE(review): the saved output below is a tuple, while the selection cell wraps
# the dialog result in list(); the output appears to come from an earlier run.
files_list

('C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1764.PNG',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1772.PNG',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1773.PNG',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1890.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1891.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/9999.PNG')

In [92]:
# Replace every non-PDF entry of the selection with a PDF converted from it.
# Item assignment needs a list, so normalise first in case the selection is a tuple.
files_list = list(files_list)
for i, file in enumerate(files_list):
    # Split into path-without-extension and extension
    f, e = os.path.splitext(file)
    if e.lower() == '.pdf':
        continue

    # Treat the file as an image: convert to RGB and save it as a PDF beside the
    # source; the context manager releases the image file handle afterwards
    path_pdf_file = f'{f}.pdf'
    with Image.open(file) as img:
        img.convert(mode='RGB').save(path_pdf_file,save_all=True)
    print(path_pdf_file)

    # Point the selection at the converted file
    files_list[i] = path_pdf_file

C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1764.pdf
C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1765.pdf
C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1771.pdf
C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1772.pdf
C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/9999.pdf


In [93]:
# Selection after conversion: every entry should now end in .pdf.
# NOTE(review): the saved output lists files that are absent from the selection
# output shown earlier, so the saved outputs come from different runs.
files_list

['C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1764.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1765.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1771.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1772.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1881.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1885.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/1891.pdf',
 'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Combine/Test_pdf_png/9999.pdf']

In [94]:
# New empty PDF that will receive the pages of every selected file
blank_doc = fitz.open()
for file in files_list:

    # Open the source PDF and make sure it is closed once its pages are copied
    document = fitz.open(filename=file)
    try:
        # File name without directory or extension, for the progress line
        base_name = os.path.splitext(os.path.basename(file))[0]
        print('File name',base_name,"Total page number:",document.page_count,sep=' -- ')

        # Append all pages of the source to the combined document
        blank_doc.insert_pdf(docsrc=document)
    finally:
        document.close()

File name -- 1764 -- Total page number: -- 1
File name -- 1765 -- Total page number: -- 1
File name -- 1771 -- Total page number: -- 1
File name -- 1772 -- Total page number: -- 1
File name -- 1881 -- Total page number: -- 1
File name -- 1885 -- Total page number: -- 1
File name -- 1891 -- Total page number: -- 1
File name -- 9999 -- Total page number: -- 1


In [95]:
# Ask for the label to embed in the output file name
combine_file_name = input()
# NOTE(review): absolute, machine-specific output folder
save_location = r'C:\Users\trilnd\Desktop\QBO\File PDF\Checks_to_combine\Combine\Output'
# The raw label is replaced by the final file name: combine_<label>.pdf
combine_file_name = f'combine_{combine_file_name}.pdf'
location = os.path.join(save_location,combine_file_name)

120522-combine-pdf-png


In [96]:
# Write the combined document to the path stored in `location`
blank_doc.save(filename=location)

## Breakdown Split

In [10]:
# Sample PDF for the step-by-step split (absolute, machine-specific path)
dir_ = r'C:\Users\trilnd\Desktop\QBO\File PDF\Bank Statement\BofA.pdf'
# Alternative sample:
# r'C:\Users\trilnd\Desktop\QBO\File PDF\Checks_to_combine\Split\2022.08 SignatureChecking.pdf'
# File name without its extension; splitext avoids assuming a 4-character suffix
base_name = os.path.splitext(os.path.basename(dir_))[0]

In [11]:
# Open the sample PDF, print its page count and display its metadata dict
doc1 = fitz.open(filename=dir_)
print("Total page number:",doc1.page_count)
doc1.metadata

Total page number: 8


{'format': 'PDF 1.5',
 'title': '',
 'author': 'Bank of America',
 'subject': '',
 'keywords': '',
 'creator': 'Bank of America',
 'producer': 'TargetStream StreamEDS rv1.7.41 for Bank of America',
 'creationDate': '',
 'modDate': '',
 'trapped': '',
 'encryption': None}

In [26]:
# for page in doc1:
#     area = page.search_for("Summary")
#     print(area)

In [19]:
# Start a new, empty document to receive the extracted pages
blank_doc = fitz.open()

In [20]:
# from_page/to_page are passed straight through here (no "-1" shift as in
# Split_document); PyMuPDF page numbers are 0-based, so this copies the 6th and
# 7th pages of doc1.
blank_doc.insert_pdf(docsrc=doc1,from_page=5,to_page=6)

In [25]:
# Save the extracted pages as split_<base_name>.pdf, where base_name was set in
# the cell that defines dir_.
# NOTE(review): absolute, machine-specific output path.
split_doc = f'C:/Users/trilnd/Desktop/QBO/File PDF/Checks_to_combine/Split/Output/split_{base_name}.pdf'
blank_doc.save(filename=split_doc)

In [6]:
# root = Tk()
# filez = filedialog.askopenfilenames(parent=root,
#                                     initialdir=desktop_path,
#                                     title='Choose a file')
# filez
# root.mainloop()