In [1]:
import io
import re
import pandas as pd

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from PyPDF2 import PdfFileReader, PdfFileWriter

report_path = r'example_punch.pdf'
sorted_report_path = r'emp_sorted_report.pdf'

def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as a single string.

    Every page is fed through a pdfminer ``TextConverter`` that writes into an
    in-memory text buffer; the buffer's contents are returned once all pages
    have been processed.

    Args:
        path: filesystem path of the PDF; the file is opened in binary mode.

    Returns:
        str: everything the converter wrote to the buffer.

    Raises:
        OSError: if ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing propagates unchanged;
        the file, converter and buffer are still closed in that case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # An empty page-number set and maxpages=0 are pdfminer's
            # "no restriction" values, so every page is visited.
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before the finally-clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [2]:
#extract the text from the pdf
#convert_pdf_to_txt returns the whole report as one string; the cells below clean and parse it
report_txt = convert_pdf_to_txt(report_path)

In [3]:
#remove the cids
#"(cid:N)" markers are dropped together with their trailing newline; N may have
#any number of digits (the previous \d\d only caught two-digit ids)
cid_pattern = re.compile(r'\(cid:\d+\)\n')
cid_removed = cid_pattern.sub('', report_txt)

#clean up the emp number
#rewrite the "Emp # " label as "EmpID " so the ids can be found with one simple regex later
emp_id_pattern = re.compile(r'Emp\s#\s')
emp_id_fix = r'EmpID '
emp_fixed = emp_id_pattern.sub(emp_id_fix, cid_removed)

In [4]:
#flatten the text onto one line: every newline becomes a single space
cleaned = ' '.join(emp_fixed.split('\n'))

In [5]:
#collect every "EmpID <digits>" occurrence, in the order it appears in the text
ids = [match.group(0) for match in re.finditer(r'EmpID\s[\d]+', cleaned)]

In [6]:
#one row per matched id, in a single 'Emp_ID' column
df = pd.DataFrame(ids, columns=['Emp_ID'])

In [None]:
#disabled cell: the code is wrapped in a string literal, so none of it is executed
'''#get all the page numbers
pages = re.findall(r'Page:\s[\d]+', cleaned)
pages'''

In [None]:
#disabled cell: the code is wrapped in a string literal, so none of it is executed
#(it refers to `pages`, which is only assigned inside another disabled cell)
'''#delete the report page
del pages[-1]
pages'''

In [None]:
#disabled cell: the code is wrapped in a string literal, so the 'Page' column is never added
'''#add pages to the df
df['Page'] = pages
df'''

In [None]:
#disabled cell: the code is wrapped in a string literal, so get_nums is never defined
#the Emp_ID conversion is done inline by the next live cell instead
'''#extract the numbers
def get_nums(col):
    df[col] = df[col].str.extract(r'(\d+)').astype(int)
get_nums('Emp_ID')
get_nums('Page')
df'''

In [7]:
#keep only the digits of each "EmpID <digits>" string and store them as integers
df['Emp_ID'] = df['Emp_ID'].str.extract(r'(\d+)', expand=False).astype(int)

In [10]:
#each row of df is treated as one employee timesheet page
emp_pages = len(df.index)

#ask PyPDF2 how many pages the source pdf has in total
with open(report_path, 'rb') as infile:
    reader = PdfFileReader(infile)
    total_pages = reader.getNumPages()

#pages not accounted for by an employee id are taken to be the report itself
report_pages = total_pages - emp_pages
report_pages

4

In [12]:
#add the report page(s)
#each report page gets a negative placeholder id (-report_pages .. -1), appended after
#the employee rows; negative ids sort ahead of every real employee id
if report_pages < 0:
    #the old countdown loop never terminated on a negative count
    raise ValueError(
        'found more employee ids than pages in the pdf (report_pages=%d)' % report_pages
    )
if report_pages > 0:
    #build all placeholder rows at once; DataFrame.append was removed in pandas 2.0
    report_rows = pd.DataFrame({'Emp_ID': range(-report_pages, 0)})
    df = pd.concat([df, report_rows], ignore_index=True)
df

Unnamed: 0,Emp_ID
0,60
1,765
2,813
3,63
4,760
5,857
6,50
7,45
8,770
9,860


In [13]:
#sort by emp id
#mergesort is stable, so rows that share an Emp_ID keep their original relative
#order; the default quicksort gives no such guarantee
df.sort_values(by=['Emp_ID'], kind='mergesort', inplace=True)
df.head()

Unnamed: 0,Emp_ID
30,-4
31,-3
32,-2
33,-1
10,44


In [None]:
#disabled cell: the code is wrapped in a string literal, so none of it is executed
#(it reads a 'Page' column that is only created inside another disabled cell)
'''#get the desired page order
page_order = df['Page'].tolist()
page_order'''

In [None]:
#disabled cell: the code is wrapped in a string literal, so none of it is executed
'''#add a final page for the report
page_order.append(1 + max(page_order))
page_order'''

In [15]:
#the index labels of the sorted frame, as a plain list, give the new page order
page_order = df.index.values.tolist()
print(page_order)

[30, 31, 32, 33, 10, 7, 6, 11, 0, 3, 19, 22, 25, 21, 18, 29, 26, 27, 28, 23, 20, 17, 24, 4, 13, 1, 15, 8, 2, 12, 5, 9, 16, 14]


In [16]:
#reorder pages into a new pdf
#page_order holds row labels of df; each one is used as a 0-based page index into the source pdf
writer = PdfFileWriter()
with open(report_path, 'rb') as infile:
    
    reader = PdfFileReader(infile)
    #queue the source pages on the writer in their new order
    for entry in page_order:
        writer.addPage(reader.getPage(entry))

    #NOTE(review): the output is written while infile is still open - presumably PyPDF2
    #reads page content from the source stream at write time; confirm before restructuring
    with open(sorted_report_path, 'wb') as outfile:
        writer.write(outfile)