# Imports

In [1]:
# Clear the interactive namespace (forced, via -f) before anything else in the notebook runs.
%reset -f

In [2]:
import io
from pathlib import Path

from pdfminer.high_level import extract_text
import fitz # from pymupdf
import PIL.Image
import tabula

In [3]:
# Path of the PDF that the text, image and table extraction cells all read.
FILENAME = "statement.pdf"

# Reading contents

## Text extraction (extract all text from the PDF)

In [4]:
# Extract the document's text into a single string; the bare name on the
# last line echoes that string as the cell output.
text = extract_text(pdf_file=FILENAME)
text

"Financial Statements for the Year Ended \n31 December 2021 \n\nTogether with Auditor’s Report \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\x0cTable of Contents \n\n  Page \n----------------------------------------------------------------------------------------------------------------------------- ---- \n\nIndependent Auditor’s Report \nIncome Statement \nStatement of Comprehensive Income \nStatement of Financial Position   \nStatement of Changes in Members’ Equity \nStatement of Cash Flows \nNotes to the Financial Statements \n\n2 –     5 \n          6 \n          7 \n          8 \n          9 \n        10 \n11 – 74 \n\n1 \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 

## Extract all images

In [5]:
# Open the PDF with PyMuPDF. Iterating over this object yields its pages;
# the bare name on the last line echoes the document's repr.
pages = fitz.open(filename=FILENAME)
pages

Document('statement.pdf')

In [6]:
# Write every image referenced by every page into ./images.
#
# * The directory is created up front so the cell works on a fresh checkout.
# * File names carry both the page index and the per-page image index, so
#   several images on one page no longer overwrite each other.
# * The bytes returned by `extract_image` are written unchanged, together
#   with the extension it reports, so nothing is re-encoded and formats the
#   imaging library cannot write do not abort the loop.
image_dir = Path('./images')
image_dir.mkdir(parents=True, exist_ok=True)

for page_idx, page in enumerate(pages):
    for img_idx, image in enumerate(page.get_images()):
        # The first item of each `get_images` entry identifies the image
        # inside the document (its xref, per the PyMuPDF documentation).
        base_img = pages.extract_image(image[0])
        extension = base_img['ext']
        out_path = image_dir / f'image_{page_idx:02}_{img_idx:02}.{extension}'
        out_path.write_bytes(base_img['image'])

## Extract tables

In [7]:
# Ask tabula for the tables on all pages of the PDF; the result is a
# sequence of tables whose length is reported below.
dfs = tabula.read_pdf(input_path=FILENAME, pages='all')
print(f'Extracted a total of {len(dfs)} tables.')

Oct 22, 2023 1:31:26 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:27 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:28 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:28 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:29 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:29 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:29 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:30 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:30 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:31 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:31 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:32 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Oct 22, 2023 1:31:32 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>

Extracted a total of 50 tables.


In [8]:
# Display the first extracted table as a sanity check.
# NOTE(review): in the recorded output the column labels look like a data row
# ("Independent Auditor’s Report", "2 –5") — presumably the first table row
# was taken as the header; confirm whether that is intended.
dfs[0]

Unnamed: 0,Independent Auditor’s Report,2 –5
0,Income Statement,6
1,Statement of Comprehensive Income,7
2,Statement of Financial Position,8
3,Statement of Changes in Members’ Equity,9
4,Statement of Cash Flows,10
5,Notes to the Financial Statements,11 – 74


# Save to files

In [9]:
# Persist the extracted text to ocr.txt.
# The encoding is set explicitly: the text contains non-ASCII characters
# (e.g. ’ and –), and relying on the platform's default encoding can raise
# UnicodeEncodeError or produce files that differ between machines.
with open('ocr.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [10]:
# Write each extracted table to tables/table_NN.csv, labelling the index
# column "Index". The output directory is created first so the cell does not
# fail when it is run on a fresh checkout.
tables_dir = Path('tables')
tables_dir.mkdir(parents=True, exist_ok=True)

for idx, df in enumerate(dfs):
    df.to_csv(tables_dir / f'table_{idx:02}.csv', index_label='Index')