In [22]:
import PyPDF2

def get_total_pages(pdf_path):
    """Return the number of pages in the PDF stored at ``pdf_path``.

    The file is opened in binary mode, wrapped in a ``PyPDF2.PdfReader``, and
    closed again by the ``with`` block before the count reaches the caller.
    """
    with open(pdf_path, "rb") as handle:
        # len() of the reader's page collection is the page count.
        return len(PyPDF2.PdfReader(handle).pages)

# Point at the sample PDF (path is relative to the working directory) and
# report how many pages it contains.
pdf_path = 'ir-q4-2016-full-announcement.pdf'
print(f"The PDF has {get_total_pages(pdf_path)} pages.")

The PDF has 20 pages.


In [21]:
import tabula
import fitz  # PyMuPDF
import os
from PIL import Image

def extract_table_as_png(pdf_path, margin=10):
    """Save every table tabula detects in a PDF as a cropped PNG image.

    Each page is scanned with ``tabula.read_pdf`` (JSON output). Pages that
    contain at least one table are rendered with PyMuPDF, and each table's
    bounding box, grown by ``margin`` on every side and clamped to the page
    image, is cropped out and written to
    ``result/<pdf name>/tables_png/page_<page>_table_<n>.png``.
    ``<n>`` counts tables across the whole document, not per page.

    Args:
        pdf_path: Path to the PDF file to process.
        margin: Extra space added around each detected table before cropping,
            in the same units as tabula's coordinates.

    Returns:
        None. The images are written to disk and one line is printed per table.

    Note:
        The crop assumes tabula's ``top``/``left``/``width``/``height`` values
        line up with pixel positions in the default-resolution pixmap
        (TODO confirm for rotated or cropped pages). Regions that end up empty
        after clamping are skipped instead of being handed to ``save``.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        pdf_name = os.path.basename(pdf_path).replace(".pdf", "")
        result_dir = f"result/{pdf_name}/tables_png"
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(result_dir, exist_ok=True)

        table_count = 0  # Running counter over all tables in the document

        for page_num in range(1, len(pdf_document) + 1):
            # tabula page numbers are 1-based.
            tables = tabula.read_pdf(
                pdf_path,
                pages=page_num,
                multiple_tables=True,
                output_format="json",
            )
            if not tables:
                # Nothing to crop, so skip rendering this page altogether.
                continue

            # PyMuPDF pages are 0-based; render the page once for all its tables.
            page_image = pdf_document[page_num - 1].get_pixmap()
            img = Image.frombytes(
                "RGB", (page_image.width, page_image.height), page_image.samples
            )

            for table in tables:
                # The counter advances for every detected table so file numbers
                # follow detection order even when a region is skipped below.
                table_count += 1
                table_image_path = f"{result_dir}/page_{page_num}_table_{table_count}.png"

                # Grow the bounding box by the margin, clamped to the page image.
                top = max(table['top'] - margin, 0)
                left = max(table['left'] - margin, 0)
                bottom = min(table['top'] + table['height'] + margin, img.height)
                right = min(table['left'] + table['width'] + margin, img.width)

                if right <= left or bottom <= top:
                    # An empty crop box cannot be written as an image.
                    print(f"Skipped {table_image_path}: table region is empty or outside the rendered page")
                    continue

                # Crop the table region and save it as PNG.
                img.crop((left, top, right, bottom)).save(table_image_path, "png")
                print(f"Saved {table_image_path}")
    finally:
        # Release the PyMuPDF document even if extraction or saving fails.
        pdf_document.close()

# Extract table images from the sample PDF using the function's default margin.
pdf_path = 'ir-q4-2016-full-announcement.pdf'
extract_table_as_png(pdf_path)


Saved result/ir-q4-2016-full-announcement/tables_png/page_1_table_1.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_1_table_2.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_2_table_3.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_3_table_4.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_3_table_5.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_5_table_6.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_6_table_7.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_7_table_8.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_7_table_9.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_8_table_10.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_8_table_11.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_9_table_12.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_11_table_13.png
Saved result/ir-q4-2016-full-announcement/tables_png/page_1

In [20]:
import tabula
import os

def extract_all_tables_to_csv(pdf_path, base_output_folder='result'):
    """Write each table tabula finds in a PDF to its own CSV file.

    Files are saved as
    ``<base_output_folder>/<pdf name>/tables_csv/table_<n>.csv``, where ``<n>``
    starts at 1 and follows the order in which ``tabula.read_pdf`` returns the
    tables. One confirmation line is printed per file.

    Args:
        pdf_path: Path to the PDF file to read.
        base_output_folder: Root directory under which the per-PDF folder is
            created.
    """
    # Folder name: the PDF's base name with ".pdf" removed.
    stem = os.path.basename(pdf_path).replace(".pdf", "")
    output_folder = os.path.join(base_output_folder, stem, 'tables_csv')

    # Make sure the destination exists before anything is written.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # pages="all" scans the whole document in a single tabula call.
    detected = tabula.read_pdf(pdf_path, pages="all", multiple_tables=True)

    for number, table in enumerate(detected, start=1):
        csv_file_path = os.path.join(output_folder, f"table_{number}.csv")
        # index=False keeps the row index out of the CSV.
        table.to_csv(csv_file_path, index=False)
        print(f"Saved Table {number} to '{csv_file_path}'")

# Path to the PDF whose tables should be exported (relative to the working directory).
# NOTE: this rebinds the notebook-level `pdf_path`; other cells set it to a different file.
pdf_path = 'ir-q1-2023-full-announcement.pdf'

# Export every detected table to CSV under the default 'result' folder.
extract_all_tables_to_csv(pdf_path)


Saved Table 1 to 'result/ir-q1-2023-full-announcement/tables_csv/table_1.csv'
Saved Table 2 to 'result/ir-q1-2023-full-announcement/tables_csv/table_2.csv'
Saved Table 3 to 'result/ir-q1-2023-full-announcement/tables_csv/table_3.csv'
Saved Table 4 to 'result/ir-q1-2023-full-announcement/tables_csv/table_4.csv'
Saved Table 5 to 'result/ir-q1-2023-full-announcement/tables_csv/table_5.csv'
Saved Table 6 to 'result/ir-q1-2023-full-announcement/tables_csv/table_6.csv'
Saved Table 7 to 'result/ir-q1-2023-full-announcement/tables_csv/table_7.csv'
Saved Table 8 to 'result/ir-q1-2023-full-announcement/tables_csv/table_8.csv'
Saved Table 9 to 'result/ir-q1-2023-full-announcement/tables_csv/table_9.csv'
Saved Table 10 to 'result/ir-q1-2023-full-announcement/tables_csv/table_10.csv'
Saved Table 11 to 'result/ir-q1-2023-full-announcement/tables_csv/table_11.csv'
Saved Table 12 to 'result/ir-q1-2023-full-announcement/tables_csv/table_12.csv'
Saved Table 13 to 'result/ir-q1-2023-full-announcement/tab