In [7]:
import PyPDF2

def get_total_pages(pdf_path):
    """Return the number of pages in the PDF file at ``pdf_path``.

    The file is opened in binary mode and handed to ``PyPDF2.PdfReader``;
    the page count is the length of the reader's ``pages`` collection.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        return len(reader.pages)

# Report the page count of the PDF; the relative path is resolved against the
# notebook's current working directory.
pdf_path = 'ir-q1-2023-full-announcement.pdf'
print(f"The PDF has {get_total_pages(pdf_path)} pages.")

The PDF has 9 pages.


In [8]:
from sentence_transformers import SentenceTransformer, util
import torch
from PIL import Image
import os
import io
from collections import defaultdict
import json
import re

In [9]:
def preprocess_context(context: str, max_chars: int = 77):
    """Split page text into short, non-blank lines usable as caption candidates.

    Args:
        context: Raw text to split (one candidate per line).
        max_chars: Maximum number of characters kept from each line.
            Defaults to 77 -- presumably chosen with CLIP's short text
            input limit in mind; TODO confirm.

    Returns:
        A list of the non-blank lines, in their original order, each cut to
        at most ``max_chars`` characters. Empty input yields an empty list.
    """
    # Remove literal "\uXXXX" escape sequences (backslash, "u", four hex digits).
    context = re.sub(r"\\u[0-9A-Fa-f]{4}", "", context)
    # Test the stripped text so whitespace-only lines are dropped too; they
    # carry no information and would otherwise compete as caption candidates.
    return [row[:max_chars] for row in context.split('\n') if row.strip()]

In [10]:
# Load the CLIP model (clip-ViT-B-32) through sentence-transformers.
# It is kept as a notebook-level global because extract_table_as_png uses it
# to encode both the page text lines and the cropped table images.
model = SentenceTransformer('clip-ViT-B-32')
def get_similar_sentences(image_emb, corpus_emb, top_k=1):
    """Rank candidate embeddings by cosine similarity to an image embedding.

    Args:
        image_emb: Embedding(s) of the query image. Only the first row of the
            resulting similarity matrix is used.
        corpus_emb: Embeddings of the candidate sentences.
        top_k: Maximum number of matches to return. It is clamped to the
            number of candidates so that asking for more matches than exist
            does not make ``torch.topk`` raise.

    Returns:
        The ``torch.topk`` result for the similarity scores; its ``values``
        and ``indices`` fields hold the best scores and the positions of the
        corresponding candidates in ``corpus_emb``.
    """
    # Row 0 holds the similarity of the first image to every candidate.
    cos_scores = util.cos_sim(image_emb, corpus_emb)[0]
    k = min(top_k, cos_scores.shape[0])
    return torch.topk(cos_scores, k=k)

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

0_CLIPModel/config.json:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

0_CLIPModel/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

0_CLIPModel/preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

0_CLIPModel/special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

0_CLIPModel/tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

0_CLIPModel/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

In [12]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.8-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.7 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.8 PyMuPDFb-1.23.7


In [16]:
import os
import re
import json
import torch
from PIL import Image
import tabula
import fitz
from sentence_transformers import SentenceTransformer, util


# Extract each detected table as a PNG and pick a caption for it with CLIP
def _table_crop_box(table, scale_factor, margin, img_width, img_height):
    """Return the (left, top, right, bottom) pixel crop box for one table.

    The table's 'top'/'left'/'width'/'height' entries are multiplied by
    ``scale_factor``; the box is then grown by ``margin`` pixels on every
    side and clamped to the image bounds.
    """
    top = table['top'] * scale_factor
    left = table['left'] * scale_factor
    bottom = (table['top'] + table['height']) * scale_factor
    right = (table['left'] + table['width']) * scale_factor
    return (
        max(left - margin, 0),
        max(top - margin, 0),
        min(right + margin, img_width),
        min(bottom + margin, img_height),
    )


def extract_table_as_png(pdf_path, margin=10, dpi=300, min_score=0.3):
    """Save every table tabula finds in a PDF as a PNG and caption it.

    For each page, tables are detected with ``tabula.read_pdf``, the page is
    rendered with PyMuPDF, and each table's region is cropped and written to
    ``result/<pdf name>/tables_png/page_<page>_table_<n>.png`` (``<n>`` counts
    tables across the whole document). The text line of the same page that is
    most similar to the table image, according to the global CLIP ``model``,
    is recorded as the table's caption when its similarity reaches
    ``min_score``. Captions are written to ``captions.json`` in the same
    directory.

    Relies on the notebook-level names ``model``, ``preprocess_context`` and
    ``get_similar_sentences``.

    Args:
        pdf_path: Path of the PDF to process.
        margin: Extra pixels added around each table's bounding box.
        dpi: Resolution used to render pages before cropping.
        min_score: Minimum cosine similarity for a line to be accepted as a
            caption; tables with no line at or above it get no caption.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    pdf_name = os.path.basename(pdf_path).replace(".pdf", "")
    result_dir = f"result/{pdf_name}/tables_png"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(result_dir, exist_ok=True)

    table_count = 0
    captions = {}
    # Table bounds are converted to rendered pixels with dpi / 72, i.e. they
    # are assumed to be expressed in PDF points (72 per inch).
    scale_factor = dpi / 72

    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(1, len(pdf_document) + 1):
            tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True, output_format="json")
            if not tables:
                # Nothing to crop on this page, so skip the costly rendering.
                continue

            # Convert the specific page to an image using PyMuPDF
            pdf_page = pdf_document[page_num - 1]
            page_image = pdf_page.get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", (page_image.width, page_image.height), page_image.samples)

            # The caption candidates are the same for every table on the page,
            # so extract and embed the page text once rather than per table.
            context = preprocess_context(pdf_page.get_text("text"))
            corpus_emb = model.encode(context, convert_to_tensor=True) if context else None

            # Process each extracted table
            for table in tables:
                table_count += 1
                caption_key = f"page_{page_num}_table_{table_count}"
                table_image_path = f"{result_dir}/{caption_key}.png"

                # Crop and save the table image
                crop_box = _table_crop_box(table, scale_factor, margin, img.width, img.height)
                table_image = img.crop(crop_box)
                table_image.save(table_image_path, "PNG")
                print(f"Saved {table_image_path}")

                if corpus_emb is None:
                    # No text on the page: the image is saved without a caption.
                    continue

                image_emb = model.encode([table_image], convert_to_tensor=True)
                # Only the single best match is needed. Writing several ranked
                # matches to the same key would leave the weakest one stored.
                top_results = get_similar_sentences(image_emb, corpus_emb, top_k=1)
                if top_results.values[0] < min_score:
                    continue
                captions[caption_key] = context[int(top_results.indices[0])]

    # Save captions to a JSON file
    with open(os.path.join(result_dir, "captions.json"), 'w') as caption_file:
        json.dump(captions, caption_file, indent=4)

# Run the table extraction on the PDF below. The call writes the table PNGs and
# captions.json under result/<pdf name>/tables_png and needs the `model`,
# `preprocess_context` and `get_similar_sentences` names from earlier cells.
pdf_path = 'ir-q1-2023-full-announcement.pdf'
extract_table_as_png(pdf_path)

Saved result/ir-q1-2023-full-announcement/tables_png/page_1_table_1.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_3_table_2.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_4_table_3.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_4_table_4.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_4_table_5.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_4_table_6.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_5_table_7.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_5_table_8.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_5_table_9.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_6_table_10.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_7_table_11.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_7_table_12.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_7_table_13.png
Saved result/ir-q1-2023-full-announcement/tables_png/page_7_

In [None]:
import tabula
import os

# save tables to csv format
def extract_all_tables_to_csv(pdf_path, base_output_folder='result'):
    """Write every table tabula detects in a PDF to its own CSV file.

    Files are named ``table_<n>.csv`` (numbered from 1 in detection order)
    and placed in ``<base_output_folder>/<pdf name>/tables_csv``, which is
    created when it does not exist yet. One line is printed per saved table.

    Args:
        pdf_path: Path of the PDF to read.
        base_output_folder: Root directory under which results are stored.
    """
    # Directory layout: <base>/<pdf file name without ".pdf">/tables_csv
    stem = os.path.basename(pdf_path).replace(".pdf", "")
    target_dir = os.path.join(base_output_folder, stem, 'tables_csv')
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    detected = tabula.read_pdf(pdf_path, pages="all", multiple_tables=True)

    # Number the tables from 1 so the file names match the printed messages.
    for number, frame in enumerate(detected, start=1):
        destination = os.path.join(target_dir, f"table_{number}.csv")
        frame.to_csv(destination, index=False)
        print(f"Saved Table {number} to '{destination}'")

# The path to the PDF file to export. This rebinds the notebook-level
# `pdf_path`, which earlier cells set to a different file.
# NOTE(review): the saved output of this cell mentions a different file name
# ('2016_annual_report Nestle'), so it looks stale -- re-run to refresh.
pdf_path = '2017_annual_report.pdf'

# Export every detected table to result/<pdf name>/tables_csv
extract_all_tables_to_csv(pdf_path)


Saved Table 1 to 'result/2016_annual_report Nestle/tables_csv/table_1.csv'
Saved Table 2 to 'result/2016_annual_report Nestle/tables_csv/table_2.csv'
Saved Table 3 to 'result/2016_annual_report Nestle/tables_csv/table_3.csv'
Saved Table 4 to 'result/2016_annual_report Nestle/tables_csv/table_4.csv'
Saved Table 5 to 'result/2016_annual_report Nestle/tables_csv/table_5.csv'
Saved Table 6 to 'result/2016_annual_report Nestle/tables_csv/table_6.csv'
Saved Table 7 to 'result/2016_annual_report Nestle/tables_csv/table_7.csv'
Saved Table 8 to 'result/2016_annual_report Nestle/tables_csv/table_8.csv'
Saved Table 9 to 'result/2016_annual_report Nestle/tables_csv/table_9.csv'
Saved Table 10 to 'result/2016_annual_report Nestle/tables_csv/table_10.csv'
Saved Table 11 to 'result/2016_annual_report Nestle/tables_csv/table_11.csv'
Saved Table 12 to 'result/2016_annual_report Nestle/tables_csv/table_12.csv'
Saved Table 13 to 'result/2016_annual_report Nestle/tables_csv/table_13.csv'
Saved Table 14 to