In [None]:
from IPython.display import display, HTML, Markdown
from PIL import Image
import io
import base64
import pandas as pd
import time
import json
import os

# Function to display HTML and images
def display_html_and_image(html_content, pil_image):
    """Show an HTML fragment and a PIL image side by side in the notebook.

    The image is encoded as PNG in memory and inlined into the markup as a
    base64 ``data:`` URI, so the rendered output references no external file.

    Args:
        html_content: HTML markup placed in the left-hand column. It is
            inserted verbatim (no escaping is applied).
        pil_image: Object exposing ``save(fp, format=...)``; rendered in the
            right-hand column.
    """
    # Serialise the image to PNG bytes without touching the filesystem.
    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format='PNG')
        encoded_png = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
    data_uri = f"data:image/png;base64,{encoded_png}"

    # A flex container keeps the table markup and the image on one row.
    markup = f"""
    <div style="display: flex; align-items: center;">
        <div>{html_content}</div>
        <div><img src="{data_uri}" alt="Table Image"></div>
    </div>
    """

    display(HTML(markup))

# Table detection and formatting setup
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import CroppedTable, TableDetector, AutoTableFormatter, AutoFormatConfig

# Shared, module-level detector: ingest_pdf() calls detector.extract() on
# every page of every document.
detector = TableDetector()
# Formatter configuration. The two flags below are gmft options; presumably
# they turn on spanning-cell handling and multi-row header support
# (NOTE(review): confirm exact semantics against the gmft documentation).
config = AutoFormatConfig()
config.semantic_spanning_cells = True
config.enable_multi_header = True
# Shared formatter built from the config above; the processing loop calls
# formatter.extract() once per detected table.
formatter = AutoTableFormatter(config)

# Function to ingest PDF and extract tables
def ingest_pdf(pdf_path) -> tuple[list[CroppedTable], PyPDFium2Document]:
    """Open a PDF and run table detection on every page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(tables, doc)`` pair: the tables detected across all pages, in
        page order, and the still-open document. The caller is responsible
        for calling ``doc.close()`` once it is done with the tables.

    Raises:
        Exception: Whatever the document loader or the detector raises. If
            detection fails part-way, the document is closed before the
            exception propagates.
    """
    doc = PyPDFium2Document(pdf_path)
    tables = []
    try:
        for page in doc:
            tables += detector.extract(page)
    except Exception:
        # The caller never receives the handle on failure, so release it here
        # rather than leaking the open document.
        doc.close()
        raise
    return tables, doc

# Create a directory to store CSV files if it doesn't exist
output_dir = "csv_outputs"
os.makedirs(output_dir, exist_ok=True)

# Running totals across all papers, used for the summary printed at the end.
_total_detect_time = 0
_total_detect_num = 0
_total_format_time = 0
_total_format_num = 0

# Parallel lists: entry k of each one describes the same table. They are
# consumed together with zip(), so every table must contribute exactly one
# entry to each list (None in `dfs` marks a failed extraction).
results = []
images = []
dfs = []
for paper in ['jpmc-esg-report-2023.pdf', 'Metlife-sustainability-report2023.pdf']:
    start = time.time()
    tables, doc = ingest_pdf('./samples/' + paper)
    try:
        num_pages = len(doc)
        end_detect = time.time()

        # File name without its extension, used as the CSV name prefix.
        paper_stem = os.path.splitext(paper)[0]

        formatted_tables = []
        for i, table in enumerate(tables):
            ft = formatter.extract(table)

            # Best effort: a table that cannot be converted is reported and
            # recorded as None instead of aborting the whole run.
            try:
                df = ft.df()
            except Exception as e:
                print(e)
                df = None

            # Save each DataFrame as a CSV file. A failed write is reported
            # but must not discard the DataFrame or add a second entry to
            # `dfs`, which would misalign it with `images` and `results`.
            if df is not None:
                csv_filename = os.path.join(output_dir, f"{paper_stem}_table_{i}.csv")
                try:
                    df.to_csv(csv_filename, index=False)
                except Exception as e:
                    print(e)

            # Render the image before touching the shared lists so that they
            # only ever grow together.
            image = ft.image()
            dfs.append(df)
            formatted_tables.append(ft)
            images.append(image)
        end_format = time.time()
    finally:
        # Always release the document, even if formatting raised.
        doc.close()
    results += formatted_tables

    print(f"Paper: {paper}\nDetect time: {end_detect - start:.3f}s for {num_pages} pages")
    print(f"Format time: {end_format - end_detect:.3f}s for {len(tables)} tables\n")
    _total_detect_time += end_detect - start
    _total_detect_num += num_pages
    _total_format_time += end_format - end_detect
    _total_format_num += len(tables)

# Guard the averages so a run that found no pages/tables reports NaN instead
# of raising ZeroDivisionError.
_nan = float("nan")
_detect_s_per_page = _total_detect_time / _total_detect_num if _total_detect_num else _nan
_format_s_per_table = _total_format_time / _total_format_num if _total_format_num else _nan
# Overall cost per page is detection time plus formatting *time* (not the
# number of formatted tables) divided by the page count.
_overall_s_per_page = (
    (_total_detect_time + _total_format_time) / _total_detect_num
    if _total_detect_num
    else _nan
)

print(f"Macro: {_detect_s_per_page:.3f} s/page and {_format_s_per_table:.3f} s/table.")
print(f"Total: {_overall_s_per_page} s/page")

# Render every extracted table next to its source image, grouped by document
prev_doc = None
for df, img, ft in zip(dfs, images, results):
    with pd.option_context('display.max_rows', 500, "display.multi_sparse", False):
        current_doc = ft.page.filename
        if current_doc != prev_doc:
            # First table of a new document: emit a separator and a heading.
            prev_doc = current_doc
            display(Markdown('---'))
            display(Markdown(f'### {current_doc}'))

        # A None entry marks a table whose DataFrame could not be produced.
        html = "Failed to extract table" if df is None else df.fillna("").to_html()
        display_html_and_image(html, img)
        display(Markdown('---'))
