# Tips for Processing Complex-Structured, Multi-Page Tables in PDFs

In [1]:
import pdfplumber
from langchain.document_loaders import PyPDFLoader
import json
import os
from langchain_core.messages import HumanMessage
from langchain.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [2]:
# Azure OpenAI connection settings, read from environment variables.
# Each constant is None when its variable is not set.

AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_VERSION = os.getenv("AZURE_OPENAI_VERSION")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

In [3]:
# Initialise the Azure OpenAI chat model (another chat model could be swapped in here).
# NOTE(review): AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT are not passed
# explicitly; presumably the client reads them from the environment - confirm.

oai = AzureChatOpenAI(
    azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
    openai_api_version=AZURE_OPENAI_VERSION,
)

## Extract pages with content

In [4]:
# Paths to the two sample PDFs with complex tables used throughout this notebook.
# Both are relative paths, so they resolve against the current working directory.

shopify_report_path = "../data/Shopify_Q1_2023_Press_Release.pdf"
oregon_uni_path = "../data/university_oregon_accessible_table.pdf"

In [5]:
def load_pdf_file(file_path):
    """Load a PDF with ``PyPDFLoader`` and report how many pages were read.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The value returned by ``PyPDFLoader(file_path).load()``, or ``None``
        when any exception is raised while loading (the error is printed,
        not re-raised).
    """
    try:
        document_pages = PyPDFLoader(file_path).load()
        print("DOCUMENT LOADED SUCCESSFULLY. \nTOTAL PAGES:", len(document_pages))
    except Exception as exc:
        # Best-effort loading: report the problem and signal failure with None.
        print(f"An error occurred while loading the PDF: {exc}")
        return None
    return document_pages


# Load both sample PDFs; each result is None if loading failed.
shopify_pages = load_pdf_file(shopify_report_path)
oregon_pages = load_pdf_file(oregon_uni_path)

DOCUMENT LOADED SUCCESSFULLY. 
TOTAL PAGES: 14
DOCUMENT LOADED SUCCESSFULLY. 
TOTAL PAGES: 1


## Extract tables for each page

In [6]:
def extract_tables_from_pdf(file_path):
    """Collect the tables pdfplumber extracts from every page of a PDF.

    Args:
        file_path: Path of the PDF file opened with ``pdfplumber.open``.

    Returns:
        A dict keyed by zero-based page index. Each value is the result of
        ``page.extract_tables()`` for that page, or ``None`` when that result
        is empty. Every page gets an entry, so the printed item count equals
        the number of pages rather than the number of tables.
    """
    with pdfplumber.open(file_path) as pdf_doc:
        # Falsy extraction results are normalised to None.
        page_tables = {
            page_idx: (pdf_page.extract_tables() or None)
            for page_idx, pdf_page in enumerate(pdf_doc.pages)
        }

    print(f"TABLES SUCCESSFULLY EXTRACTED! \nTOTAL ITEMS EXTRACTED: {len(page_tables)}")
    return page_tables

# Extract the per-page tables of both sample PDFs.
shopify_tables = extract_tables_from_pdf(file_path=shopify_report_path)
oregon_tables = extract_tables_from_pdf(file_path=oregon_uni_path)

TABLES SUCCESSFULLY EXTRACTED! 
TOTAL ITEMS EXTRACTED: 14
TABLES SUCCESSFULLY EXTRACTED! 
TOTAL ITEMS EXTRACTED: 1


## Add tables as metadata

In [7]:
# add tables as metadata to the pages

def add_tables_to_documents(documents, tables):
    """Attach each page's table data to the matching document's metadata.

    Args:
        documents: Iterable of objects exposing a ``metadata`` dict that
            contains a ``'page'`` key.
        tables: Mapping from page number to table data; queried with ``.get``.

    Returns:
        The same ``documents`` object that was passed in. Each document's
        metadata is modified in place: ``metadata['tables']`` holds the table
        data for its page, or ``None`` when that data is missing or empty.
    """
    for document in documents:
        page_tables = tables.get(document.metadata['page'])
        # Missing or falsy table data is stored as None.
        document.metadata['tables'] = page_tables or None
    return documents

# Enrich both document sets with the tables extracted for each page.
oregon_docs = add_tables_to_documents(oregon_pages, oregon_tables)
shopify_docs = add_tables_to_documents(shopify_pages, shopify_tables)

# Notebook display: inspect the metadata of the Shopify document at index 6.
shopify_docs[6].metadata

{'source': '../data/Shopify_Q1_2023_Press_Release.pdf',
 'page': 6,
 'tables': [[['Revenues', '', None, ''],
   ['Subscription solutions', '382', None, '345'],
   ['Merchant solutions', '1,126', None, '859'],
   ['', '1,508', None, '1,204'],
   ['Cost of revenues', '', None, ''],
   ['Subscription solutions', '84', None, '78'],
   ['Merchant solutions', '707', None, '488'],
   ['', '791', None, '566'],
   ['Gross profit', '717', None, '638'],
   ['Operating expenses', '', None, ''],
   ['Sales and marketing', '287', None, '303'],
   ['Research and development', '458', None, '304'],
   ['General and administrative', '123', None, '109'],
   ['Transaction and loan losses', '42', None, '20'],
   ['Total operating expenses', '910', '', '736'],
   ['Loss from operations', None, None, None],
   ['', '', None, None],
   ['Other income (expense), net', None, None, None],
   ['Income (loss) before income taxes', '76', '', '(1,653)'],
   ['(Provision for) recovery of income taxes', '(8)', '', '17

In [8]:
# Description of the custom ``tables`` metadata attribute, built with
# AttributeInfo (imported from the self-query constructor module).
metadata_field_info = [
    AttributeInfo(
        name="tables",
        type="list",
        description="A table which exists on the given page in the document",
    ),
]

## Query using Self-Querying Retriever

In [9]:
# ToDo: add code to retrieve & filter by metadata + system prompt

In [10]:
# Design the system prompt.
# NOTE(review): the template text below is still a placeholder and contains no
# `{context}` or `{table}` placeholders, although both are declared as input
# variables - the declared inputs presumably never reach the model until the
# template is written; confirm when filling in the prompt.

system_prompt = PromptTemplate(
    template="""
        ToDo: Here will be a system prompt template!

    """,
    input_variables=["context", "table"],
)

# Chain: prompt template -> chat model -> plain-string output parser.
rag_chain = system_prompt | oai | StrOutputParser()