In [None]:
from pdf2image import convert_from_path
import pytesseract
import pdfplumber
import pandas as pd;
from PIL import Image

In [3]:
# PDFs to process, keyed by file path. Each value lists the 1-based page
# numbers of that file to scan for tables.
pdfs = {
    "cardio_structured.pdf": [6],
    "prot_sap_102.pdf": [50],
    "prot_sap_1.pdf": [14],
}

In [4]:
def pdf_to_img(pdf_path,pages):
    """Render the requested pages of a PDF to images.

    Args:
        pdf_path: Path of the PDF file to render.
        pages: Iterable of 1-based page numbers. They may be unsorted or
            non-contiguous.

    Returns:
        dict mapping each requested page number to its rendered image.
        An empty ``pages`` yields an empty dict without touching the file.
    """
    pages = list(pages)
    if not pages:
        # min()/max() would raise on an empty sequence; nothing to render.
        return {}

    first, last = min(pages), max(pages)
    # pdf2image's keywords are ``first_page``/``last_page``; it renders every
    # page in the inclusive range, so the result list starts at ``first``.
    rendered = convert_from_path(pdf_path, first_page=first, last_page=last)

    # Index by offset from the first rendered page, not by position in
    # ``pages``; otherwise gaps in ``pages`` pick the wrong image.
    return {page: rendered[page - first] for page in pages}

In [6]:
def extract_txt_from_image(img):
    """Run Tesseract OCR on ``img`` and return the recognised text.

    Page segmentation mode 6 is used because it treats the image as a
    single uniform block of text.
    """
    tesseract_options = "--psm 6"
    ocr_text = pytesseract.image_to_string(img, config=tesseract_options)
    return ocr_text

In [7]:
def extract_tables_from_pdf(pdf_path,pages):
    """Collect every table pdfplumber detects on the given pages.

    Args:
        pdf_path: Path of the PDF file to open.
        pages: Iterable of 1-based page numbers to inspect.

    Returns:
        A flat list with the tables of all requested pages, in the order
        the pages were given. Each table is whatever
        ``page.extract_tables()`` returns for it.

    Raises:
        IndexError: If a page number is below 1 or beyond the last page.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        for page_num in pages:
            # Without this check, 0 or a negative number would silently wrap
            # around (``pages[-1]`` is the last page) and return the wrong
            # page's tables.
            if not 1 <= page_num <= page_count:
                raise IndexError(
                    f"page {page_num} is out of range for {pdf_path} "
                    f"(valid pages: 1-{page_count})"
                )
            page = pdf.pages[page_num - 1]
            tables.extend(page.extract_tables())
    return tables

In [8]:
def tables_to_dataframe(tables):
    """Turn each non-empty table into a ``pandas.DataFrame``.

    Falsy entries (for example ``None`` or an empty list) are skipped, so
    the result may be shorter than ``tables``.
    """
    frames = []
    for rows in tables:
        if not rows:
            continue
        frames.append(pd.DataFrame(rows))
    return frames

In [9]:
def save_to_excel(df_list,output_file="extracted_tables.xlsx"):
    """Write each DataFrame in ``df_list`` to its own sheet of one workbook.

    Sheets are named ``Table_1``, ``Table_2``, ... in list order, and are
    written without the index or a header row. A confirmation line is
    printed once the workbook has been written.
    """
    with pd.ExcelWriter(output_file, engine="openpyxl") as workbook:
        for sheet_no, frame in enumerate(df_list, start=1):
            frame.to_excel(
                workbook,
                sheet_name=f"Table_{sheet_no}",
                index=False,
                header=False,
            )
    print(f"SAVED TO {output_file}")

In [10]:
def process_pdf(pdf_files, output_file="extracted_tables.xlsx"):
    """Extract tables from several PDFs and save them to one Excel workbook.

    For each PDF, tables are first read with ``extract_tables_from_pdf``.
    If none are found, the requested pages are rendered to images and
    OCR'd; each page's recognised text then becomes a one-column table
    with one row per non-blank line.

    Args:
        pdf_files: Mapping of PDF path to the list of page numbers to
            process in that file.
        output_file: Path of the workbook to write. Defaults to
            ``"extracted_tables.xlsx"``.

    Returns:
        None. Progress is printed, and the workbook is written only when
        at least one table was extracted.
    """
    all_dfs = []
    for pdf_path, pages in pdf_files.items():
        print(f"\nProcessing:{pdf_path}(pages:{pages})")
        tables = extract_tables_from_pdf(pdf_path, pages)
        if not tables:
            print(f"NO TABLES DETECTED IN {pdf_path}.")
            images = pdf_to_img(pdf_path, pages)
            extracted_text = [extract_txt_from_image(images[page]) for page in pages]
            # splitlines() also splits on the form-feed page separator that
            # Tesseract appends; left in a cell, that control character is
            # rejected by openpyxl on save. Blank lines are dropped so they
            # do not become empty rows.
            tables = [
                [line for line in text.splitlines() if line.strip()]
                for text in extracted_text
            ]

        all_dfs.extend(tables_to_dataframe(tables))

    if all_dfs:
        save_to_excel(all_dfs, output_file)
    else:
        print("no tables extracted.")

In [11]:
# Run the extraction over every PDF configured in `pdfs`; when at least one
# table is found, the results are written to the default workbook.
process_pdf(pdfs)


Processing:cardio_structured.pdf(pages:[6])

Processing:prot_sap_102.pdf(pages:[50])

Processing:prot_sap_1.pdf(pages:[14])
SAVED TO extracted_tables.xlsx
