In [1]:
from pathlib import Path
# Project root: two levels above the directory the notebook kernel runs in.
ROOT_DIR = Path().resolve().parent.parent

# Source report to extract; the run of spaces in the file name is part of the
# real name on disk and must be kept as-is.
pdf_path = ROOT_DIR.joinpath(
    'data',
    'pure_pdfs',
    "PO 166939 - 204865    Summary and Detail Report.pdf",
)

In [2]:
import pdfplumber
import os
import pandas as pd
from tqdm.auto import tqdm

def extract_text_and_tables(pdf_path, output_folder, start_page_index=1):
    """Extract page text and tables from a PDF and save them under ``output_folder``.

    The text of every processed page is written to ``extracted_text.txt``,
    each page preceded by a ``--- Page N ---`` marker (N is 1-based). All
    tables found are stacked into one DataFrame and written to
    ``extracted_tables.csv``; that file is only written when at least one
    table was found.

    Args:
        pdf_path: Path of the PDF to read (anything ``pdfplumber.open`` accepts).
        output_folder: Directory for the output files; created if missing.
        start_page_index: 0-based index of the first page to process. The
            default of 1 skips the first page of the document, which is what
            this function has always done (presumably to skip a summary/cover
            page -- TODO confirm). Pass 0 to process every page.

    Raises:
        ValueError: If ``start_page_index`` is negative.
    """
    if start_page_index < 0:
        raise ValueError("start_page_index must be >= 0")

    # exist_ok=True replaces the racy "check, then create" sequence.
    os.makedirs(output_folder, exist_ok=True)

    text_output_path = os.path.join(output_folder, "extracted_text.txt")
    table_output_path = os.path.join(output_folder, "extracted_tables.csv")

    all_text = []
    all_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[start_page_index:]
        progress = tqdm(pages, desc="Processing Pages", unit="page")
        # Number pages from their real (1-based) position in the document so
        # the markers stay correct even when leading pages are skipped.
        for page_label, page in enumerate(progress, start=start_page_index + 1):
            # Extract text; extract_text() may yield nothing for a page, in
            # which case only the page marker is recorded.
            text = page.extract_text()
            all_text.append(f"\n--- Page {page_label} ---\n" + (text if text else ""))

            # Extract tables: each table becomes its own DataFrame.
            all_tables.extend(pd.DataFrame(table) for table in page.extract_tables())

    # Save extracted text
    with open(text_output_path, "w", encoding="utf-8") as text_file:
        text_file.write("\n".join(all_text))

    # Save extracted tables (pd.concat raises on an empty list, hence the guard)
    if all_tables:
        final_df = pd.concat(all_tables, ignore_index=True)
        final_df.to_csv(table_output_path, index=False)

    print(f"Text saved to: {text_output_path}")

# Run the extraction; results are written to ./extracted_data relative to the
# kernel's working directory.
extract_text_and_tables(pdf_path, output_folder="extracted_data")

Processing Pages:   0%|          | 0/14 [00:00<?, ?page/s]

Text saved to: extracted_data\extracted_text.txt


In [5]:
# Dump the extracted text so the result can be inspected directly in the notebook.
extracted_text_file = "extracted_data/extracted_text.txt"
with open(extracted_text_file, mode="r", encoding="utf-8") as f:
    extracted_text = f.read()
    print(extracted_text)


--- Page 2 ---
Intake Pallet QC Inspection Report
Customer DPS PO 166939 / ISS PO 204865
Supplier Code : GUIM€
Supplier : Guimera Fruits
COO : Spain
Vehicle No : Received : 03/09/2024 09:59:25
Vessel : Inspection Date : 04/09/2024 06:35:59 Print date : 04/09/2024
Apricots 20x320g Punnet
ISS Pallet ID : 7761770 Freshness Technology :
Supplier Pallet ID : 5808239 Punnet / Pad Type : N/A /
Customer Pallet ID : 5808239 Outer :
Variety : Fardao Brand : CORE
Grower : / Organic? : NO Does Pallet Meet Spec? : YES
GGN : / PLU? :
BLUE Expected Qty 80
Orchard/Farm : End Customer : Tesco
Received Qty 80
Harvest Date : 30/08/2024 DP : 7
Total Defects : 5.00%
Size/Calibre : 40/45 Packhouse :
Estimated Yield : 100%
Lot Number : 508164 Inspector : Hanna.Dziuba
Minor : Dry Splits: 1.67%
Minor : Puncture: 1.67%
Major : Scarring: 1.67%
Defects Tot : 0.00% Defects Fruit Total : 0 Packs With Defects : 0.00%
Waste Tot : 0.00% Waste Fruit Total : 0 Packs With Waste : 0%
Minor Defects Tot : 3.33% Minor Fruit