
# Module 5 · Session 2 — PDF & Messy Document Extraction Demo

This notebook demonstrates practical PDF data extraction workflows in Python:
- Inspect PDFs and metadata
- Extract text and tables from **text-based** PDFs
- Handle **scanned image** PDFs with OCR
- Package extraction into reusable functions
- Validate and clean outputs


In [1]:
# !sudo apt-get update
# !sudo apt-get install -y default-jre
!java -version

openjdk version "21.0.8" 2025-07-15
OpenJDK Runtime Environment (build 21.0.8+9-Debian-1)
OpenJDK 64-Bit Server VM (build 21.0.8+9-Debian-1, mixed mode, sharing)


## 0) Environment setup

In [None]:

# If running on a fresh environment, uncomment installs as needed.
# %pip install --quiet PyPDF2 pdfplumber tabula-py pytesseract pdf2image pandas pillow poppler-utils tesseract-ocr jpype1

import os, sys, io, json, re, math, shutil, tempfile, pathlib
from pathlib import Path

import pandas as pd

# Optional imports gated by availability
try:
    from PyPDF2 import PdfReader
except Exception as e:
    PdfReader = None

try:
    import pdfplumber
except Exception as e:
    pdfplumber = None

try:
    import tabula  # requires Java installed
except Exception as e:
    tabula = None

# OCR dependencies (optional)
try:
    from pdf2image import convert_from_path
    import pytesseract
    from PIL import Image
except Exception as e:
    convert_from_path = None
    pytesseract = None

print("PyPDF2:", bool(PdfReader))
print("pdfplumber:", bool(pdfplumber))
print("tabula:", bool(tabula))
print("pdf2image:", bool(convert_from_path))
print("pytesseract:", bool(pytesseract))


PyPDF2: True
pdfplumber: True
tabula: True
pdf2image: True
pytesseract: True


## 1) Choose a PDF

In [6]:
PDF_PATH1 = "./data/session2_demo.pdf"  
assert isinstance(PDF_PATH1, str) and len(PDF_PATH1) > 0
print("Target PDF:", PDF_PATH1)
PDF_PATH2 = "./data/pdf_demo_landscape.pdf"  
PDF_PATH3 = "./data/pdf_demo_multi_page.pdf"  
PDF_PATH4 = "./data/pdf_demo_simple_table.pdf"  

Target PDF: ./data/session2_demo.pdf


## 2) Quick inspection with PyPDF2

In [11]:

if PdfReader is None:
    print("PyPDF2 not available. Install it with: %pip install PyPDF2")
else:
    try:
        reader = PdfReader(PDF_PATH3)
        print("Pages:", len(reader.pages))
        meta = reader.metadata or {}
        print("Metadata keys:", list(meta.keys()))
        # Peek at first 500 characters of raw text from page 1 (if any)
        raw_text = reader.pages[1].extract_text() or ""
        print("First page text preview:\n", raw_text[:500])
    except Exception as e:
        print("Failed to open/inspect PDF with PyPDF2:", e)


Pages: 2
Metadata keys: ['/Author', '/CreationDate', '/Creator', '/Keywords', '/ModDate', '/Producer', '/Subject', '/Title', '/Trapped']
First page text preview:
 Financials Q1
Department
Budget
Spent
HR
$100,000
$95,000
IT
$250,000
$200,000
Marketing
$150,000
$175,000



## 3) Text extraction with pdfplumber

In [15]:

if pdfplumber is None:
    print("pdfplumber not available. Install it with: %pip install pdfplumber")
else:
    try:
        with pdfplumber.open(PDF_PATH4) as pdf:
            n_pages = len(pdf.pages)
            print("Pages:", n_pages)
            # Extract text from all pages
            texts = []
            for i, page in enumerate(pdf.pages):
                t = page.extract_text() or ""
                texts.append(t)
                if i == 0:
                    print("First page text preview (pdfplumber):\n", t[:500])
            # Save full text for inspection
            full_text_path = Path("./data/extracted_text.txt")
            full_text_path.write_text("\n\n".join(texts), encoding="utf-8")
            print("Saved all text to", full_text_path.resolve())
    except Exception as e:
        print("pdfplumber failed:", e)


Pages: 1
First page text preview (pdfplumber):
 Simple Sales Report
Item Quantity Price
Widget A 10 $9.99
Widget B 5 $12.50
Widget C 2 $199.00
Saved all text to /workspaces/examples/Module05/data/extracted_text.txt


### 3a) Table extraction with pdfplumber

In [18]:

def extract_tables_pdfplumber(pdf_path, page_indexes=None):
    """Attempt simple table extraction per page using pdfplumber's built-in heuristic."""
    if pdfplumber is None:
        raise RuntimeError("pdfplumber not available")
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        if page_indexes is None:
            page_indexes = range(len(pdf.pages))
        for idx in page_indexes:
            page = pdf.pages[idx]
            table = page.extract_table()
            if table:
                df = pd.DataFrame(table[1:], columns=table[0])
                df["__page__"] = idx + 1
                tables.append(df)
    return tables

if pdfplumber is not None:
    try:
        dfs = extract_tables_pdfplumber(PDF_PATH4)
        if dfs:
            merged = pd.concat(dfs, ignore_index=True)
            print("Extracted tables with pdfplumber:", merged.shape)
        else:
            print("No tables detected by pdfplumber's simple heuristic.")
    except Exception as e:
        print("pdfplumber table extraction failed:", e)


No tables detected by pdfplumber's simple heuristic.


## 4) Table extraction with tabula-py

In [22]:

def extract_tables_tabula(pdf_path, pages="all", multiple_tables=True):
    if tabula is None:
        raise RuntimeError("tabula-py not available or Java not installed")
    # options: stream=True for whitespace detection, lattice=True for ruled tables
    try:
        dfs = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=multiple_tables, guess=True)
        return dfs or []
    except Exception as e:
        print("tabula-py failed:", e)
        return []

if tabula is not None:
    dfs_tab = extract_tables_tabula(PDF_PATH1, pages="all", multiple_tables=True)
    if dfs_tab:
        # Show the first table
        print("tabula-py extracted", len(dfs_tab), "table(s). Showing the first:")
        df0 = dfs_tab[0]
        # Show first few rows for inspection
        display(df0.head())
    else:
        print("tabula-py found no tables. Try setting lattice=True or stream=True via tabula.read_pdf kwargs.")
else:
    print("tabula-py unavailable. Requires Java. Install Java and tabula-py if needed.")


Got stderr: Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:35:17 PM or

tabula-py extracted 1 table(s). Showing the first:


Unnamed: 0,Item,Qty,Price
0,Widget A,10,$9.99
1,Widget B,4,$14.50
2,Widget C,25,"$2,499.00"


## 5) OCR workflow for scanned PDFs (pdf2image + Tesseract)

In [27]:

def ocr_pdf_to_text(pdf_path, dpi=300, max_pages=2):
    if convert_from_path is None or pytesseract is None:
        raise RuntimeError("pdf2image or pytesseract not available")
    pages = convert_from_path(pdf_path, dpi=dpi)
    texts = []
    for i, img in enumerate(pages[:max_pages]):
        # Optional preprocessing could be added here
        txt = pytesseract.image_to_string(img)
        texts.append(txt)
        print(f"OCR page {i+1} preview:\n", txt[:300])
    return "\n\n".join(texts)

# Run OCR only if libraries are available
if convert_from_path is not None and pytesseract is not None:
    try:
        ocr_text = ocr_pdf_to_text(PDF_PATH3, dpi=300, max_pages=2)
        Path("ocr_text_preview.txt").write_text(ocr_text, encoding="utf-8")
        print("Saved OCR preview to ocr_text_preview.txt")
    except Exception as e:
        print("OCR step failed:", e)
else:
    print("OCR libraries unavailable. Install pdf2image and pytesseract and ensure Tesseract is on PATH.")


OCR page 1 preview:
 Quarterly Report

This is a text-heavy page with no tables.

OCR page 2 preview:
 Financials Q1

Department
HR

IT
Marketing

Budget

$100,000
$250,000
$150,000

Spent
$95,000
$200,000
$175,000

Saved OCR preview to ocr_text_preview.txt


## 6) Visualize character bounding boxes (diagnostics)

In [28]:

if pdfplumber is None:
    print("pdfplumber not available for visualization.")
else:
    try:
        with pdfplumber.open(PDF_PATH1) as pdf:
            page = pdf.pages[0]
            chars = page.chars[:50]  # preview first 50 characters
            print("First 5 char boxes:")
            for c in chars[:5]:
                print({k: c[k] for k in ["text", "x0", "top", "x1", "bottom"]})
    except Exception as e:
        print("Failed to read character boxes:", e)


First 5 char boxes:
{'text': 'S', 'x0': 72.0, 'top': 61.03800000000001, 'x1': 79.784, 'bottom': 75.03800000000001}
{'text': 'e', 'x0': 79.784, 'top': 61.03800000000001, 'x1': 86.0, 'bottom': 75.03800000000001}
{'text': 's', 'x0': 86.0, 'top': 61.03800000000001, 'x1': 91.446, 'bottom': 75.03800000000001}
{'text': 's', 'x0': 91.446, 'top': 61.03800000000001, 'x1': 96.892, 'bottom': 75.03800000000001}
{'text': 'i', 'x0': 96.892, 'top': 61.03800000000001, 'x1': 100.78399999999999, 'bottom': 75.03800000000001}


## 7) Robust wrapper functions

In [30]:

def extract_pdf_all(pdf_path, try_tabula=True, try_plumber=True, ocr_if_needed=False):
    """High-level extraction attempt. Returns dict of artifacts."""
    result = {
        "metadata": {},
        "text_plumber": None,
        "tables_plumber": [],
        "tables_tabula": [],
        "ocr_text": None,
        "errors": []
    }
    # Metadata
    if PdfReader is not None:
        try:
            reader = PdfReader(pdf_path)
            result["metadata"] = dict(reader.metadata or {})
        except Exception as e:
            result["errors"].append(f"PyPDF2 metadata: {e}")
    # pdfplumber
    if try_plumber and pdfplumber is not None:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                texts = []
                tables = []
                for i, page in enumerate(pdf.pages):
                    t = page.extract_text() or ""
                    texts.append(t)
                    tb = page.extract_table()
                    if tb:
                        df = pd.DataFrame(tb[1:], columns=tb[0])
                        df["__page__"] = i + 1
                        tables.append(df)
                result["text_plumber"] = "\n\n".join(texts)
                result["tables_plumber"] = tables
        except Exception as e:
            result["errors"].append(f"pdfplumber: {e}")
    # tabula
    if try_tabula and tabula is not None:
        try:
            dfs = tabula.read_pdf(pdf_path, pages="all", multiple_tables=True, guess=True)
            result["tables_tabula"] = dfs or []
        except Exception as e:
            result["errors"].append(f"tabula: {e}")
    # OCR
    if ocr_if_needed and convert_from_path is not None and pytesseract is not None:
        try:
            result["ocr_text"] = ocr_pdf_to_text(pdf_path, dpi=300, max_pages=2)
        except Exception as e:
            result["errors"].append(f"OCR: {e}")
    return result

# Example run (safe; will handle missing deps and missing file gracefully)
try:
    res = extract_pdf_all(PDF_PATH1, try_tabula=True, try_plumber=True, ocr_if_needed=False)
    print("Artifacts:", list(res.keys()))
    print("Errors:", res["errors"])
    if res["tables_plumber"]:
        df_all = pd.concat(res["tables_plumber"], ignore_index=True)
        print("Combined tables extracted with pdfplumber:")
        display(df_all.head())  # show first rows
    elif res["tables_tabula"]:
        df_all = pd.concat(res["tables_tabula"], ignore_index=True)
        print("Combined tables extracted with tabula-py:")
        display(df_all.head())
    else:
        print("No tables extracted.")

except Exception as e:
    print("Wrapper example failed:", e)


Got stderr: Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 18, 2025 2:40:53 PM or

Artifacts: ['metadata', 'text_plumber', 'tables_plumber', 'tables_tabula', 'ocr_text', 'errors']
Errors: []
Combined tables extracted with tabula-py:


Unnamed: 0,Item,Qty,Price
0,Widget A,10,$9.99
1,Widget B,4,$14.50
2,Widget C,25,"$2,499.00"


## 8) Validate and clean extracted data

In [31]:

# Example: normalize numbers and dates in a DataFrame
import pandas as pd
import numpy as np

def clean_numeric_series(s):
    return (s.astype(str)
              .str.replace(r"[^0-9.\-]", "", regex=True)
              .replace({"": np.nan})
              .astype(float))

# Demo with a synthetic DataFrame if no tables were found
demo = pd.DataFrame({
    "Amount": ["$1,234.56", "€789.00", "1 000,00", "N/A"],
    "Date": ["2025-09-01", "09/02/2025", "Sept 03, 2025", "—"]
})
demo["Amount_clean"] = clean_numeric_series(demo["Amount"])
print(demo)


      Amount           Date  Amount_clean
0  $1,234.56     2025-09-01       1234.56
1    €789.00     09/02/2025        789.00
2   1 000,00  Sept 03, 2025     100000.00
3        N/A              —           NaN


## 9) Export artifacts

In [32]:

def export_tables_to_csv(dfs, stem="tables"):
    out_paths = []
    for i, df in enumerate(dfs):
        p = Path(f"{stem}_{i+1}.csv")
        df.to_csv(p, index=False)
        out_paths.append(str(p.resolve()))
    return out_paths

# Example usage if you extracted tables with tabula or pdfplumber:
# paths = export_tables_to_csv(res["tables_tabula"] or res["tables_plumber"], stem="extracted")
# print(paths)



## 10) Troubleshooting

- **No text extracted** but the PDF looks like a scan → enable the OCR section and install Tesseract.
- **Tables not detected** → try `tabula` with `lattice=True` for ruled tables or `stream=True` for whitespace tables.
- **Java error** with `tabula` → install a JRE and ensure `java` is on PATH.
- **Rotated pages** → pre-rotate with PyPDF2 or use page rotation options before extraction.
