In [5]:
import fitz               # PyMuPDF
import re
import os
import camelot           # pip install camelot-py[cv]
import pandas as pd

# Source PDF. The directory is spelled "TBI_documents" (lower-case "d") to match
# the other extraction cells in this notebook; the former "TBI_Documents"
# spelling only resolves on case-insensitive filesystems.
PDF_PATH = "TBI_documents/CIGN, 2009.pdf"
# Directory that receives the extracted text file, images and table CSVs.
OUTPUT_DIR = "CIGN"

# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text(doc):
    """Return the cleaned text of every page in ``doc`` as one string.

    ``doc`` is iterated page by page; each page must provide
    ``get_text("text")``. Per page, words split by a hyphen at a line end are
    re-joined and runs of two or more newlines are collapsed to exactly two.
    Pages are separated by a blank line in the result.
    """
    def _tidy(raw):
        # "exam-\nple" -> "example"
        rejoined = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw)
        # Squeeze any longer run of line breaks down to one blank line
        return re.sub(r'\n{2,}', '\n\n', rejoined)

    return "\n\n".join(_tidy(page.get_text("text")) for page in doc)

def save_text(text, out_path):
    """Write ``text`` to ``out_path`` as UTF-8, replacing any existing file."""
    with open(out_path, mode="w", encoding="utf-8") as handle:
        handle.write(text)

def extract_images(doc, out_dir=None):
    """Save every distinct embedded image of ``doc`` with a sequential filename.

    Args:
        doc: Open PyMuPDF document (any object providing ``len()``,
            ``get_page_images(page_index)`` and ``extract_image(xref)``).
        out_dir: Directory that receives the files. Defaults to the
            module-level ``OUTPUT_DIR``, so ``extract_images(doc)`` keeps
            writing to the same place as before.

    Side effects:
        Writes ``sign_img<N>.<ext>`` files (N starts at 1) into the target
        directory and prints how many images were written.
    """
    target_dir = OUTPUT_DIR if out_dir is None else out_dir
    seen_xrefs = set()
    img_count = 0
    for page_index in range(len(doc)):
        for img in doc.get_page_images(page_index):
            xref = img[0]
            # The same image object (e.g. a logo) can be referenced from many
            # pages; write it once instead of once per reference.
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base_image = doc.extract_image(xref)
            if not base_image:
                # Nothing extractable behind this xref; skip it.
                continue
            img_count += 1
            img_filename = f"sign_img{img_count}.{base_image['ext']}"
            with open(os.path.join(target_dir, img_filename), "wb") as img_file:
                img_file.write(base_image["image"])
    print(f"Extracted {img_count} images.")

def extract_tables(pdf_path):
    """Detect tables in ``pdf_path`` with Camelot and write one CSV per table.

    The 'lattice' flavor (tables drawn with ruling lines) is tried first; only
    when it finds nothing is the 'stream' flavor run. CSVs are written to
    ``OUTPUT_DIR`` as ``sign_table<N>.csv`` with N starting at 1.
    """
    def _dump(found):
        # One CSV per detected table, numbered in detection order.
        for number, tbl in enumerate(found, start=1):
            destination = os.path.join(OUTPUT_DIR, f"sign_table{number}.csv")
            tbl.df.to_csv(destination, index=False)

    lattice_tables = camelot.read_pdf(pdf_path, pages="all", flavor='lattice')
    print(f"Found {len(lattice_tables)} tables.")
    _dump(lattice_tables)
    if len(lattice_tables) != 0:
        return

    # Lattice found nothing: fall back to whitespace-based detection.
    stream_tables = camelot.read_pdf(pdf_path, pages="all", flavor='stream')
    print(f"Stream mode found {len(stream_tables)} tables.")
    _dump(stream_tables)

def main():
    """Run text, image and table extraction for ``PDF_PATH``."""
    # The context manager closes the PDF handle even if a step below raises.
    with fitz.open(PDF_PATH) as doc:
        print(f"Opened '{PDF_PATH}' with {len(doc)} pages.")

        # 1. Extract and save text
        text = extract_text(doc)
        save_text(text, os.path.join(OUTPUT_DIR, "sign_text.txt"))
        print("Text extraction complete.")

        # 2. Extract and save images
        extract_images(doc)

    # 3. Extract and save tables (works from the path, not the open document)
    extract_tables(PDF_PATH)

if __name__ == "__main__":
    main()


Opened 'TBI_documents/CIGN, 2009.pdf' (57 pages).
→ Cleaned text saved (starting from Summary of Recommendations).
→ Extracted 0 images into cign_output/
→ Detected 10 tables with flavor='lattice'.


In [7]:
import fitz               # pip install PyMuPDF
import os
import re
import camelot           # pip install camelot-py[cv]

PDF_PATH   = "TBI_documents/ACS, 2024.pdf"
OUTPUT_DIR = "acs_output"
SECT_DIR   = os.path.join(OUTPUT_DIR, "sections")
IMG_DIR    = os.path.join(OUTPUT_DIR, "figures")
TABLE_DIR  = os.path.join(OUTPUT_DIR, "tables")

# Create output dirs (makedirs also creates OUTPUT_DIR, their common parent)
for d in (SECT_DIR, IMG_DIR, TABLE_DIR):
    os.makedirs(d, exist_ok=True)

# Open the document; the context manager releases the file handle afterwards
with fitz.open(PDF_PATH) as doc:
    # 1) Split out each TOC entry as a separate text file
    #    PyMuPDF's get_toc() gives [level, title, page_number] (1-based pages)
    toc = doc.get_toc()
    if not toc:
        print("⚠️ PDF has no table of contents; no sections can be split out.")

    # Keep only entries that point at a real page (unresolved targets are
    # reported with page -1) and convert to 0-based page indices.
    entries = [
        (title, page_num - 1)
        for _level, title, page_num in toc
        if 1 <= page_num <= doc.page_count
    ]

    sections = []
    for idx, (title, start) in enumerate(entries):
        if idx + 1 < len(entries):
            # End on the page before the next entry, but never before `start`:
            # when the next entry begins on the same page the section still
            # covers that page instead of becoming an empty range.
            end = max(start, entries[idx + 1][1] - 1)
        else:
            end = doc.page_count - 1
        sections.append((title, start, end))

    used_names = set()
    for title, start, end in sections:
        # Collect text for this section
        text_chunks = [doc[p].get_text("text") for p in range(start, end + 1)]
        all_text = "\n".join(text_chunks)
        # Sanitize filename; fall back to a fixed stem if nothing survives
        stem = re.sub(r"[^\w]+", "_", title).strip("_") or "section"
        fname = stem + ".txt"
        # Repeated titles get a numeric suffix instead of overwriting a file
        suffix = 2
        while fname.lower() in used_names:
            fname = f"{stem}_{suffix}.txt"
            suffix += 1
        used_names.add(fname.lower())
        with open(os.path.join(SECT_DIR, fname), "w", encoding="utf-8") as f:
            f.write(all_text)

    print(f"✅ Extracted {len(sections)} sections to {SECT_DIR}/")

    # 2) Extract all images/figures
    img_count = 0
    seen_xrefs = set()
    for page_index in range(doc.page_count):
        for img in doc.get_page_images(page_index):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base_image = doc.extract_image(xref)
            if not base_image:
                continue
            img_bytes = base_image["image"]
            ext = base_image["ext"]
            img_count += 1
            out_path = os.path.join(IMG_DIR, f"figure_{img_count}.{ext}")
            with open(out_path, "wb") as img_file:
                img_file.write(img_bytes)

    print(f"✅ Extracted {img_count} figures to {IMG_DIR}/")

# 3) Extract tables with Camelot
#    Try 'lattice' first (ruling-line tables), fall back to 'stream'
tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="lattice")
if not tables:
    tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="stream")

for i, table in enumerate(tables, start=1):
    csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
    table.to_csv(csv_path, index=False)

print(f"✅ Extracted {len(tables)} tables to {TABLE_DIR}/")


✅ Extracted 0 sections to acs_output/sections/
✅ Extracted 9 figures to acs_output/figures/


KeyboardInterrupt: 

In [8]:
import fitz    # PyMuPDF (pip install PyMuPDF)
import os

# PDF to read for the two-column extraction.
PDF_PATH   = "TBI_documents/ACS, 2024.pdf"
# Single text file that receives the column-ordered text of every page.
OUTPUT_TXT = "acs_output/acs_two_column.txt"

def extract_two_column_text(pdf_path, out_path):
    """Write the text of a two-column PDF to ``out_path`` in reading order.

    For every page, text blocks are assigned to the left or right half of the
    page by the x-coordinate of their left edge, ordered top-to-bottom within
    each half, and emitted left column first. Blocks 20 units wide or narrower
    (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.
        out_path: Text file to write (UTF-8). Its parent directory is created
            when missing, so the function also works on a fresh checkout.
    """
    def sorted_text(blocks):
        # sort top→bottom by y0, then left→right by x0
        ordered = sorted(blocks, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # each block may have multiple lines; a line is the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for b in ordered
            for line in b["lines"]
        ]

    out_lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for pagenum in range(doc.page_count):
            page = doc[pagenum]
            width = page.rect.width
            # keep only text blocks (type 0) wider than a small threshold
            text_blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and b["bbox"][2] - b["bbox"][0] > 20
            ]

            # split into left/right halves by x midpoint of the page
            left_blocks  = [b for b in text_blocks if b["bbox"][0] < width/2]
            right_blocks = [b for b in text_blocks if b["bbox"][0] >= width/2]

            # extract in reading order: left column then right column
            out_lines.append(f"--- Page {pagenum+1} (LEFT) ---")
            out_lines += sorted_text(left_blocks)
            out_lines.append(f"--- Page {pagenum+1} (RIGHT) ---")
            out_lines += sorted_text(right_blocks)
            out_lines.append("\n")

    # make sure the destination directory exists before writing
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # write everything to one file
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(out_lines))

if __name__ == "__main__":
    # Run the extraction, then report where the text was written.
    extract_two_column_text(pdf_path=PDF_PATH, out_path=OUTPUT_TXT)
    print(f"Two-column text extracted to {OUTPUT_TXT}")


Two-column text extracted to acs_output/acs_two_column.txt


In [9]:
import fitz               # PyMuPDF
import os
import re
import camelot            # pip install camelot-py[cv]

# Input PDF and output layout for the BTF 2017 document
PDF_PATH = "TBI_documents/BTF, 2017.pdf"
OUTPUT_DIR = "btf_output"
TEXT_PATH, IMG_DIR, TABLE_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("btf_text.txt", "images", "tables")
)

# Make sure every output directory exists (safe to re-run)
for d in [OUTPUT_DIR, IMG_DIR, TABLE_DIR]:
    os.makedirs(d, exist_ok=True)

def extract_two_column_text(pdf_path):
    """
    Extract the text of a two-column PDF in reading order.

    For every page, text blocks are assigned to the left or right half of the
    page by the x-coordinate of their left edge, ordered top-to-bottom within
    each half, and emitted left column first. Blocks 20 units wide or narrower
    (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One newline-joined string; each column's lines are preceded by a
        ``--- Page N (LEFT) ---`` or ``--- Page N (RIGHT) ---`` marker.
    """
    def sort_and_extract(blocks):
        # Sort by vertical then horizontal position
        ordered = sorted(blocks, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # One output line per layout line: the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for b in ordered
            for line in b["lines"]
        ]

    out_lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for p in range(doc.page_count):
            page = doc[p]
            width = page.rect.width
            # Text blocks only (type 0), wider than a small threshold
            blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and (b["bbox"][2] - b["bbox"][0]) > 20
            ]

            # Split blocks into left and right based on the midpoint
            left_blocks  = [b for b in blocks if b["bbox"][0] < width/2]
            right_blocks = [b for b in blocks if b["bbox"][0] >= width/2]

            out_lines.append(f"--- Page {p+1} (LEFT) ---")
            out_lines.extend(sort_and_extract(left_blocks))
            out_lines.append(f"--- Page {p+1} (RIGHT) ---")
            out_lines.extend(sort_and_extract(right_blocks))
            out_lines.append("\n")

    return "\n".join(out_lines)

# 1) Extract and save text
raw_text = extract_two_column_text(PDF_PATH)
# Re-join words hyphenated across a line break ("exam-\nple" -> "example")
clean_text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw_text)
with open(TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(clean_text)
print(f"Extracted text to {TEXT_PATH}")

# 2) Extract and save images
img_count = 0
seen_xrefs = set()
# The context manager closes the PDF handle once the images are written
with fitz.open(PDF_PATH) as doc:
    for p in range(doc.page_count):
        for img in doc.get_page_images(p):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base_image = doc.extract_image(xref)
            if not base_image:
                continue
            img_bytes = base_image["image"]
            ext = base_image["ext"]
            img_count += 1
            img_name = f"figure_{img_count}.{ext}"
            with open(os.path.join(IMG_DIR, img_name), "wb") as img_file:
                img_file.write(img_bytes)
print(f"Extracted {img_count} images to {IMG_DIR}")

# 3) Extract and save tables
# Try lattice (ruling lines) first; fall back to stream when it finds nothing
tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="lattice")
if not tables:
    tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="stream")
for i, table in enumerate(tables, start=1):
    csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
    table.to_csv(csv_path, index=False)
print(f"Extracted {len(tables)} tables to {TABLE_DIR}")


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)


Extracted text to btf_output/btf_text.txt
Extracted 0 images to btf_output/images


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)


Extracted 5 tables to btf_output/tables


In [10]:
import fitz               # PyMuPDF (pip install PyMuPDF)
import os
import re
import camelot            # pip install camelot-py[cv] pandas

# Input PDF and output layout for the CMA 2007 document
PDF_PATH = "TBI_documents/CMA, 2007.pdf"
OUTPUT_DIR = "cma_output"
TEXT_PATH, FIG_DIR, TABLE_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("cma_text.txt", "figures", "tables")
)

# Make sure every output directory exists (safe to re-run)
for d in [OUTPUT_DIR, FIG_DIR, TABLE_DIR]:
    os.makedirs(d, exist_ok=True)

def extract_three_column_text(pdf_path):
    """
    Extract the text of a three-column PDF in reading order.

    For every page, text blocks are assigned to the left, middle or right
    third of the page by the x-coordinate of their left edge, ordered
    top-to-bottom within each third, and emitted left → middle → right.
    Blocks 20 units wide or narrower (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One newline-joined string; each column's lines are preceded by a
        ``--- Page N (LEFT|MIDDLE|RIGHT) ---`` marker.
    """
    def sort_and_extract(block_list):
        # Sort by vertical (y0) then horizontal (x0)
        ordered = sorted(block_list, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # One output line per layout line: the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for b in ordered
            for line in b["lines"]
        ]

    lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for p in range(doc.page_count):
            page = doc[p]
            width = page.rect.width
            col_width = width / 3.0

            # Text blocks only (type 0), wider than a small threshold
            blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and (b["bbox"][2] - b["bbox"][0]) > 20
            ]

            # Split into left, middle, right columns by the block's left edge
            left   = [b for b in blocks if b["bbox"][0] < col_width]
            middle = [b for b in blocks if col_width <= b["bbox"][0] < 2*col_width]
            right  = [b for b in blocks if b["bbox"][0] >= 2*col_width]

            # Append in reading order: left → middle → right
            lines.append(f"--- Page {p+1} (LEFT) ---")
            lines.extend(sort_and_extract(left))
            lines.append(f"--- Page {p+1} (MIDDLE) ---")
            lines.extend(sort_and_extract(middle))
            lines.append(f"--- Page {p+1} (RIGHT) ---")
            lines.extend(sort_and_extract(right))
            lines.append("\n")

    return "\n".join(lines)

# 1) Extract and save text
raw = extract_three_column_text(PDF_PATH)
# Fix hyphenation across line breaks ("exam-\nple" -> "example")
clean = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw)
with open(TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(clean)
print(f"Text extracted to {TEXT_PATH}")

# 2) Extract and save figures/images
img_count = 0
seen_xrefs = set()
# The context manager closes the PDF handle once the images are written
with fitz.open(PDF_PATH) as doc:
    for p in range(doc.page_count):
        for img in doc.get_page_images(p):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base = doc.extract_image(xref)
            if not base:
                continue
            img_bytes = base["image"]
            ext = base["ext"]
            img_count += 1
            fname = f"figure_{img_count}.{ext}"
            with open(os.path.join(FIG_DIR, fname), "wb") as img_file:
                img_file.write(img_bytes)
print(f"{img_count} images extracted to {FIG_DIR}")

# 3) Extract and save tables
# Try lattice (ruling lines) first then stream
tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="lattice")
if not tables:
    tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="stream")

for i, tbl in enumerate(tables, start=1):
    csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
    tbl.df.to_csv(csv_path, index=False)
print(f"{len(tables)} tables extracted to {TABLE_DIR}")


Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


Text extracted to cma_output/cma_text.txt
1 images extracted to cma_output/figures


Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)


1 tables extracted to cma_output/tables


In [1]:
import fitz               # PyMuPDF (pip install PyMuPDF)
import os
import re
import camelot            # pip install camelot-py[cv] pandas

# Input PDF and output layout for the NIHCE 2014 document
PDF_PATH = "TBI_documents/NIHCE, 2014.pdf"
OUTPUT_DIR = "nhice_output"
TEXT_PATH, FIG_DIR, TABLE_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("nhice_text.txt", "figures", "tables")
)

# Make sure every output directory exists (safe to re-run)
for d in [OUTPUT_DIR, FIG_DIR, TABLE_DIR]:
    os.makedirs(d, exist_ok=True)

def extract_two_column_text(pdf_path):
    """
    Extract the text of a two-column PDF in reading order.

    For every page, text blocks are assigned to the left or right half of the
    page by the x-coordinate of their left edge, ordered top-to-bottom within
    each half, and emitted left column first. Blocks 20 units wide or narrower
    (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One newline-joined string; each column's lines are preceded by a
        ``--- Page N (LEFT) ---`` or ``--- Page N (RIGHT) ---`` marker.
    """
    def sort_and_extract(block_list):
        # Sort by vertical then horizontal position
        ordered = sorted(block_list, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # One output line per layout line: the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for b in ordered
            for line in b["lines"]
        ]

    lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for p in range(doc.page_count):
            page = doc[p]
            width = page.rect.width
            # Text blocks only (type 0), wider than a small threshold
            blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and (b["bbox"][2] - b["bbox"][0]) > 20
            ]

            # Split into left and right based on midpoint
            left   = [b for b in blocks if b["bbox"][0] < width/2]
            right  = [b for b in blocks if b["bbox"][0] >= width/2]

            # Append in reading order: left → right
            lines.append(f"--- Page {p+1} (LEFT) ---")
            lines.extend(sort_and_extract(left))
            lines.append(f"--- Page {p+1} (RIGHT) ---")
            lines.extend(sort_and_extract(right))
            lines.append("\n")

    return "\n".join(lines)

# 1) Extract and save text
raw = extract_two_column_text(PDF_PATH)
# Fix hyphenation across line breaks ("exam-\nple" -> "example")
clean = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw)
with open(TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(clean)
print(f"Text extracted to {TEXT_PATH}")

# 2) Extract and save figures/images
img_count = 0
seen_xrefs = set()
# The context manager closes the PDF handle once the images are written
with fitz.open(PDF_PATH) as doc:
    for p in range(doc.page_count):
        for img in doc.get_page_images(p):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base = doc.extract_image(xref)
            if not base:
                continue
            img_bytes = base["image"]
            ext = base["ext"]
            img_count += 1
            fname = f"figure_{img_count}.{ext}"
            with open(os.path.join(FIG_DIR, fname), "wb") as img_file:
                img_file.write(img_bytes)
print(f"{img_count} images extracted to {FIG_DIR}")

# 3) Extract and save tables
# Try lattice (ruling lines) first then stream
tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="lattice")
if not tables:
    tables = camelot.read_pdf(PDF_PATH, pages="all", flavor="stream")

for i, tbl in enumerate(tables, start=1):
    csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
    tbl.df.to_csv(csv_path, index=False)
print(f"{len(tables)} tables extracted to {TABLE_DIR}")


Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)


Text extracted to nhice_output/nhice_text.txt
2 images extracted to nhice_output/figures


Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wron

39 tables extracted to nhice_output/tables


In [1]:
import fitz               # PyMuPDF (pip install PyMuPDF)
import os
import re
import camelot            # pip install camelot-py[cv] pandas

# Input PDF and output layout for the NSW-MoH 2011 document
PDF_PATH = "TBI_documents/NSW-MoH, 2011.pdf"
OUTPUT_DIR = "nswh_output"
TEXT_PATH, FIG_DIR, TABLE_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("nswh_text.txt", "figures", "tables")
)

# Make sure every output directory exists (safe to re-run)
for path in [OUTPUT_DIR, FIG_DIR, TABLE_DIR]:
    os.makedirs(path, exist_ok=True)

def extract_two_column_text(pdf_path):
    """
    Extract the text of a two-column PDF in reading order.

    For every page, text blocks are assigned to the left or right half of the
    page by the x-coordinate of their left edge, ordered top-to-bottom within
    each half, and emitted left column first. Blocks 20 units wide or narrower
    (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One newline-joined string; each column's lines are preceded by a
        ``--- Page N (LEFT) ---`` or ``--- Page N (RIGHT) ---`` marker.
    """
    def sort_and_extract(blocks_list):
        # Sort by vertical then horizontal position
        ordered = sorted(blocks_list, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # One output line per layout line: the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for b in ordered
            for line in b["lines"]
        ]

    lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            width = page.rect.width
            # Text blocks only (type 0), wider than a small threshold
            blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and (b["bbox"][2] - b["bbox"][0]) > 20
            ]

            # Split into left and right based on midpoint
            left   = [b for b in blocks if b["bbox"][0] < width / 2]
            right  = [b for b in blocks if b["bbox"][0] >= width / 2]

            # Append in reading order: left → right
            lines.append(f"--- Page {page_num+1} (LEFT) ---")
            lines.extend(sort_and_extract(left))
            lines.append(f"--- Page {page_num+1} (RIGHT) ---")
            lines.extend(sort_and_extract(right))
            lines.append("\n")

    return "\n".join(lines)

# 1) Extract and save text
raw = extract_two_column_text(PDF_PATH)
# Fix hyphenation across line breaks ("exam-\nple" -> "example")
clean = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw)
with open(TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(clean)
print(f"Two-column text extracted to {TEXT_PATH}")

# 2) Extract and save figures/images
img_count = 0
seen_xrefs = set()
# The context manager closes the PDF handle once the images are written
with fitz.open(PDF_PATH) as doc:
    for p in range(doc.page_count):
        for img in doc.get_page_images(p):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base = doc.extract_image(xref)
            if not base:
                continue
            img_bytes = base["image"]
            ext = base["ext"]
            img_count += 1
            fname = f"figure_{img_count}.{ext}"
            with open(os.path.join(FIG_DIR, fname), "wb") as img_file:
                img_file.write(img_bytes)
print(f"{img_count} images extracted to {FIG_DIR}")

# 3) Extract and save tables
# Try lattice first; stream is only attempted when lattice fails or is empty.
# `tables` is initialised so the name exists even if both flavors raise.
tables = []
for flavor in ("lattice", "stream"):
    try:
        tables = camelot.read_pdf(PDF_PATH, pages="all", flavor=flavor)
        if tables:
            for i, tbl in enumerate(tables, start=1):
                csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
                tbl.df.to_csv(csv_path, index=False)
            print(f"{len(tables)} tables extracted using '{flavor}' flavor to {TABLE_DIR}")
            break
    except Exception as e:
        print(f"Error extracting tables with flavor '{flavor}': {e}")
else:
    # Reached only when no flavor hit the `break`, i.e. nothing was saved
    print(f"No tables extracted from {PDF_PATH}")


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 199 0 (offset 0)
Ignoring wrong pointing object 222 0 (offset 0)
Ignoring wrong pointing object 339 0 (offset 0)
Ignoring wrong pointing object 341 0 (offset 0)
Ignoring wrong pointing object 351 0 (offset 0)
Ignoring wrong pointing object 409 0 (offset 0)
Ignoring wrong pointing object 421 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 199 0 (offset 0)
Ignoring wrong pointing object 222 0 (offset 0)
Ignoring wrong pointing object 339 0 (offset 0)
Ignoring wrong pointing object 341 0 (offset 0)
Igno

Two-column text extracted to nswh_output/nswh_text.txt
9 images extracted to nswh_output/figures


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 199 0 (offset 0)
Ignoring wrong pointing object 222 0 (offset 0)
Ignoring wrong pointing object 339 0 (offset 0)
Ignoring wrong pointing object 341 0 (offset 0)
Ignoring wrong pointing object 351 0 (offset 0)
Ignoring wrong pointing object 409 0 (offset 0)
Ignoring wrong pointing object 421 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 52 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 199 0 (offset 0)
Ignoring wrong pointing object 222 0 (offset 0)
Ignoring wrong pointing object 339 0 (offset 0)
Ignoring wrong pointing object 341 0 (offset 0)
Igno

69 tables extracted using 'lattice' flavor to nswh_output/tables


In [2]:
import fitz               # PyMuPDF (pip install PyMuPDF)
import os
import re
import camelot            # pip install camelot-py[cv] pandas

# Input PDF and output layout for the SCN 2013 document
PDF_PATH = "TBI_documents/SCN, 2013.pdf"
OUTPUT_DIR = "scn_output"
TEXT_PATH, FIG_DIR, TABLE_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("scn_text.txt", "figures", "tables")
)

# Make sure every output directory exists (safe to re-run)
for path in [OUTPUT_DIR, FIG_DIR, TABLE_DIR]:
    os.makedirs(path, exist_ok=True)

def extract_two_column_text(pdf_path):
    """
    Extract the text of a two-column PDF in reading order.

    For every page, text blocks are assigned to the left or right half of the
    page by the x-coordinate of their left edge, ordered top-to-bottom within
    each half, and emitted left column first. Blocks 20 units wide or narrower
    (page coordinates) are skipped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One newline-joined string; each column's lines are preceded by a
        ``--- Page N (LEFT) ---`` or ``--- Page N (RIGHT) ---`` marker.
    """
    def sort_and_extract(blocks_list):
        # Sort blocks top-down (by y0) then left-right (by x0)
        ordered = sorted(blocks_list, key=lambda b: (b["bbox"][1], b["bbox"][0]))
        # One output line per layout line: the join of its spans
        return [
            "".join(span["text"] for span in line["spans"])
            for block in ordered
            for line in block["lines"]
        ]

    all_lines = []
    # The context manager closes the PDF when extraction finishes or fails
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]
            width = page.rect.width
            # Text blocks only (type 0), wider than a small threshold
            blocks = [
                b for b in page.get_text("dict")["blocks"]
                if b["type"] == 0 and (b["bbox"][2] - b["bbox"][0]) > 20
            ]

            # Split blocks into left and right columns
            left_blocks  = [b for b in blocks if b["bbox"][0] < width / 2]
            right_blocks = [b for b in blocks if b["bbox"][0] >= width / 2]

            # Append text in reading order: left column, then right column
            all_lines.append(f"--- Page {page_num+1} (LEFT) ---")
            all_lines.extend(sort_and_extract(left_blocks))
            all_lines.append(f"--- Page {page_num+1} (RIGHT) ---")
            all_lines.extend(sort_and_extract(right_blocks))
            all_lines.append("\n")

    return "\n".join(all_lines)

# 1) Extract and save two-column text
raw_text = extract_two_column_text(PDF_PATH)
# Fix hyphenation across line breaks ("exam-\nple" -> "example")
clean_text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', raw_text)
with open(TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(clean_text)
print(f"Two-column text extracted to {TEXT_PATH}")

# 2) Extract and save figures/images
img_count = 0
seen_xrefs = set()
# The context manager closes the PDF handle once the images are written
with fitz.open(PDF_PATH) as doc:
    for p in range(doc.page_count):
        for img in doc.get_page_images(p):
            xref = img[0]
            # An image referenced from several pages is saved only once
            if xref in seen_xrefs:
                continue
            seen_xrefs.add(xref)
            base = doc.extract_image(xref)
            if not base:
                continue
            img_bytes = base["image"]
            ext = base["ext"]
            img_count += 1
            fname = f"figure_{img_count}.{ext}"
            with open(os.path.join(FIG_DIR, fname), "wb") as img_file:
                img_file.write(img_bytes)
print(f"{img_count} figures extracted to {FIG_DIR}")

# 3) Extract and save tables
# Try lattice first, then stream; a failing flavor is reported and skipped
tables = []
for flavor in ("lattice", "stream"):
    try:
        tables = camelot.read_pdf(PDF_PATH, pages="all", flavor=flavor)
        if tables:
            print(f"Detected {len(tables)} tables using flavor='{flavor}'.")
            break
    except Exception as e:
        print(f"Error with flavor='{flavor}': {e}")
# Save tables
for i, tbl in enumerate(tables, start=1):
    csv_path = os.path.join(TABLE_DIR, f"table_{i}.csv")
    tbl.df.to_csv(csv_path, index=False)
print(f"{len(tables)} tables saved to {TABLE_DIR}")


Two-column text extracted to scn_output/scn_text.txt


Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 2974 0 (offset 0)
Ignoring wrong pointing object 2982 0 (offset 0)
Ignoring wrong pointing object 3008 0 (offset 0)
Ignoring wrong pointing object 3042 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 2974 0 (offset 0)
Ignoring wrong pointing object 2982 0 (offset 0)
Ignoring wrong pointing object 3008 0 (offset 0)
Ignoring wrong pointing object 3042 0 (offset 0)


679 figures extracted to scn_output/figures


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 2974 0 (offset 0)
Ignoring wrong pointing object 2982 0 (offset 0)
Ignoring wrong pointing object 3008 0 (offset 0)
Ignoring wrong pointing object 3042 0 (offset 0)
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 2974 0 (offset 0)
Ignoring wrong pointing object 2982 0 (offset 0)
Ignoring wrong pointing object

Detected 2 tables using flavor='lattice'.
2 tables saved to scn_output/tables
