In [1]:
# Install dependencies
!pip install pdfplumber PyMuPDF camelot-py[cv] pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting camelot-py[cv]
  Downloading camelot_py-1.0.9-py3-none-any.whl.metadata (9.8 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pypdf<6.0,>=4.0 (from camelot-py[cv])
  Downloading pypdf-5.9.0-py3-none-

In [2]:
import os
import fitz  # PyMuPDF
import pdfplumber
import camelot
import json
import re

In [15]:
# ==== MAIN EXTRACTION FUNCTION ====
def _extract_paragraphs(page):
    """Return paragraph records built from the text of one pdfplumber page."""
    text = page.extract_text() or ""
    if not text.strip():
        return []
    return [
        {
            "type": "paragraph",
            "section": sec.get("section"),
            "sub_section": sec.get("sub_section"),
            "text": sec.get("content"),
        }
        for sec in split_into_sections(text)
    ]


def _extract_tables(pdf_path, page_number):
    """Return table records camelot finds on one page (best effort)."""
    records = []
    try:
        tables = camelot.read_pdf(pdf_path, pages=str(page_number))
        for i, table in enumerate(tables, start=1):
            records.append({
                "type": "table",
                "section": f"Table {i}",
                "description": None,
                "table_data": table.df.values.tolist(),
            })
    except Exception as exc:
        # Table detection stays best-effort (a failing page must not abort the
        # whole run), but the failure is reported instead of silently dropped.
        # Tables collected before the failure are still returned.
        print(f"Warning: table extraction failed on page {page_number}: {exc}")
    return records


def _extract_charts(pdf, page_number, charts_path):
    """Save every embedded image of one page as PNG and return chart records."""
    records = []
    page_fitz = pdf[page_number - 1]  # PyMuPDF pages are 0-based
    for i, img in enumerate(page_fitz.get_images(full=True), start=1):
        xref = img[0]
        pix = fitz.Pixmap(pdf, xref)
        # PNG cannot store CMYK. Subtracting the alpha channel leaves the number
        # of colour components, so this catches CMYK with and without alpha
        # (a plain `pix.n > 4` test misses CMYK pixmaps that have no alpha).
        if pix.n - pix.alpha > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        chart_filename = f"chart_page{page_number}_{i}.png"
        pix.save(os.path.join(charts_path, chart_filename))
        records.append({
            "type": "chart",
            "section": f"Chart {i}",
            "file": chart_filename,
            "description": "Extracted image (possible chart/figure)",
        })
    return records


def extract_pdf_content(pdf_path, output_dir, json_name="output.json", charts_dir="charts"):
    """
    Extracts structured content (paragraphs, tables, charts) from a PDF
    and saves it as JSON. Embedded images are written out as PNG files.

    For every page the function records, in this order: paragraph entries
    (page text split by split_into_sections), table entries (camelot) and
    chart entries (one per embedded image found by PyMuPDF).

    pdf_path: path to input PDF
    output_dir: directory for the results; created if it does not exist
    json_name: file name of the JSON document written inside output_dir
    charts_dir: sub-directory of output_dir that receives the PNG images

    Returns None. The result is written to disk and a completion message
    is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    charts_path = os.path.join(output_dir, charts_dir)
    os.makedirs(charts_path, exist_ok=True)

    json_path = os.path.join(output_dir, json_name)

    document_structure = {"pages": []}

    # Both handles are context-managed so the PDF is closed even when an
    # extraction step raises.
    with fitz.open(pdf_path) as pdf, pdfplumber.open(pdf_path) as plumber_pdf:
        for page_number, page in enumerate(plumber_pdf.pages, start=1):
            content = []
            content.extend(_extract_paragraphs(page))
            content.extend(_extract_tables(pdf_path, page_number))
            content.extend(_extract_charts(pdf, page_number, charts_path))
            document_structure["pages"].append(
                {"page_number": page_number, "content": content}
            )

    # --- Save JSON ---
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(document_structure, f, indent=4, ensure_ascii=False)

    print(f" Extraction complete! JSON saved to {json_path}, charts saved in {charts_path}")


# ==== HELPER FUNCTION ====
def split_into_sections(text):
    """
    Splits text into sections and sub-sections using regex rules.

    A line starting with "<digits>. " (e.g. "1. Overview") opens a new section
    and resets the sub-section; a line starting with a dotted number followed
    by whitespace (e.g. "1.1 Scope", "2.3.1 Details") opens a new sub-section.
    Every other non-blank line is body text for the heading currently in
    effect.

    text: raw text of a page, lines separated by "\n"

    Returns a list of dicts with the keys "section", "sub_section" and
    "content" (body lines joined with single spaces). Body text that appears
    before any section heading is filed under the section "General". A heading
    that is not followed by any body text produces no entry of its own.

    Note: the heading rules are purely positional, so any line that merely
    begins with a number in one of these shapes is treated as a heading.
    """
    sections = []
    current_section = None
    current_sub_section = None
    buffer = []

    def flush():
        # Emit the pending body text under the headings currently in effect.
        # The "General" fallback is applied on every flush so that text ahead
        # of the first heading is labelled the same way wherever it is emitted.
        if buffer:
            sections.append({
                "section": current_section or "General",
                "sub_section": current_sub_section,
                "content": " ".join(buffer).strip()
            })
            buffer.clear()

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Match section headings (1., 2., 3.)
        if re.match(r'^\d+\.\s', line):
            flush()
            current_section = line
            current_sub_section = None

        # Match sub-sections (1.1, 2.3.1, etc.)
        elif re.match(r'^\d+(\.\d+)+\s', line):
            flush()
            current_sub_section = line

        else:
            buffer.append(line)

    flush()
    return sections

In [16]:
# Source PDF to process
pdf_file = "/content/[Fund Factsheet - May]360ONE-MF-May 2025.pdf"

# Google Drive folder that receives the JSON and the extracted chart images
# NOTE(review): presumably Drive is already mounted at /content/drive — no
# mount step is visible in this notebook; confirm before a fresh run.
output_dir = "/content/drive/MyDrive/PDF_Extraction"

# Run the extraction; output lands in <output_dir>/fund_factsheet.json and
# <output_dir>/charts
extract_pdf_content(pdf_path=pdf_file, output_dir=output_dir, json_name="fund_factsheet.json", charts_dir="charts")


 Extraction complete! JSON saved to /content/drive/MyDrive/PDF_Extraction/fund_factsheet.json, charts saved in /content/drive/MyDrive/PDF_Extraction/charts
