In [3]:
# Install required libraries
!pip install PyMuPDF spacy tqdm



You should consider upgrading via the 'C:\Users\prana\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'C:\Users\prana\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [5]:
import fitz  # PyMuPDF
import spacy
import re
import json
from tqdm import tqdm

# Load Spacy NER model
nlp = spacy.load("en_core_web_sm")

In [6]:
# To Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    page_texts = []
    for page in doc:
        text = page.get_text()
        full_text += text + "\n"
        page_texts.append(text)
    return full_text, page_texts

In [7]:
def segment_document(text):
    pattern = re.compile(r"\n?\s*(\d+)\.\s+([^\n]+)")
    matches = list(pattern.finditer(text))
    segments = []

    for i, match in enumerate(matches):
        segment_title = match.group(2).strip()
        start_idx = match.start()
        end_idx = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        segment_text = text[start_idx:end_idx].strip()

        segments.append({
            "segment_level": i+1,
            "segment_title": segment_title,
            "segment_text": segment_text,
            "segment_date": extract_date(segment_text),
            #"segment_source": extract_source(segment_text),
            "start_index": start_idx,
            "end_index": end_idx,
            "named_entities": extract_named_entities(segment_text),
        })

    return segments


In [8]:
# Named Entity Recognition
def extract_named_entities(segment_text):
    doc = nlp(segment_text)
    entities = {"persons": [], "organizations": [], "locations": [], "dates": []}

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["persons"].append(ent.text)
        elif ent.label_ == "ORG":
            entities["organizations"].append(ent.text)
        elif ent.label_ == "GPE":
            entities["locations"].append(ent.text)
        elif ent.label_ == "DATE":
            entities["dates"].append(ent.text)

    return {k: list(set(v)) for k, v in entities.items()}

In [9]:
# For extracting date
def extract_date(text):
    dates = re.findall(r"\b(?:\d{4}|\d{2}/\d{2}/\d{4})\b", text)
    return dates[0] if dates else None

In [10]:
# Full Pipeline
def process_pdf(pdf_path):
    full_text, _ = extract_text_from_pdf(pdf_path)
    segments = segment_document(full_text)

    for seg in tqdm(segments, desc="Processing segments"):
        seg["named_entities"] = extract_named_entities(seg["segment_text"])

    return segments

In [11]:
import json 

if __name__ == "__main__":
    pdf_path = r"D:\sample_unique_multipage_report.pdf"
    output_segments = process_pdf(pdf_path)

    # Print result as formatted JSON to terminal
    print(json.dumps(output_segments, indent=4, ensure_ascii=False))


Processing segments: 100%|███████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 57.48it/s]

[
    {
        "segment_level": 1,
        "segment_title": "Market Overview",
        "segment_text": "1. Market Overview\nMarket Overview prepared by Research Team A highlights critical developments in the sector.\nKey data was collected from international institutions such as World Bank. Primary analysis was led\nby Alice Johnson, \nwith support from on-ground teams in New York. This segment consolidates relevant findings up to\n2022-11-01, \npresenting an actionable summary for stakeholders.",
        "segment_date": "2022",
        "start_index": 22,
        "end_index": 413,
        "named_entities": {
            "persons": [
                "Alice Johnson"
            ],
            "organizations": [
                "Research Team",
                "World Bank"
            ],
            "locations": [
                "New York"
            ],
            "dates": [
                "2022-11-01"
            ]
        }
    },
    {
        "segment_level": 2,
        "segment_




In [12]:
# We can convert Output to JSON
def save_to_json(segments, output_path="output.json"):
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(segments, f, indent=4, ensure_ascii=False)

# Run the pipeline
if __name__ == "__main__":
    pdf_path = "D:\sample_unique_multipage_report.pdf"
    output_segments = process_pdf(pdf_path)
    save_to_json(output_segments)


Processing segments: 100%|███████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 59.71it/s]
