In [1]:
import json
import os
from typing import List

import fitz
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field

load_dotenv()

True

In [2]:
# Directory scanned for input PDFs, and the root under which results are written.
src_dir, out_dir = "/workspace/input", "/workspace/output/01"

# Make sure the output root is present; an already-existing directory is fine.
os.makedirs(name=out_dir, exist_ok=True)

In [3]:
# One table-of-contents entry in the structured output requested from the LLM.
# Documented with comments rather than a docstring on purpose: pydantic copies a
# model's docstring into its JSON schema description, so adding one would change
# the schema handed to `with_structured_output`.
class SubChapter(BaseModel):
    # Chapter the entry belongs to; kept as a string and used as the grouping
    # key when process_toc builds the per-chapter title lists.
    chapter_no: str = Field(description="Chapter number of the subchapter")
    # Entry title; the Field description is part of the schema given to the LLM.
    title: str = Field(description="Title of the subchapter as written in TOC")


# Top-level structured-output schema passed to `llm.with_structured_output`:
# a flat list of SubChapter entries that process_toc later groups by chapter
# number. Comments are used instead of a docstring so the generated JSON schema
# (which would pick up a docstring as its description) stays unchanged.
class ChapterStructure(BaseModel):
    # Flat list of entries; process_toc iterates over it in the returned order.
    subchapters: List[SubChapter] = Field(
        description="List of all subchapters with their titles and chapter numbers"
    )

In [4]:
def _group_titles_by_chapter(subchapters):
    """Group subchapter titles under their chapter number, preserving order.

    Typographic apostrophes (’) are normalised to straight ones (') in both
    chapter numbers and titles, matching what the saved JSON has always
    contained.
    """
    grouped = {}
    for subchapter in subchapters:
        chapter_no = subchapter.chapter_no.replace("’", "'")
        title = subchapter.title.replace("’", "'")
        grouped.setdefault(chapter_no, []).append(title)
    return grouped


def process_toc(toc, save_path):
    """Extract the chapter/subchapter structure from TOC text and save it as JSON.

    The TOC text is sent to a Groq-hosted LLM with ``ChapterStructure`` as the
    structured-output schema. The returned entries are grouped into
    ``{chapter_no: [title, ...]}`` and written to ``save_path``.

    Args:
        toc: Raw text of the table-of-contents pages.
        save_path: Destination path of the JSON file (overwritten if present).

    Returns:
        None. The result is only written to ``save_path``.
    """
    llm = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.1,
    )

    structured_llm = llm.with_structured_output(ChapterStructure)

    response = structured_llm.invoke(f"""
            You are given the following Table of Contents of an Economic Survey document.
            Extract each subchapter entry with its chapter number and exact title as written in the TOC.

            Rules:
            1. Include ALL subchapter titles under each chapter
            2. Do not summarize or merge entries; return them exactly as written
            3. Include chapter main titles as the first subchapter of each chapter
            4. Maintain the exact spelling and formatting of titles
            5. Skip page numbers, just focus on chapter numbers and titles

            TOC:
            {toc}
    """)

    chapter_structure = _group_titles_by_chapter(response.subchapters)

    # Serialise the dict directly. Round-tripping through str(dict) with quote
    # replacement produced invalid JSON for titles containing apostrophes,
    # double quotes, backslashes or escaped characters such as "\xa0".
    # UTF-8 is set explicitly because ensure_ascii=False writes raw non-ASCII.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(chapter_structure, f, ensure_ascii=False)


In [5]:
def _extract_toc_text(doc, toc_end_page=6):
    """Return the concatenated text of the table-of-contents pages of ``doc``.

    The TOC starts at the first page whose text begins with "CONTENTS" and runs
    up to (not including) page index ``toc_end_page``, clamped to the document
    length so short documents do not raise IndexError. Returns "" when no such
    page exists or it lies at or beyond ``toc_end_page``.
    """
    for page_no in range(doc.page_count):
        if doc[page_no].get_text().startswith("CONTENTS"):
            last_page = min(toc_end_page, doc.page_count)
            return "".join(doc[m].get_text() for m in range(page_no, last_page))
    return ""


def _find_chapter_starts(doc):
    """Return the indices of pages whose text begins with a "CHAPTER" heading."""
    starts = []
    for page_no in range(doc.page_count):
        text = doc[page_no].get_text()
        # The heading is accepted at offset 0 or offset 3 of the page text
        # (presumably to tolerate a few leading characters — TODO confirm).
        if text.startswith("CHAPTER") or text.startswith("CHAPTER", 3):
            starts.append(page_no)
    return starts


def _save_chapters(doc, chapter_starts, out_path):
    """Write each chapter's page range to ``<out_path>/<n>.pdf`` (n starts at 1)."""
    for i, start in enumerate(chapter_starts):
        # A chapter ends on the page before the next chapter starts; the last
        # chapter runs to the end of the document.
        if i + 1 < len(chapter_starts):
            end = chapter_starts[i + 1] - 1
        else:
            end = doc.page_count - 1

        temp_doc = fitz.open()
        try:
            temp_doc.insert_pdf(doc, from_page=start, to_page=end)
            temp_doc.save(os.path.join(out_path, f"{i + 1}.pdf"))
        finally:
            temp_doc.close()


# Process every PDF in src_dir: extract its TOC structure, then split it into
# one PDF per chapter. Sorted so runs are deterministic across platforms.
for src in sorted(os.listdir(src_dir)):
    if not src.lower().endswith(".pdf"):
        continue

    file_name = os.path.splitext(src)[0].replace(" ", "_")
    out_path = os.path.join(out_dir, file_name)
    os.makedirs(out_path, exist_ok=True)

    doc = fitz.open(os.path.join(src_dir, src))
    try:
        toc = _extract_toc_text(doc)
        if toc.strip():
            process_toc(toc, os.path.join(out_path, "chapter_structure.json"))
        else:
            # Do not call the LLM with an empty TOC; it could only return a
            # meaningless structure.
            print(f"No 'CONTENTS' page found within the first 6 pages of {src}; "
                  "skipping chapter_structure.json")

        _save_chapters(doc, _find_chapter_starts(doc), out_path)
    finally:
        # Release the file handle even if TOC extraction or splitting fails.
        doc.close()