In [None]:
import json
from typing import List

import fitz
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field

# Load variables from a local .env file into the process environment before any
# client is constructed (presumably where the Groq API credentials come from —
# confirm against the deployment setup).
load_dotenv()

In [None]:
# Collect the raw text of the table of contents: locate the first page whose
# text starts with "CONTENTS" and concatenate it with the pages that follow,
# up to TOC_PAGE_SPAN pages in total (clamped to the end of the document).
TOC_PAGE_SPAN = 6

# The context manager guarantees the document is closed even if a page read fails.
with fitz.open("/workspace/input/ES 22-23.pdf") as doc:
    toc_start = next(
        (
            page_no
            for page_no in range(doc.page_count)
            if doc[page_no].get_text().startswith("CONTENTS")
        ),
        None,
    )

    # Fail loudly instead of silently treating the last page as the TOC (or
    # hitting a NameError on an empty document) when no contents page exists.
    if toc_start is None:
        raise ValueError('No page starting with "CONTENTS" was found in the PDF')

    toc_end = min(toc_start + TOC_PAGE_SPAN, doc.page_count)
    toc = "".join(doc[page_no].get_text() for page_no in range(toc_start, toc_end))

In [None]:
# Schema for a single table-of-contents entry: a title paired with the number
# of the chapter it is listed under. Both values are kept as strings.
# The Field descriptions are part of the schema, so treat them as runtime text.
class SubChapter(BaseModel):
    chapter_no: str = Field(description="Chapter number of the subchapter")
    title: str = Field(description="Title of the subchapter as written in TOC")


# Top-level schema passed to the LLM's structured-output wrapper: one flat list
# holding every SubChapter entry extracted from the table of contents.
class ChapterStructure(BaseModel):
    subchapters: List[SubChapter] = Field(
        description="List of all subchapters with their titles and chapter numbers"
    )

In [None]:
# Chat model used for the extraction, configured with a low sampling temperature.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.1)

# Wrap the model so its reply is parsed into the ChapterStructure schema.
structured_llm = llm.with_structured_output(ChapterStructure)

# Extraction instructions followed by the raw TOC text gathered from the PDF.
toc_prompt = f"""
        You are given the following Table of Contents of an Economic Survey document.
        Extract each subchapter entry with its chapter number and exact title as written in the TOC.

        Rules:
        1. Include ALL subchapter titles under each chapter
        2. Do not summarize or merge entries; return them exactly as written
        3. Include chapter main titles as the first subchapter of each chapter
        4. Maintain the exact spelling and formatting of titles
        5. Skip page numbers, just focus on chapter numbers and titles

        TOC:
        {toc}
"""

response = structured_llm.invoke(toc_prompt)

In [None]:
# Group the extracted titles by chapter number, keeping the order in which the
# model returned them: {chapter_no: [title, title, ...]}.
chapter_structure = {}
for subchapter in response.subchapters:
    chapter_structure.setdefault(subchapter.chapter_no, []).append(subchapter.title)

# Serialise the dict directly. Round-tripping through str(dict) with quote
# replacement produced invalid JSON for any title containing an apostrophe or
# a double quote, so json.dump is given the real object instead.
# chapter_structure stays a dict afterwards rather than being rebound to a string.
# The with-block closes the output file deterministically.
with open(
    "/workspace/output/chapter_structure.json", "w", encoding="utf-8"
) as out_file:
    json.dump(chapter_structure, out_file)