In [None]:
import re

from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field

# Load variables from a local .env file into the process environment before any
# client is constructed. NOTE(review): presumably this is where the Groq API
# key used by ChatGroq comes from - confirm against the project's .env.
load_dotenv()

In [None]:
# Root directory of the per-source output folders. In the cells shown it is
# only referenced by the commented-out batch loop at the end of the notebook.
in_path = "/workspace/output"
# Single Docling converter instance, reused for every PDF converted below.
converter = DocumentConverter()


# Structured-output schema handed to `llm.with_structured_output(...)` in
# `generate_chapter_summary`; the model's reply is parsed into this shape.
class ChapterSummary(BaseModel):
    # The only field: the free-text summary returned to the caller.
    summary: str = Field(description="Summary of the chapter content")


# Shared Groq chat client used by `generate_chapter_summary`.
llm = ChatGroq(
    # Alternative model kept for quick switching:
    # model="llama-3.3-70b-versatile",
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    # Low temperature; presumably chosen to keep summaries stable between runs.
    temperature=0.1,
)

In [None]:
def generate_chapter_summary(chapter_content):
    """Ask the LLM for a structured summary of one chapter and return its text.

    Args:
        chapter_content: Chapter text; it is interpolated into the prompt as-is.

    Returns:
        The ``summary`` attribute of the structured response produced by the
        module-level ``llm`` bound to the ``ChapterSummary`` schema.
    """
    summarizer = llm.with_structured_output(ChapterSummary)

    # The prompt (including its blank lines and indentation) is sent verbatim.
    prompt = f"""

    Summarize the given chapter of the document using 400-500 words, include all important metrics:

    {chapter_content}

    """

    result = summarizer.invoke(prompt)
    return result.summary

In [None]:
# Section headings per chapter, keyed by chapter number (as a string), exactly
# as transcribed - including typographic apostrophes (U+2019).
chapter_headings_raw = {
    "1": [
        "State of the Economy 2022-23: Recovery Complete",
        "The Global Economy Battles Through a Unique Set of Challenges",
        "Macroeconomic and Growth Challenges in the Indian Economy",
        "India\u2019s Economic Resilience and Growth Drivers",
        "India\u2019s Inclusive Growth",
        "Outlook: 2023-24",
    ],
}

# Normalise U+2019 to a plain ASCII apostrophe. A literal one-character swap
# needs no regex, so `str.replace` is used; building the result from a
# separately named raw dict keeps this cell idempotent and avoids rebinding
# `temp_json` from un-normalised to normalised data.
# NOTE(review): presumably done so the headings match the converted chapter
# text - confirm which apostrophe the Markdown export actually contains.
temp_json = {
    chapter: [heading.replace("\u2019", "'") for heading in headings]
    for chapter, headings in chapter_headings_raw.items()
}

In [None]:
# Convert the chapter 1 PDF with the shared converter, then export the parsed
# document as a single Markdown string for the cells below.
chapter_pdf_path = "/workspace/output/ES_22-23/1.pdf"
conversion_result = converter.convert(chapter_pdf_path)
pdf = conversion_result.document
chapter_content = pdf.export_to_markdown()

In [None]:
# Summarise the whole chapter in one LLM call and show the resulting text.
summary = generate_chapter_summary(chapter_content)
print(summary)

In [None]:
# Inspect the raw Markdown export. NOTE(review): this renders the entire
# chapter string as cell output; consider clearing the output before sharing.
chapter_content

In [None]:
import re

def _split_by_headings(content, headings):
    """Cut ``content`` into one section per heading that occurs in it.

    Each heading is located at its first case-insensitive occurrence. A
    section runs from its heading up to the start of the next located heading
    (by position in ``content``), or to the end of ``content`` for the last
    one, so sections no longer overlap everything that follows them.

    Args:
        content: Full chapter text.
        headings: Heading strings to look for; matched literally.

    Returns:
        Dict mapping each heading that was found to its section text, in the
        order the headings were given. Headings that do not occur are omitted.
    """
    starts = {}
    for heading in headings:
        # The escaped pattern contains no ^/$ anchors, so re.MULTILINE would
        # have no effect and is not passed.
        match = re.search(re.escape(heading), content, re.IGNORECASE)
        if match:
            starts[heading] = match.start()

    # Candidate section ends: every located heading start plus end-of-text.
    boundaries = sorted(set(starts.values())) + [len(content)]

    sections = {}
    for heading, start in starts.items():
        end = next((b for b in boundaries if b > start), len(content))
        sections[heading] = content[start:end]
    return sections


split_content = _split_by_headings(chapter_content, temp_json["1"])

# Report headings that could not be located instead of dropping them silently.
missing_headings = [h for h in temp_json["1"] if h not in split_content]
if missing_headings:
    print(f"Headings not found in chapter text: {missing_headings}\n")

# Print length of each value in the dict
for heading, content in split_content.items():
    print(f"Heading: {heading}\nLength: {len(content)} characters\n")

In [None]:
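# NOTE: disabled draft of the batch run over every source folder. Before
# re-enabling: `os` and `json` are not imported in the import cell, and the
# chapter loop below sits outside the `if os.path.exists(...)` guard, so it
# would use a missing or stale `chapter_structure` when a folder has no
# chapter_structure.json.
# for src in os.listdir(in_path):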
#     json_file_path = os.path.join(in_path, src, "chapter_structure.json")
#     if os.path.exists(json_file_path):
#         with open(json_file_path) as f:
#             chapter_structure = json.load(f)

#     for i in chapter_structure.keys():
#         chapter_path = os.path.join(in_path, src, f"{i}.pdf")
#         if os.path.exists(chapter_path):
#             pdf = converter.convert(chapter_path).document
#             chapter_content = pdf.export_to_markdown()
#             summary = generate_chapter_summary(chapter_content)