In [None]:
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List

from dotenv import load_dotenv
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field

load_dotenv()

True

In [None]:
# Initialize LLM
# Single shared chat-model client. generate_chapter_summary() and
# generate_subchapter_tags() both derive their structured-output runnables
# from it via llm.with_structured_output(...).
# NOTE(review): no API key is passed explicitly — presumably ChatGroq reads it
# from the environment populated by load_dotenv() above; confirm.
llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    # Low sampling temperature; presumably chosen to keep summaries/tags stable
    # between runs — confirm.
    temperature=0.1,
)


# Pydantic Models
# Structured-output schema handed to llm.with_structured_output() when a whole
# chapter is summarised; callers read the single `summary` field of the result.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — pydantic exposes class docstrings as the schema description, which
# would likely change what is sent to the model. Confirm before converting.
class ChapterSummary(BaseModel):
    # Required field (`...` means "no default"); the description text is part
    # of the schema.
    summary: str = Field(
        ...,
        description="Comprehensive summary of the chapter retaining all facts and figures",
    )


# Structured-output schema handed to llm.with_structured_output() when tagging
# a single subchapter/chunk; callers read the `tags` list of the result.
# Kept as `#` comments (not a docstring) so the schema description is unchanged.
class SubchapterTags(BaseModel):
    # Required list of strings. `max_length=3` is the only size constraint:
    # the description asks for 1-3 tags, but no minimum is declared here.
    # NOTE(review): `max_length` limiting list size assumes pydantic v2
    # semantics — confirm the installed version.
    tags: List[str] = Field(
        ..., description="List of 1-3 relevant tags for knowledge graph", max_length=3
    )


# Describes one processed chunk record: chapter number, subchapter number,
# combined text, and tags.
# NOTE(review): this model is not instantiated in the visible cells —
# process_chunk() builds a plain dict with the same four keys instead.
class ChunkOutput(BaseModel):
    # All four fields are required (`...` means "no default").
    chapter_no: int = Field(..., description="Chapter number")
    subchapter_no: int = Field(..., description="Subchapter/chunk number")
    content: str = Field(..., description="Chapter summary + subchapter content")
    tags: List[str] = Field(..., description="Generated tags for the subchapter")

In [None]:
def read_file_content(file_path: str) -> str:
    """Return the UTF-8 text stored at ``file_path``.

    This is a best-effort reader: any exception raised while opening or
    reading the file is reported on stdout and an empty string is returned
    instead, so callers must treat ``""`` as "unreadable or empty".
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            text = handle.read()
    except Exception as exc:
        print(f"Error reading file {file_path}: {exc}")
        return ""
    return text


def extract_chapter_number(filename: str) -> int:
    """Return the integer following the first ``chapter_`` in ``filename``.

    Falls back to ``0`` when the name contains no ``chapter_<digits>`` token.
    """
    found = re.search(r"chapter_(\d+)", filename)
    if found is None:
        return 0
    return int(found.group(1))


def extract_chunk_number(filename: str) -> int:
    """Return the integer following the first ``chunk_`` in ``filename``.

    Falls back to ``0`` when the name contains no ``chunk_<digits>`` token.
    """
    found = re.search(r"chunk_(\d+)", filename)
    if found is None:
        return 0
    return int(found.group(1))

In [None]:
def _build_summary_prompt(chapter_content: str) -> str:
    """Return the chapter-summarisation prompt with ``chapter_content`` embedded."""
    return f"""You are tasked with creating a comprehensive summary of an economic survey chapter.

    Instructions:
    - Retain ALL numerical data, statistics, percentages, and figures mentioned
    - Include key policy recommendations and findings  
    - Maintain the factual accuracy of economic indicators
    - Keep the summary detailed enough to understand the chapter's main points
    - Focus on economic trends, challenges, and outlook presented

    Chapter Content:
    {chapter_content}

    Generate a detailed summary that preserves all important facts and figures.
    """


def _parse_llm_error(exc: Exception):
    """Best-effort extraction of ``(status_code, error_dict)`` from a client error.

    The status code comes from the exception's ``status_code`` attribute when
    present, otherwise from an ``Error code: NNN`` prefix in its message;
    it is ``None`` if neither is available. ``error_dict`` is the payload's
    ``error`` object, or ``{"message": str(exc)}`` when no payload can be
    recovered.
    """
    import ast  # local import: only needed on the error path

    text = str(exc)

    status = getattr(exc, "status_code", None)
    if not isinstance(status, int):
        found = re.search(r"Error code:\s*(\d{3})", text)
        status = int(found.group(1)) if found else None

    body = getattr(exc, "body", None)
    if not isinstance(body, dict):
        body = None
        start = text.find("{")
        if start != -1:
            payload = text[start:]
            # The message usually embeds a Python dict repr (single quotes),
            # which json.loads rejects; literal_eval handles it safely.
            for parse in (json.loads, ast.literal_eval):
                try:
                    candidate = parse(payload)
                except Exception:
                    continue
                if isinstance(candidate, dict):
                    body = candidate
                    break

    # Accept both {"error": {...}} and an already-unwrapped error object.
    error = body.get("error", body) if body else None
    if not isinstance(error, dict):
        error = {"message": text}
    return status, error


def generate_chapter_summary(
    chapter_content: str,
    max_chars: int = 30000,
    max_rate_limit_retries: int = 5,
    rate_limit_wait: float = 60,
) -> str:
    """Summarise a chapter with the shared ``llm``, recovering from 413/400/429.

    Args:
        chapter_content: Full text of the chapter.
        max_chars: Length the content is cut to on the first "request too
            large" (413) response; if the content is already that short it is
            halved on each further 413 until the request fits.
        max_rate_limit_retries: Maximum number of retries after rate-limit
            (429) responses before giving up.
        rate_limit_wait: Seconds to sleep before each rate-limit retry.

    Returns:
        The generated summary. On a 400 ``tool_use_failed`` response the
        provider's ``failed_generation`` payload is returned when present.
        When recovery is impossible a ``"Summary generation failed..."``
        message is returned instead of raising.
    """
    structured_llm = llm.with_structured_output(ChapterSummary)
    content = chapter_content
    rate_limit_retries = 0

    while True:
        try:
            result = structured_llm.invoke(_build_summary_prompt(content))
            return result.summary
        except Exception as e:
            status, error = _parse_llm_error(e)

        # `or ""` guards against explicit nulls in the error payload.
        error_code = str(error.get("code") or "")
        error_msg = str(error.get("message") or "")

        # --- Handle 413: Request too large ---
        if status == 413 or "Request too large" in error_msg:
            if len(content) > max_chars:
                shorter = content[:max_chars]
            else:
                # Already within the nominal limit: keep shrinking so the
                # retry is guaranteed to differ from the failed request.
                shorter = content[: len(content) // 2]
            if not shorter:
                print("❌ 413 error: content cannot be trimmed any further.")
                return f"Summary generation failed: {error_msg}"
            print("⚠️ 413 error: input too large. Trimming content and retrying...")
            content = shorter
            continue

        # --- Handle 400: tool_use_failed, return failed_generation ---
        if status == 400 or "tool_use_failed" in error_code:
            failed_text = error.get("failed_generation")
            if failed_text:
                print("⚠️ 400 error: returning failed_generation output.")
                return failed_text
            return f"Summary generation failed with 400: {error_msg}"

        # --- Handle 429: Rate limit exceeded ---
        if status == 429 or "rate_limit" in error_code:
            if rate_limit_retries >= max_rate_limit_retries:
                print("❌ 429 error: rate limit retries exhausted.")
                return f"Summary generation failed: {error_msg}"
            rate_limit_retries += 1
            print(
                f"⚠️ 429 error: rate limit exceeded. Waiting {rate_limit_wait:g}s before retry..."
            )
            time.sleep(rate_limit_wait)
            continue

        # --- Other errors ---
        print(f"❌ Unhandled error: {error_msg}")
        return f"Summary generation failed: {error_msg}"

In [None]:
def generate_subchapter_tags(subchapter_content: str) -> List[str]:
    """Ask the shared ``llm`` for knowledge-graph tags describing a subchapter.

    The model is invoked through a ``SubchapterTags`` structured-output
    runnable. Any exception raised by the call is printed and the sentinel
    list ``["processing_error"]`` is returned instead.
    """
    tagger = llm.with_structured_output(SubchapterTags)

    tagging_prompt = f"""Analyze the provided economic survey subchapter content and generate 1-3 relevant tags.

    Instructions:
    - Tags should be suitable for a knowledge graph
    - Focus on main economic themes, sectors, or concepts
    - Use concise, descriptive terms (e.g., "GDP_Growth", "Inflation", "Manufacturing", "Trade_Policy")
    - Avoid generic tags like "economics" or "data"
    - Prioritize the most specific and relevant topics

    Subchapter Content:
    {subchapter_content}

    Generate up to 3 specific tags that best represent this content for knowledge graph categorization."""

    try:
        tags = tagger.invoke(tagging_prompt).tags
    except Exception as exc:
        print(f"Error generating tags: {exc}")
        return ["processing_error"]
    return tags

In [None]:
def discover_chapters_and_chunks(base_path: str) -> Dict:
    """Map every chapter under ``base_path`` to its chapter file and chunk files.

    Expected layout: ``<base_path>/chapter_<N>/`` directories, each optionally
    holding ``chapter_<N>.md`` plus any number of ``chunk_<M>.md`` files.

    Args:
        base_path: Directory that contains the ``chapter_*`` sub-directories.

    Returns:
        ``{chapter_no: {"chapter_file": str | None, "chunk_files":
        [(chunk_no, path), ...]}}`` with chapters inserted in ascending
        chapter number and chunks sorted by chunk number. ``chapter_file`` is
        ``None`` when ``chapter_<N>.md`` does not exist. An empty dict is
        returned (with a warning) when ``base_path`` is not a directory, so
        callers can report "no chapters" instead of crashing.
    """
    root = Path(base_path)
    chapters_data = {}

    # A missing or non-directory base path would make iterdir() raise.
    if not root.is_dir():
        print(f"Warning: base path {root} is not a directory")
        return chapters_data

    # Find all chapter directories
    chapter_dirs = sorted(
        (d for d in root.iterdir() if d.is_dir() and d.name.startswith("chapter_")),
        key=lambda d: extract_chapter_number(d.name),
    )

    for chapter_dir in chapter_dirs:
        # extract_chapter_number yields 0 for names without digits.
        chapter_no = extract_chapter_number(chapter_dir.name)

        # Main chapter file is looked up by the parsed number, not the raw name.
        chapter_file = chapter_dir / f"chapter_{chapter_no}.md"

        # Find chunk files in this chapter
        chunk_files = sorted(
            (
                f
                for f in chapter_dir.iterdir()
                if f.is_file()
                and f.name.startswith("chunk_")
                and f.name.endswith(".md")
            ),
            key=lambda f: extract_chunk_number(f.name),
        )

        chapters_data[chapter_no] = {
            "chapter_file": str(chapter_file) if chapter_file.exists() else None,
            "chunk_files": [
                (extract_chunk_number(f.name), str(f)) for f in chunk_files
            ],
        }

    total_chunks = sum(len(data["chunk_files"]) for data in chapters_data.values())
    print(f"Discovered {total_chunks} chunks across {len(chapters_data)} chapters")

    return chapters_data

In [None]:
def process_chapter_summary(chapter_file_path: str, chapter_no: int) -> str:
    """Read a chapter file and return its LLM-generated summary.

    Placeholder strings are returned instead of a summary when the path is
    empty or does not exist ("file not found") and when the file yields no
    content ("content not readable").
    """
    # Guard: nothing to summarise without an existing file.
    if not (chapter_file_path and os.path.exists(chapter_file_path)):
        print(f"Warning: Chapter {chapter_no} file not found at {chapter_file_path}")
        return f"Chapter {chapter_no} Summary (file not found)"

    print(f"Generating summary for Chapter {chapter_no}...")
    chapter_text = read_file_content(chapter_file_path)

    # Guard: read_file_content signals failure (or an empty file) with "".
    if not chapter_text:
        return f"Chapter {chapter_no} Summary (content not readable)"

    summary = generate_chapter_summary(chapter_text)
    print(f"Generated summary for Chapter {chapter_no} ({len(summary)} chars)")
    return summary

In [None]:
def process_chunk(
    chunk_no: int, chunk_file_path: str, chapter_no: int, chapter_summary: str
) -> Dict:
    """Build the output record for one chunk of a chapter.

    The chunk text is read from ``chunk_file_path``, tagged via
    ``generate_subchapter_tags`` and prefixed with the chapter summary.

    Returns:
        A dict with keys ``chapter_no``, ``subchapter_no``, ``content`` and
        ``tags``; ``None`` (despite the annotation) when the chunk file
        yields no content.
    """
    print(f"Processing Chapter {chapter_no}, Chunk {chunk_no}...")

    chunk_text = read_file_content(chunk_file_path)
    if chunk_text:
        chunk_tags = generate_subchapter_tags(chunk_text)

        # Summary first, then the raw chunk, separated by a blank line.
        summary_part = f"Chapter {chapter_no} Summary:\n{chapter_summary}\n\n"
        chunk_part = f"Subchapter {chunk_no} Content:\n{chunk_text}"

        record = dict(
            chapter_no=chapter_no,
            subchapter_no=chunk_no,
            content=summary_part + chunk_part,
            tags=chunk_tags,
        )

        print(f"Processed chunk {chunk_no} with tags: {chunk_tags}")
        return record

    print(f"Warning: Could not read chunk file {chunk_file_path}")
    return None

In [None]:
def save_chunk_as_json(chunk_data: Dict, output_dir: str) -> None:
    """Write one chunk record to ``<output_dir>/chapter_<C>_chunk_<S>.json``.

    Args:
        chunk_data: Record containing at least ``chapter_no`` and
            ``subchapter_no`` (used for the file name); the whole dict is
            serialised as UTF-8 JSON with non-ASCII characters preserved.
        output_dir: Target directory. It is created together with any missing
            parent directories, so nested output paths work on a fresh run.

    Failures while writing the file are printed, not raised; a failure to
    create the directory still propagates.
    """
    target_dir = Path(output_dir)
    # parents=True: without it a nested path whose parent is absent raises
    # FileNotFoundError before anything is saved.
    target_dir.mkdir(parents=True, exist_ok=True)

    filename = (
        f"chapter_{chunk_data['chapter_no']}_chunk_{chunk_data['subchapter_no']}.json"
    )
    file_path = target_dir / filename

    try:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, indent=2, ensure_ascii=False)
        print(f"Saved: {filename}")
    except Exception as e:
        print(f"Error saving {filename}: {e}")

In [None]:
def process_documents(base_path: str, output_dir: str = "processed_chunks") -> Dict:
    """Summarise, tag and save every chunk found under ``base_path``.

    Args:
        base_path: Directory scanned by ``discover_chapters_and_chunks``.
        output_dir: Directory the per-chunk JSON files are written to.

    Returns:
        ``{}`` when no chapters are discovered; otherwise a dict with
        ``total_chapters``, ``total_chunks`` (chunks that produced a record),
        ``output_directory`` and ``processed_chunks`` (one entry per such
        chunk).
    """
    print("Starting document processing...")

    # Discover all chapters and chunks
    chapters_data = discover_chapters_and_chunks(base_path)

    if not chapters_data:
        print("No chapters found in the specified path")
        return {}

    processed_chunks = []

    for chapter_no in sorted(chapters_data):
        chapter_info = chapters_data[chapter_no]
        chunk_files = chapter_info["chunk_files"]

        # The summary is only ever embedded into chunk records, so a chapter
        # without chunks does not need the (slow, rate-limited) LLM call.
        if not chunk_files:
            print(f"Chapter {chapter_no} has no chunks; skipping summary generation")
            continue

        chapter_summary = process_chapter_summary(
            chapter_info["chapter_file"], chapter_no
        )

        # Process each chunk in this chapter
        for chunk_no, chunk_file_path in chunk_files:
            chunk_data = process_chunk(
                chunk_no, chunk_file_path, chapter_no, chapter_summary
            )

            if chunk_data:
                # Save immediately to avoid memory issues with large datasets
                save_chunk_as_json(chunk_data, output_dir)
                processed_chunks.append(
                    {"chapter_no": chapter_no, "chunk_no": chunk_no, "file_saved": True}
                )

    print("\nDocument processing completed!")
    print(
        f"Processed {len(processed_chunks)} chunks from {len(chapters_data)} chapters"
    )
    print(f"Output saved to: {output_dir}")

    return {
        "total_chapters": len(chapters_data),
        "total_chunks": len(processed_chunks),
        "output_directory": output_dir,
        "processed_chunks": processed_chunks,
    }

In [None]:
def generate_processing_summary(result: Dict, output_dir: str) -> None:
    """Dump ``result`` to ``<output_dir>/processing_summary.json``.

    The file is written as indented UTF-8 JSON with non-ASCII characters
    preserved. Any failure (for example a missing directory) is printed and
    swallowed.
    """
    summary_file = Path(output_dir) / "processing_summary.json"

    try:
        with summary_file.open("w", encoding="utf-8") as handle:
            json.dump(result, handle, indent=2, ensure_ascii=False)
        print(f"Processing summary saved to: {summary_file}")
    except Exception as exc:
        print(f"Error saving processing summary: {exc}")

In [None]:
# if __name__ == "__main__":
#     base_path = "/workspace/output/02"
#     output_directory = "/workspace/output/03/"

#     # Process all documents
#     files = [
#         f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))
#     ]

#     for i in files:
#         print(f"Processing: {i}")
#         result = process_documents(
#             os.path.join(base_path, i), os.path.join(output_directory, i)
#         )
#         generate_processing_summary(result, os.path.join(output_directory, i))

In [13]:
if __name__ == "__main__":
    # NOTE(review): input and output locations are hard-coded absolute paths
    # for a single survey edition; adjust them before running elsewhere.
    # They are kept as plain strings: output_directory ends up inside the
    # result dict that generate_processing_summary() serialises with json.dump.
    base_path = "/workspace/output/02/ES_23-24"
    output_directory = "/workspace/output/03/ES_23-24"

    # Process all documents
    # Returns a statistics dict, or {} when no chapters are discovered.
    result = process_documents(base_path, output_directory)

    # Generate processing summary
    # Writes processing_summary.json into output_directory.
    generate_processing_summary(result, output_directory)

Starting document processing...
Discovered 84 chunks across 13 chapters
Generating summary for Chapter 1...
Generated summary for Chapter 1 (2041 chars)
Processing Chapter 1, Chunk 1...
Processed chunk 1 with tags: ['GDP_Growth', 'Economic_Recovery', 'Public_Spending']
Saved: chapter_1_chunk_1.json
Processing Chapter 1, Chunk 2...
Processed chunk 2 with tags: ['Global_Economic_Growth', 'Inflation_Pressures', 'Geopolitical_Risks']
Saved: chapter_1_chunk_2.json
Processing Chapter 1, Chunk 3...
Processed chunk 3 with tags: ['GDP_Growth', 'Manufacturing', 'Investment_Demand']
Saved: chapter_1_chunk_3.json
Processing Chapter 1, Chunk 4...
Processed chunk 4 with tags: ['Fiscal_Consolidation', 'Macroeconomic_Stability', 'Economic_Growth']
Saved: chapter_1_chunk_4.json
Processing Chapter 1, Chunk 5...
Processed chunk 5 with tags: ['Inclusive_Growth', 'Poverty_Reduction', 'Social_Welfare']
Saved: chapter_1_chunk_5.json
Processing Chapter 1, Chunk 6...
Processed chunk 6 with tags: ['GDP_Growth',