In [None]:
import json
import os
import re

from docling.document_converter import DocumentConverter
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
# Shared Docling converter instance; convert_pdf_to_markdown() below uses this
# module-level object for every PDF.
converter = DocumentConverter()

# Configuration
# BASE_PATH: its immediate subfolders are processed one by one; each is expected
# to hold a chapter_structure.json plus one "<chapter>.pdf" per chapter key.
BASE_PATH = "/workspace/output/01"  # Change this to your base folder path
# OUTPUT_PATH: chunks are written to "<OUTPUT_PATH>/<folder>/chapter_<n>/" and
# the run summary to "<OUTPUT_PATH>/processing_summary.json".
OUTPUT_PATH = "/workspace/output/02"  # Output path for chunks
# Alternative output location (disabled):
# OUTPUT_PATH = "/workspace/output/"  # Output path for chunks

# Create output directory if it doesn't exist (no error when it already does)
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
def load_chapter_structure(folder_path):
    """Read ``chapter_structure.json`` from ``folder_path``.

    Args:
        folder_path: Directory expected to contain ``chapter_structure.json``.

    Returns:
        The parsed JSON content, or ``None`` (after printing a notice) when
        the file is not present.
    """
    structure_file = os.path.join(folder_path, "chapter_structure.json")

    # Guard clause: nothing to load when the structure file is missing.
    if not os.path.exists(structure_file):
        print(f"Chapter structure file not found in {folder_path}")
        return None

    with open(structure_file, "r", encoding="utf-8") as handle:
        return json.load(handle)


def convert_pdf_to_markdown(pdf_path):
    """Render the PDF at ``pdf_path`` as a markdown string.

    The module-level ``converter`` performs the conversion. Any exception
    raised while converting or exporting is caught and reported via
    ``print``.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The exported markdown, or ``None`` when conversion failed.
    """
    try:
        document = converter.convert(pdf_path).document
        markdown = document.export_to_markdown()
    except Exception as err:
        print(f"Error converting {pdf_path}: {err!s}")
        return None
    return markdown


def split_content_by_headings(content, headings):
    """Cut ``content`` into one chunk per matched ``## <heading>`` line.

    Each heading is searched case-insensitively as a whole line of the form
    ``## <heading>``; only its first occurrence is used and headings that are
    not found are left out. A chunk runs from the start of its heading line
    to the start of the next matched heading (or to the end of ``content``)
    and is stripped of surrounding whitespace. Text before the first matched
    heading is not part of any chunk.

    Args:
        content: Markdown text to split.
        headings: Iterable of heading titles (without the ``##`` prefix).

    Returns:
        dict mapping heading title to chunk text, ordered by the position of
        the heading in ``content``.
    """
    flags = re.IGNORECASE | re.MULTILINE

    # Locate the first "## <title>" line for every requested heading.
    located = []
    for title in headings:
        hit = re.search(r"^##\s*" + re.escape(title) + r"\s*$", content, flags)
        if hit is not None:
            located.append((title, hit.start()))

    # Document order decides where each chunk ends.
    located.sort(key=lambda pair: pair[1])

    # Every chunk ends where the next one begins; the last runs to the end.
    boundaries = [offset for _, offset in located[1:]] + [len(content)]

    return {
        title: content[begin:finish].strip()
        for (title, begin), finish in zip(located, boundaries)
    }


def save_chunks(
    chunks, output_folder, chapter_num, full_chapter_content=None, generate_summary=True
):
    """Write every chunk to its own markdown file and record basic statistics.

    Files go to ``<output_folder>/chapter_<chapter_num>/`` (created on
    demand). Each chunk file starts with a ``# <heading>`` line followed by
    the chunk text; a ``chunk_info.json`` with the statistics is written next
    to them.

    Args:
        chunks: Mapping of heading title to chunk text.
        output_folder: Parent directory for the chapter folder.
        chapter_num: Chapter identifier used in the folder name.
        full_chapter_content: Accepted but not used by this function.
        generate_summary: Accepted but not used by this function; no summary
            is produced here.

    Returns:
        dict mapping heading title to ``{"filename", "word_count",
        "char_count"}`` for the saved chunk.
    """
    chapter_dir = os.path.join(output_folder, f"chapter_{chapter_num}")
    os.makedirs(chapter_dir, exist_ok=True)

    stats = {}
    for position, (title, body) in enumerate(chunks.items(), start=1):
        # Build a filesystem-friendly slug: drop punctuation, then collapse
        # runs of whitespace/hyphens into single underscores.
        slug = re.sub(r"[-\s]+", "_", re.sub(r"[^\w\s-]", "", title))
        file_name = f"chunk_{position:02d}_{slug[:50]}.md"

        with open(os.path.join(chapter_dir, file_name), "w", encoding="utf-8") as out:
            out.writelines((f"# {title}\n\n", body))

        stats[title] = {
            "filename": file_name,
            "word_count": len(body.split()),
            "char_count": len(body),
        }

    # Persist the per-chunk statistics alongside the chunk files.
    with open(os.path.join(chapter_dir, "chunk_info.json"), "w", encoding="utf-8") as out:
        json.dump(stats, out, indent=2, ensure_ascii=False)

    return stats

In [None]:
def process_folder(folder_path):
    """Process a single folder containing PDFs and chapter structure.

    For every chapter listed in the folder's ``chapter_structure.json`` the
    matching ``<chapter>.pdf`` is converted to markdown, split at the listed
    headings and written below ``OUTPUT_PATH/<folder>/chapter_<chapter>/``:
    one file per chunk, ``chunk_info.json`` and the full chapter markdown as
    ``chapter_<chapter>.md``.

    Chapters whose PDF is missing, cannot be converted, or yields no chunks
    are reported via ``print`` and skipped. Headings that could not be found
    in the converted text are listed in a warning.

    Args:
        folder_path: Path of the folder to process. A trailing path separator
            is tolerated.

    Returns:
        dict keyed by chapter with ``total_chunks``, ``chunk_info`` and
        ``has_summary`` entries, or ``None`` when the chapter structure file
        is missing or empty.
    """
    # normpath strips a trailing separator; without it basename("dir/") is ""
    # and all output would land directly in OUTPUT_PATH.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    print(f"\n=== Processing folder: {folder_name} ===")

    # Load chapter structure
    chapter_structure = load_chapter_structure(folder_path)
    if not chapter_structure:
        return None

    results = {}
    output_folder = os.path.join(OUTPUT_PATH, folder_name)

    # Process each chapter
    for chapter_num, headings in chapter_structure.items():
        pdf_path = os.path.join(folder_path, f"{chapter_num}.pdf")

        if not os.path.exists(pdf_path):
            print(f"PDF not found: {pdf_path}")
            continue

        print(f"\nProcessing Chapter {chapter_num}...")
        print(f"Headings to extract: {len(headings)}")

        # Convert PDF to markdown
        content = convert_pdf_to_markdown(pdf_path)
        if not content:
            print(f"Failed to convert PDF: {pdf_path}")
            continue
        print(f"Total content length: {len(content)} characters")

        # Split into chunks
        chunks = split_content_by_headings(content, headings)
        print(f"Successfully extracted {len(chunks)} chunks")

        if len(chunks) == 0:
            print(
                f"No chunks found for Chapter {chapter_num}. Check if headings match the PDF content."
            )
            continue

        # Name the headings that did not match so a partial extraction
        # (e.g. 5 of 6) can be diagnosed instead of passing silently.
        missing_headings = [heading for heading in headings if heading not in chunks]
        if missing_headings:
            print(
                f"Warning: {len(missing_headings)} heading(s) not found in Chapter {chapter_num}: {missing_headings}"
            )

        # Save chunks; save_chunks creates the chapter directory
        chunk_info = save_chunks(chunks, output_folder, chapter_num, content)

        # Save the full chapter markdown next to the chunk files
        markdown_folder = os.path.join(output_folder, f"chapter_{chapter_num}")
        markdown_file = os.path.join(markdown_folder, f"chapter_{chapter_num}.md")
        with open(markdown_file, "w", encoding="utf-8") as f:
            f.write(content)

        # Track results. A summary counts only when a "chapter_summary" entry
        # exists and carries no "error" key.
        has_summary = "chapter_summary" in chunk_info and "error" not in chunk_info.get(
            "chapter_summary", {}
        )
        results[chapter_num] = {
            "total_chunks": len(chunks),
            "chunk_info": chunk_info,
            "has_summary": has_summary,
        }

        # Print chunk statistics
        if has_summary:
            summary_word_count = chunk_info["chapter_summary"].get("word_count", 0)
            print(f"    Chapter summary: {summary_word_count} words")

        for heading, info in chunk_info.items():
            if heading != "chapter_summary":
                print(
                    f"  - {heading[:60]}{'...' if len(heading) > 60 else ''}: {info['word_count']} words"
                )

    return results


def process_all_folders(base_path):
    """Process every immediate subfolder of ``base_path``.

    Each subfolder is handed to ``process_folder``; an exception raised for
    one folder is reported via ``print`` and does not stop the remaining
    folders. Folders are visited in sorted name order so repeated runs
    behave the same way.

    Args:
        base_path: Directory whose subfolders should be processed.

    Returns:
        dict mapping folder name to the result of ``process_folder`` for
        every folder that produced a non-empty result. Empty when
        ``base_path`` is missing, is not a directory, or has no subfolders.
    """
    if not os.path.exists(base_path):
        print(f"Base path does not exist: {base_path}")
        return {}

    # A regular file passes the exists() check but would make os.listdir raise.
    if not os.path.isdir(base_path):
        print(f"Base path is not a directory: {base_path}")
        return {}

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    folders = sorted(
        item
        for item in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, item))
    )

    if not folders:
        print(f"No folders found in: {base_path}")
        return {}

    print(f"Found {len(folders)} folders to process: {folders}")

    all_results = {}
    for item in folders:
        folder_path = os.path.join(base_path, item)
        try:
            results = process_folder(folder_path)
            if results:
                all_results[item] = results
        except Exception as e:
            # Best effort: one broken folder must not abort the whole run.
            print(f"Error processing folder {item}: {str(e)}")

    return all_results

In [None]:
# # Example: Process a single folder
# single_folder_name = "ES_24-25"  # Change this to your folder name
# single_folder_path = os.path.join(BASE_PATH, single_folder_name)

# if os.path.exists(single_folder_path):
#     print(f"Processing single folder: {single_folder_name}")
#     results = process_folder(single_folder_path)
#     if results:
#         print("\n=== Processing Complete ===")
#         print(f"Results saved to: {OUTPUT_PATH}")
#         print(f"Processed {len(results)} chapters")
#     else:
#         print("No results generated - check your folder structure and files")
# else:
#     print(f"Folder not found: {single_folder_path}")
#     print(f"Available folders in {BASE_PATH}:")
#     if os.path.exists(BASE_PATH):
#         available_folders = [
#             f
#             for f in os.listdir(BASE_PATH)
#             if os.path.isdir(os.path.join(BASE_PATH, f))
#         ]
#         for folder in available_folders:
#             print(f"  - {folder}")
#     else:
#         print(f"Base path does not exist: {BASE_PATH}")

In [None]:
# Process all folders in the base path
# process_all_folders returns {folder_name: {chapter: {...}}}; the dict is
# empty when the base path is missing, has no subfolders, or no folder
# produced a result.
print("Processing all folders...")
all_results = process_all_folders(BASE_PATH)

if all_results:
    # Save overall summary as JSON at the top level of OUTPUT_PATH
    summary_path = os.path.join(OUTPUT_PATH, "processing_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII heading text readable in the file
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print("\n=== All Processing Complete ===")
    print(f"Summary saved to: {summary_path}")
    print(f"Processed {len(all_results)} folders")
else:
    print("No folders were successfully processed")

Processing all folders...
Found 3 folders to process: ['ES_22-23', 'ES_23-24', 'ES_24-25']

=== Processing folder: ES_22-23 ===

Processing Chapter 1...
Headings to extract: 6
Total content length: 61913 characters
Successfully extracted 6 chunks
  - State of the Economy 2022-23: Recovery Complete: 1143 words
  - The Global Economy Battles Through a Unique Set of Challenge...: 2226 words
  - Macroeconomic and Growth Challenges in the Indian Economy: 749 words
  - India's Economic Resilience and Growth Drivers: 1828 words
  - India's Inclusive Growth: 868 words
  - Outlook: 2023-24: 735 words

Processing Chapter 2...
Headings to extract: 6
Total content length: 57778 characters
Successfully extracted 5 chunks
  - India's Medium-term Growth Outlook: With Optimism and Hope: 330 words
  - Introduction: 276 words
  - Product and Capital Market Reforms: 4807 words
  - Returns to the Economic and Structural Reforms after 2014: 1075 words
  - Growth Magnets in this Decade (2023-2030): 1194 wor