In [None]:
import json
import re
from bs4 import BeautifulSoup
from collections import Counter
from tqdm.notebook import tqdm

In [None]:
from pathlib import Path
# Input locations used by the cells below: report metadata (*.json) is globbed
# from report_folder, rendered report HTML is read from file_folder.
report_folder = Path('reports', 'reports')
file_folder = Path('reports', 'files')

In [None]:
from bs4 import BeautifulSoup

def convert_to_markdown(element):
    """
    Render a BeautifulSoup node as an inline Markdown string.

    Bold (``strong``/``b``), italic (``em``/``i``) and link (``a``) children
    are converted recursively; ``span`` children are unwrapped; any other
    child tag contributes only its whitespace-stripped text.

    Args:
        element: A text node (any ``str`` instance) or an object exposing a
            ``contents`` list of child nodes.

    Returns:
        str: The Markdown rendering. A plain string is returned unchanged and
        an object without ``contents`` yields ``''``.
    """
    # Text nodes (NavigableString is a str subclass) pass through untouched.
    if isinstance(element, str):
        return element

    # Nothing to walk if the object carries no child list.
    if not hasattr(element, 'contents'):
        return ''

    pieces = []
    for node in element.contents:
        if isinstance(node, str):
            pieces.append(node)
            continue

        tag = node.name
        if tag in ('strong', 'b'):
            pieces.append(f"**{convert_to_markdown(node)}**")
        elif tag in ('em', 'i'):
            pieces.append(f"*{convert_to_markdown(node)}*")
        elif tag == 'a':
            label = convert_to_markdown(node).strip()
            target = node.get('href', '')
            pieces.append(f"[{label}]({target})")
        elif tag == 'span':
            # Spans usually only carry styling, so keep just their content.
            pieces.append(convert_to_markdown(node))
        else:
            # Unrecognised tags are flattened to their text.
            pieces.append(node.get_text(strip=True))
    return ''.join(pieces)

def parse_html_content(html_string):
    """
    Parse an HTML string into an ordered list of structural elements.

    Headings, paragraphs, text-only divs, tables and lists (ul, ol) are
    collected in document order. Paragraphs and list items are rendered with
    convert_to_markdown; tables become pipe tables whose first row is treated
    as the header.

    Args:
        html_string (str): The HTML content to parse.

    Returns:
        list: Dictionaries with a 'type' and a 'content' key. 'type' is one of
              'heading_<n>', 'paragraph_markdown', 'div_text',
              'table_markdown' or 'list_markdown'. Empty paragraphs are
              dropped; other elements are emitted even when their content is
              empty.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    block_tags = ['p', 'table', 'ul', 'ol']

    soup = BeautifulSoup(html_string, 'html.parser')
    structured_content = []

    for element in soup.find_all(heading_tags + ['p', 'div', 'table', 'ul', 'ol']):
        # Anything nested inside an already-handled block is rendered as part
        # of that block, so skip it here to avoid duplication.
        if element.find_parent(heading_tags + block_tags + ['li']):
            continue

        name = element.name
        if name in heading_tags:
            structured_content.append({
                'type': f'heading_{name[1]}',
                'content': element.get_text(strip=True)
            })
        elif name == 'p':
            markdown_content = convert_to_markdown(element).strip()
            # Only add non-empty paragraphs
            if markdown_content:
                structured_content.append({
                    'type': 'paragraph_markdown',
                    'content': markdown_content
                })
        elif name == 'div':
            # A div is emitted as plain text only when it holds text and none
            # of the structural tags handled above. All six heading levels are
            # checked: previously h4-h6 were missing, so a div wrapping such a
            # heading was emitted as div_text AND again as the heading.
            # NOTE(review): a div nested in another text-only div is excluded
            # by neither filter, so its text can still appear twice.
            if element.get_text(strip=True) and not element.find(heading_tags + block_tags):
                structured_content.append({
                    'type': 'div_text',
                    'content': element.get_text(strip=True)
                })
        elif name == 'table':
            table_lines = []
            for i, row in enumerate(element.find_all('tr')):
                cells = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
                table_lines.append("| " + " | ".join(cells) + " |\n")
                # Markdown needs a separator after the header (first) row.
                if i == 0:
                    table_lines.append("| " + " | ".join(['---'] * len(cells)) + " |\n")

            structured_content.append({
                'type': 'table_markdown',
                'content': "".join(table_lines)
            })
        elif name in ('ul', 'ol'):
            ordered = name == 'ol'
            list_lines = []
            # recursive=False: only direct items; nested lists are rendered
            # through convert_to_markdown on the item itself.
            for i, item in enumerate(element.find_all('li', recursive=False), 1):
                marker = f"{i}." if ordered else "-"
                item_content = convert_to_markdown(item).strip()
                list_lines.append(f"{marker} {item_content}\n")

            structured_content.append({
                'type': 'list_markdown',
                'content': "".join(list_lines)
            })

    return structured_content

def main():
    """
    Run parse_html_content on a small built-in HTML sample and print the
    resulting structure as indented JSON.
    """
    import json

    # Sample markup exercising bold, italic, links and a bullet list.
    sample_html = """
    <div>
        <h2>Example with Formatting</h2>
        <p>This paragraph has <strong>bold text</strong> and <em>italic text</em>.</p>
        <p>It also has a <a href="https://example.com">link with <i>italic text inside</i></a>.</p>
        <p>Here is another paragraph with <b>bold</b> and <i>italics</i>.</p>
        <ul>
            <li>List item with <strong>bold</strong></li>
            <li>Another list item with <em>italics</em></li>
        </ul>
    </div>
    """

    elements = parse_html_content(sample_html)
    rendered = json.dumps(elements, indent=2)
    print(rendered)


if __name__ == '__main__':
    main()


In [None]:
# Candidate sample files, kept as bare string expressions (no runtime effect).
# Good
'2025-03-18_RL31572_6c7a262e8ca1313647f6c2e2139deb25c85d49ab.html'
"2025-04-01_R48478_6a4277ac5f781f3f0363a6f76e64089ee5fc0041.html"
'2025-05-02_IF10349_8374304583e98220e31d7b89c3913fc5fdc93bef.html'

# Odd
"2024-11-05_IN12456_e089abfacd467aa6fdad2e1c4a49fab3273496ca.html"

# simple
'2024-11-18_LSB11249_4b1d7f66b0477bf585dbf0b394badaf425629f51.html'

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(file_folder / "2024-11-05_IN12456_e089abfacd467aa6fdad2e1c4a49fab3273496ca.html", 'r', encoding='utf-8') as file:
    html_doc = file.read()
parsed_data = parse_html_content(html_doc)

# Print the structured data in a readable format
import json
print(json.dumps(parsed_data, indent=2))
# Drop elements with empty content before persisting.
parsed_data = [x for x in parsed_data if x['content']]
# test.json is read back as UTF-8 by the chunking cells, so write it as UTF-8.
with open('test.json', 'w', encoding='utf-8') as file:
    json.dump(parsed_data, file)

In [None]:
import json

def count_words_in_chunk(chunk):
    """
    Return the number of whitespace-separated words across a chunk.

    Args:
        chunk (list): Content items (dictionaries). An item without a
            'content' key counts as zero words.

    Returns:
        int: Sum of the word counts of every item's 'content' string.
    """
    return sum(len(item.get('content', '').split()) for item in chunk)

# --- Original Hierarchical Chunker Logic (from the first script) ---
# This part is included to make the example runnable. In a real pipeline,
# you would import this function or run the scripts sequentially.

def hierarchical_chunker_recursive(current_chunk, level, max_words, buffer):
    """
    Split a run of passages at headings of the given level, descending one
    level for any section that is still too large.

    Args:
        current_chunk (list): Passage dictionaries with 'type' and 'content'.
        level (int): Heading level to split on (matches type 'heading_<level>').
        max_words (int): Base word budget for a section.
        buffer (int): Extra words tolerated on top of max_words.

    Returns:
        list: A list of chunks (each a list of passages). When no heading of
        this level is present the input is returned as a single chunk, even
        if it exceeds the budget.
    """
    heading_tag = f'heading_{level}'

    if not any(item['type'] == heading_tag for item in current_chunk):
        return [current_chunk]

    # Each heading of this level opens a new section. Passages that precede
    # the first heading form their own leading section.
    sections = []
    pending = []
    for item in current_chunk:
        if item['type'] == heading_tag and pending:
            sections.append(pending)
            pending = []
        pending.append(item)
    if pending:
        sections.append(pending)

    limit = max_words + buffer
    result = []
    for section in sections:
        if count_words_in_chunk(section) <= limit:
            result.append(section)
        else:
            # Too large: try to split this section on the next heading level.
            result.extend(
                hierarchical_chunker_recursive(section, level + 1, max_words, buffer)
            )
    return result

def chunk_document(data, max_words, buffer, target_level=1):
    """
    Entry point for hierarchical chunking.

    Args:
        data (list): Passage dictionaries for the whole document.
        max_words (int): Base word budget per chunk.
        buffer (int): Extra words tolerated on top of max_words.
        target_level (int): Heading level at which splitting starts.

    Returns:
        list: The chunks produced by hierarchical_chunker_recursive.
    """
    # target_level used to be ignored in favour of a hard-coded level 3.
    # Values below 1 are not heading levels (skip_heading_1 returns 0 / -1 as
    # sentinels); for those, keep the previous fixed starting level.
    start_level = target_level if target_level >= 1 else 3
    return hierarchical_chunker_recursive(data, start_level, max_words, buffer)


# --- Chunk Merging Logic ---

def merge_chunks(chunks, target_word_count):
    """
    Greedily combine adjacent chunks without exceeding a word target.

    Walks the chunks in order, appending each one to the running merged chunk
    while the combined word count stays at or below target_word_count;
    otherwise the running chunk is closed and a new one is started.

    Args:
        chunks (list): A list of chunks, each a list of content items.
        target_word_count (int): Upper bound for a merged chunk's word count.

    Returns:
        list: A new list of merged chunks (input chunk lists are not mutated).
    """
    if not chunks:
        return []

    merged = []
    head, *tail = chunks
    accumulator = list(head)
    running_total = count_words_in_chunk(accumulator)

    for candidate in tail:
        size = count_words_in_chunk(candidate)
        if running_total + size > target_word_count:
            # Would overflow: close the running chunk, start again here.
            merged.append(accumulator)
            accumulator = list(candidate)
            running_total = size
            continue
        accumulator.extend(candidate)
        running_total += size

    # Flush whatever is still being accumulated.
    if accumulator:
        merged.append(accumulator)

    return merged


# --- Citation Logic ---

def add_citations(chunks, base_citation):
    """
    Adds a unique citation to each passage within each chunk.

    Every passage is copied and given a 'citation' field of the form
    "<base_citation>_<chunk_id>__<passage_id>", where both IDs are 1-based.

    Args:
        chunks (list): A list of chunks (each a list of passage dicts).
        base_citation (str): The base string to use for citations.

    Returns:
        list: New chunks whose passages carry a 'citation' field; the input
        passages are left unmodified.
    """
    cited_chunks = []
    # Enumerate from 1 to get human-readable chunk IDs
    for chunk_id, chunk in enumerate(chunks, 1):
        new_chunk = []
        # Passage IDs are 1-based as well (this previously started at 0,
        # contradicting the comment and the later definition in this file).
        for passage_id, passage in enumerate(chunk, 1):
            # Copy so existing fields (e.g. 'doc_index') are preserved and
            # the caller's data is not modified in place.
            new_passage = passage.copy()
            new_passage['citation'] = f"{base_citation}_{chunk_id}__{passage_id}"
            new_chunk.append(new_passage)
        cited_chunks.append(new_chunk)
    return cited_chunks

def find_highest_heading(passages, min_necessary=1):
    """
    Return the smallest heading level that occurs more than min_necessary times.

    Args:
        passages (list): Passage dictionaries with a 'type' key.
        min_necessary (int): A heading type must occur strictly more often
            than this to be considered.

    Returns:
        int: The lowest qualifying level number (from 'heading_<n>').

    Raises:
        ValueError: If no heading type qualifies (min() of an empty list).
    """
    type_counts = Counter(p['type'] for p in passages)
    levels = [
        int(type_name.split('_')[1])
        for type_name, occurrences in type_counts.items()
        if occurrences > min_necessary and 'heading' in type_name
    ]
    return min(levels)

def skip_heading_1(passages, min_necessary=1):
    """
    Pick the heading level at which hierarchical chunking should start.

    Args:
        passages (list): Passage dictionaries with a 'type' key.
        min_necessary (int): heading_1 must occur strictly more often than
            this to be used as the starting level.

    Returns:
        int:
            0  - the document contains no headings at all;
            -1 - headings exist but none of them is 'heading_1';
            1  - 'heading_1' occurs more than min_necessary times, or it is
                 the only heading level present;
            n  - otherwise, the smallest heading level other than 1.
    """
    heading_counts = Counter(
        p['type'] for p in passages if 'heading' in p['type']
    )
    if not heading_counts:
        return 0

    h1_count = heading_counts.get('heading_1', 0)
    if not h1_count:
        return -1
    if h1_count > min_necessary:
        return 1

    # heading_1 is too rare to split on, so skip past it. It must be left out
    # of the minimum: including it made this branch always return 1.
    deeper_levels = [
        int(heading_type.split('_')[1])
        for heading_type in heading_counts
        if heading_type != 'heading_1'
    ]
    return min(deeper_levels, default=1)

# Demo driver for the hierarchical pipeline: load test.json, tag passages with
# their original position, split on headings, merge small chunks, add
# citations, then print a per-chunk summary. The names assigned here are
# module-level, so later notebook cells (e.g. final_cited_chunks) read them.
if __name__ == '__main__':
    # --- Configuration ---
    JSON_FILE_PATH = 'test.json'
    BASE_CITATION = "RL31572"
    # Config for the initial splitting phase
    INITIAL_MAX_WORDS = 400
    INITIAL_BUFFER = 400
    MINIMUM_NECESSARY_HEADINGS = 1
    # Config for the merging phase
    TARGET_MERGE_WORD_COUNT = 1000

    try:
        # Load the document from the JSON file
        with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            document_data_raw = json.load(f)

        # Add original document index to each passage at the beginning
        # (the loaded dicts are modified in place; document_data holds the
        # same objects as document_data_raw).
        document_data = []
        for i, passage in enumerate(document_data_raw):
            passage['doc_index'] = i
            document_data.append(passage)

        # --- Step 1: Perform the initial hierarchical chunking ---
        highest_heading = skip_heading_1(document_data, min_necessary=MINIMUM_NECESSARY_HEADINGS)
        initial_chunks = chunk_document(document_data, INITIAL_MAX_WORDS, INITIAL_BUFFER, target_level=highest_heading)
        print(f"Step 1: Document initially split into {len(initial_chunks)} chunks.\n")

        # --- Step 2: Perform the merging post-processing ---
        final_merged_chunks = merge_chunks(initial_chunks, TARGET_MERGE_WORD_COUNT)
        print("---" * 15)
        print(f"\nStep 2: Merged into {len(final_merged_chunks)} final chunks.")
        print(f"Target Merge Size: {TARGET_MERGE_WORD_COUNT} words\n")

        # --- Step 3: Add citations to the final chunks ---
        final_cited_chunks = add_citations(final_merged_chunks, BASE_CITATION)
        print("---" * 15)
        print(f"\nStep 3: Added citations to {len(final_cited_chunks)} chunks.")
        print(f"Base Citation: '{BASE_CITATION}'\n")

        # --- Output and Verification ---
        print("---" * 15)
        print("\nFinal Output Verification:\n")
        total_word_count = 0
        for i, chunk in enumerate(final_cited_chunks):
            word_count = count_words_in_chunk(chunk)
            total_word_count += word_count

            # chunk[0] assumes every chunk holds at least one passage.
            first_item_type = chunk[0].get('type', 'N/A')
            first_item_content = chunk[0].get('content', '').replace('\n', ' ')[:70]
            first_item_citation = chunk[0].get('citation', 'N/A')
            first_item_doc_index = chunk[0].get('doc_index', -1)

            print(f"--- Final Chunk {i+1} ---")
            print(f"  Word Count: {word_count}")
            print(f"  Items: {len(chunk)}")
            print(f"  Starts with '{first_item_type}': \"{first_item_content}...\"")
            print(f"  First Citation: {first_item_citation}")
            print(f"  First Doc Index: {first_item_doc_index}")
            print()

        # Sanity check: chunking should neither lose nor duplicate words.
        print(f"Total word count across all final chunks: {total_word_count}")
        print(f"Total word count in original document: {count_words_in_chunk(document_data)}")

    except FileNotFoundError:
        print(f"Error: The file '{JSON_FILE_PATH}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from the file '{JSON_FILE_PATH}'.")


In [None]:
# Display the first cited chunk (a list of passage dicts) for inspection.
final_cited_chunks[0]

In [None]:
def format_chunks(chunks):
    """
    Render the passages of one chunk as Markdown text.

    Heading passages become '#'-prefixed lines whose depth is taken from the
    'heading_<n>' type; every other passage is wrapped between
    "[<citation>]" and "[/<citation>]" marker lines.

    Args:
        chunks (list): Passage dictionaries with 'type' and 'content'; each
            non-heading passage must also carry 'citation'.

    Returns:
        str: The rendered passages joined by newlines.
    """
    rendered = []
    for passage in chunks:
        kind = passage['type']
        if 'heading' in kind:
            depth = int(kind.split('_')[1])
            rendered.append('#' * depth + ' ' + passage['content'] + '\n')
            continue
        opening = f"[{passage['citation']}]\n"
        closing = f"\n[/{passage['citation']}]\n"
        rendered.append(opening + passage['content'] + closing)
    return '\n'.join(rendered)

In [None]:
# Preview the Markdown rendering of the first cited chunk.
print(format_chunks(final_cited_chunks[0]))

In [None]:
import json

def count_words_in_chunk(chunk):
    """
    Count the words contained in a chunk.

    Words are the whitespace-separated tokens of each item's 'content'
    string; an item with no 'content' key contributes nothing.

    Args:
        chunk (list): A list of content items (dictionaries).

    Returns:
        int: The total word count in the chunk.
    """
    per_item_counts = [len(entry.get('content', '').split()) for entry in chunk]
    return sum(per_item_counts)

def simple_chunker(data, target_word_count):
    """
    Group passages into chunks purely by word count, ignoring headings.

    Passages are appended to the current chunk until the next one would push
    it past target_word_count; that passage then opens a new chunk. A single
    passage larger than the target still gets a chunk of its own.

    Args:
        data (list): The list of passages from the document.
        target_word_count (int): The approximate word count for each chunk.

    Returns:
        list: A list of chunks (each a list of passages).
    """
    if not data:
        return []

    result = []
    bucket, bucket_words = [], 0

    for passage in data:
        n_words = count_words_in_chunk([passage])
        overflow = bucket_words + n_words > target_word_count

        # Never close an empty bucket: an oversized passage stays on its own.
        if bucket and overflow:
            result.append(bucket)
            bucket, bucket_words = [], 0

        bucket.append(passage)
        bucket_words += n_words

    if bucket:
        result.append(bucket)

    return result

def add_citations(chunks, base_citation):
    """
    Return copies of the chunks whose passages carry a 'citation' field.

    The citation has the form "<base_citation>_<chunk_id>__<passage_id>",
    with both IDs counted from 1. All existing passage fields (including
    'doc_index') are kept, and the input passages are not modified.

    Args:
        chunks (list): A list of chunks.
        base_citation (str): The base string to use for citations.

    Returns:
        list: The list of chunks with citations added to each passage.
    """
    def _with_citation(passage, label):
        # Shallow copy so the caller's passage dict stays untouched.
        tagged = passage.copy()
        tagged['citation'] = label
        return tagged

    return [
        [
            _with_citation(passage, f"{base_citation}_{chunk_no}__{passage_no}")
            for passage_no, passage in enumerate(chunk, 1)
        ]
        for chunk_no, chunk in enumerate(chunks, 1)
    ]


# Demo driver for the non-hierarchical pipeline: load test.json, tag passages
# with their original position, chunk greedily by word count, add citations,
# then print a per-chunk summary. The names assigned here are module-level,
# so later notebook cells (e.g. final_cited_chunks) read them.
if __name__ == '__main__':
    # --- Configuration ---
    JSON_FILE_PATH = 'test.json'
    BASE_CITATION = "LSB11249" # Base citation for this specific document
    TARGET_CHUNK_WORDS = 637

    try:
        # Load the document from the JSON file
        with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            document_data_raw = json.load(f)

        # --- Step 1: Add original document index to each passage ---
        # (the loaded dicts are modified in place)
        document_data = []
        for i, passage in enumerate(document_data_raw):
            passage['doc_index'] = i
            document_data.append(passage)
        print(f"Step 1: Added original document index to {len(document_data)} passages.\n")

        # --- Step 2: Perform simple chunking based on word count ---
        initial_chunks = simple_chunker(document_data, TARGET_CHUNK_WORDS)
        print("---" * 15)
        print(f"\nStep 2: Document split into {len(initial_chunks)} chunks.")
        print(f"Target Chunk Size: {TARGET_CHUNK_WORDS} words\n")

        # --- Step 3: Add citations to the final chunks ---
        final_cited_chunks = add_citations(initial_chunks, BASE_CITATION)
        print("---" * 15)
        print(f"\nStep 3: Added citations to {len(final_cited_chunks)} chunks.")
        print(f"Base Citation: '{BASE_CITATION}'\n")

        # --- Output and Verification ---
        print("---" * 15)
        print("\nFinal Output Verification:\n")
        total_word_count = 0
        for i, chunk in enumerate(final_cited_chunks):
            word_count = count_words_in_chunk(chunk)
            total_word_count += word_count

            # chunk[0] assumes every chunk holds at least one passage.
            first_item_type = chunk[0].get('type', 'N/A')
            first_item_content = chunk[0].get('content', '').replace('\n', ' ')[:70]
            first_item_citation = chunk[0].get('citation', 'N/A')
            first_item_doc_index = chunk[0].get('doc_index', -1)

            print(f"--- Final Chunk {i+1} ---")
            print(f"  Word Count: {word_count}")
            print(f"  Items: {len(chunk)}")
            print(f"  Starts with '{first_item_type}': \"{first_item_content}...\"")
            print(f"  First Citation: {first_item_citation}")
            print(f"  First Doc Index: {first_item_doc_index}")
            print()

        # Sanity check: chunking should neither lose nor duplicate words.
        print(f"Total word count across all final chunks: {total_word_count}")
        print(f"Total word count in original document: {count_words_in_chunk(document_data)}")

    except FileNotFoundError:
        print(f"Error: The file '{JSON_FILE_PATH}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from the file '{JSON_FILE_PATH}'.")


In [None]:
# Display the second cited chunk for inspection (requires at least two chunks).
final_cited_chunks[1]

In [None]:
from pathlib import Path

In [None]:
# Output locations for the batch run: parsed reports and error records.
save_folder = Path('reports', 'parsed')
errors_folder = Path('reports', 'errors')

# Create both directories (and any missing parents); existing ones are fine.
save_folder.mkdir(exist_ok=True, parents=True)
errors_folder.mkdir(exist_ok=True, parents=True)

In [None]:
# Collect the report metadata files. Path.glob yields entries in an
# unspecified order, while later cells index into this list (json_files[0])
# and slice it to resume a run, so sort it to make the order deterministic.
json_files = sorted(Path('reports/reports').glob('*.json'))

In [None]:
# Load the first metadata file as a sample for the cells below.
with json_files[0].open('r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [None]:
def parse_report_metadata(metadata):
    """
    Flatten a report's metadata record into a single-level dictionary.

    Copies the report-level fields, then adds the fields of the first entry
    in metadata['versions'] and the filename of that version's first HTML
    format.

    Args:
        metadata (dict): Raw report metadata with the report-level keys below,
            plus a non-empty 'versions' list.

    Returns:
        dict: The flattened metadata.

    Raises:
        KeyError: If an expected key is missing.
        IndexError: If 'versions' is empty or the version has no HTML format.
    """
    report_keys = ('id', 'type', 'typeId', 'number', 'active', 'source', 'topics')
    record = {key: metadata[key] for key in report_keys}

    first_version = metadata['versions'][0]
    record.update(
        version_id=first_version['id'],
        date=first_version['date'],
        retrieved_date=first_version['retrieved'],
        title=first_version['title'],
        summary=first_version['summary'],
    )

    html_formats = [fmt for fmt in first_version['formats'] if fmt['format'] == 'HTML']
    record['source_file'] = html_formats[0]['filename']
    return record

In [None]:
# Preview the flattened metadata of the sample report loaded above.
parse_report_metadata(data)

In [None]:
def load_file(json_file):
    """
    Load a report's metadata JSON and the HTML document it points to.

    Args:
        json_file (pathlib.Path): Path to the report metadata file.

    Returns:
        tuple: (True, html_text, metadata_dict) on success, or
        (False, json_file.name, '') when the first version lists no HTML
        format (or 'versions' is empty).

    Raises:
        FileNotFoundError: If the metadata or the referenced HTML file is missing.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    try:
        filename = [x for x in data['versions'][0]['formats'] if x['format'] == 'HTML'][0]['filename']
    except IndexError:
        return False, json_file.name, ''

    # Explicit encoding, matching the JSON read above, so decoding does not
    # depend on the platform's default locale encoding.
    with open(f'reports/{filename}', 'r', encoding='utf-8') as f:
        doc = f.read()
    return True, doc, data

In [None]:
# Settings for the batch run below.
# BASE_CITATION = "RL31572"
# Config for the initial splitting phase
# (passed to chunk_document as max_words and buffer; a section is split
# further once it exceeds their sum)
INITIAL_MAX_WORDS = 800
INITIAL_BUFFER = 1300
# Passed to skip_heading_1 as min_necessary.
MINIMUM_NECESSARY_HEADINGS = 1
# Config for the merging phase
TARGET_MERGE_WORD_COUNT = 1500
# Word target for simple_chunker, used when hierarchical chunking is not chosen.
TARGET_CHUNK_WORDS = 1500

In [None]:
# Accumulators for the batch run; re-running this cell resets them.
errored_files = []
re_parse_files = []
parsed_data = []
total_chunks = total_parsed_chunks = 0

In [None]:
# Resume point: every file handled so far ended up in either errored_files or
# parsed_data, so their combined length is the number of files to skip.
# (This previously added the number of keys in a metadata dict instead, which
# silently skipped the first files of a fresh run.)
start_point = len(errored_files) + len(parsed_data)
for json_file in tqdm(json_files[start_point:]):
    did_load, html_document, metadata = load_file(json_file)
    if not did_load:
        # load_file returns no JSON payload on failure, so re-read the
        # metadata file to store it with the error record.
        with open(json_file, 'r', encoding='utf-8') as f:
            _error_data = json.load(f)
        errored_files.append({'filename': json_file.name, 'json': _error_data, 'error': 'error_loading'})
        continue

    if len(html_document.strip()) == 0:
        print('no html')
        # metadata is still the raw JSON returned by load_file at this point.
        errored_files.append({'filename': json_file.name, 'json': metadata, 'error': 'no_html'})
        continue

    metadata = parse_report_metadata(metadata)
    document_data_raw = parse_html_content(html_document)
    # Tag each passage with its position in the original document.
    document_data = []
    for i, passage in enumerate(document_data_raw):
        passage['doc_index'] = i
        document_data.append(passage)
    if len(document_data) == 0:
        errored_files.append({'filename': json_file.name, 'json': metadata, 'error': 'no_parsed_data'})
        continue

    # --- Step 1: Perform the initial chunking ---
    # Hierarchical chunking when skip_heading_1 reports a usable heading
    # level (> 0); otherwise fall back to plain word-count chunking.
    highest_heading = skip_heading_1(document_data, min_necessary=MINIMUM_NECESSARY_HEADINGS)
    if highest_heading > 0:
        initial_chunks = chunk_document(document_data, INITIAL_MAX_WORDS, INITIAL_BUFFER, target_level=highest_heading)
    else:
        initial_chunks = simple_chunker(document_data, TARGET_CHUNK_WORDS)
    total_parsed_chunks += len(initial_chunks)

    # --- Step 2: Perform the merging post-processing ---
    final_merged_chunks = merge_chunks(initial_chunks, TARGET_MERGE_WORD_COUNT)
    total_chunks += len(final_merged_chunks)

    # --- Step 3: Add citations to the final chunks ---
    # The report id serves as the citation base.
    final_cited_chunks = add_citations(final_merged_chunks, metadata['id'])
    metadata['chunks'] = final_cited_chunks
    metadata['initial_chunks'] = initial_chunks
    parsed_data.append(metadata)



In [None]:
# Run summary: parsed reports, errored files, merged chunks, pre-merge chunks.
len(parsed_data), len(errored_files), total_chunks, total_parsed_chunks

In [None]:
from datetime import datetime
# Timestamp (to the minute) used to name this run's output files.
run_time = f"{datetime.now():%Y%m%d_%H%M}"

# Persist the parsed reports and the error records as separate JSON files.
with open(save_folder / f'{run_time}_parsed.json', 'w') as f:
    json.dump(parsed_data, f)
with open(errors_folder / f'{run_time}_errors.json', 'w') as f:
    json.dump(errored_files, f)

In [None]:
# Display the most recently parsed report for inspection.
parsed_data[-1]

In [None]:
import math

class HierarchicalParser:
    """
    A hierarchical parser to segment a document into roughly equivalent chunks
    while respecting the document's structure and generating citations.

    The document is a flat list of passages. Header passages (any type whose
    name contains 'header') open sections; the parser keeps a stack of the
    currently open headers and attaches a snapshot of it to every chunk.
    """

    # Matches one whitespace-delimited word. Used to split oversized passages
    # without discarding the original whitespace between words.
    _WORD_PATTERN = re.compile(r'\S+')

    def __init__(self, document, base_citation="doc", max_chunk_size=500):
        """
        Initializes the parser with the document, base citation, and chunk size.

        Args:
            document (list): A list of dictionaries, where each dictionary
                             has a 'type' (str) and 'content' (str).
            base_citation (str): The base string for generating citations.
            max_chunk_size (int): The target maximum size for each chunk in words.

        Raises:
            ValueError: If max_chunk_size is not positive. A zero size would
                        otherwise surface as a ZeroDivisionError inside parse().
        """
        if max_chunk_size <= 0:
            raise ValueError(
                f"max_chunk_size must be a positive number of words, got {max_chunk_size!r}"
            )
        self.document = document
        self.base_citation = base_citation
        self.max_chunk_size = max_chunk_size
        # Defines the hierarchy of passage types from most to least important.
        self.hierarchy = ['header_2', 'header_3', 'header_4', 'paragraph']
        self.passage_type_map = {ptype: i for i, ptype in enumerate(self.hierarchy)}

    def _get_passage_level(self, passage_type):
        """
        Gets the hierarchical level of a passage type.

        Exact matches are looked up directly. Variations such as
        'paragraph_markdown' are resolved by substring match against the known
        types. Anything unrecognised gets len(self.hierarchy), i.e. one level
        below the least important known type.
        """
        if passage_type in self.passage_type_map:
            return self.passage_type_map[passage_type]
        for key, value in self.passage_type_map.items():
            if key in passage_type:
                return value
        return len(self.hierarchy)

    def _split_text(self, text, num_chunks):
        """
        Splits a single text into roughly `num_chunks` pieces of equal word count.

        Pieces are slices of the original string cut at word boundaries, so
        whitespace *inside* a piece (e.g. the newlines of a Markdown table or
        list) is preserved; only the whitespace at the cut points is dropped.

        Args:
            text (str): The text to split.
            num_chunks (int): Desired number of pieces.

        Returns:
            list: The pieces in order. `[text]` when num_chunks <= 1, and an
                  empty list when the text contains no words.
        """
        if num_chunks <= 1:
            return [text]
        spans = [match.span() for match in self._WORD_PATTERN.finditer(text)]
        if not spans:
            return []
        # num_chunks >= 2 and at least one word, so words_per_piece >= 1.
        words_per_piece = math.ceil(len(spans) / num_chunks)
        pieces = []
        for first in range(0, len(spans), words_per_piece):
            last = min(first + words_per_piece, len(spans)) - 1
            pieces.append(text[spans[first][0]:spans[last][1]])
        return pieces

    def parse(self):
        """
        Parses the document and splits it into chunks with citations.

        A new chunk is started when a header at the same or a higher level than
        the innermost open header arrives, or when adding the next passage
        would push the chunk over max_chunk_size words. A single passage larger
        than max_chunk_size is split into several chunks of its own. Headers
        that would end a chunk are moved to the start of the following chunk,
        so a header stays with the content it introduces.

        Returns:
            list: A list of chunks. Each chunk is a dictionary with
                  'hierarchy' (list of the header texts open for that chunk)
                  and 'content' (list of dictionaries with 'type', 'content'
                  and 'citation').
        """
        chunks = []
        current_chunk_passages = []
        current_chunk_size = 0
        chunk_counter = 0

        # Parallel stacks describing the currently open headers.
        current_hierarchy_texts = []
        current_hierarchy_levels = []

        def finalize_chunk(passages, hierarchy):
            """Formats the passages as one cited chunk and appends it to `chunks`."""
            nonlocal chunk_counter
            if not passages:
                return

            chunk_counter += 1
            content_with_citations = [
                {
                    'type': passage['type'],
                    'content': passage['content'],
                    'citation': f"[{self.base_citation}_{chunk_counter}__{position}]",
                }
                for position, passage in enumerate(passages, start=1)
            ]
            chunks.append({
                'hierarchy': list(hierarchy),
                'content': content_with_citations
            })

        def open_header(level, header_text):
            """Closes headers at the same or a deeper level, then opens this one."""
            while current_hierarchy_levels and current_hierarchy_levels[-1] >= level:
                current_hierarchy_levels.pop()
                current_hierarchy_texts.pop()
            current_hierarchy_levels.append(level)
            current_hierarchy_texts.append(header_text)

        def flush_pending(passages):
            """
            Finalizes `passages` minus any trailing headers and returns those
            headers so the caller can place them at the start of the next chunk.
            """
            to_finalize = list(passages)
            carried = []
            while to_finalize and 'header' in to_finalize[-1]['type']:
                carried.insert(0, to_finalize.pop())

            # The carried headers open sections the finalized chunk does not
            # contain, so they are dropped from that chunk's hierarchy.
            hierarchy = list(current_hierarchy_texts)
            if carried:
                hierarchy = hierarchy[:-len(carried)]

            finalize_chunk(to_finalize, hierarchy)
            return carried

        for passage in self.document:
            passage_type = passage['type']
            text = passage['content']
            passage_size = len(text.split())
            passage_level = self._get_passage_level(passage_type)
            is_header = 'header' in passage_type

            # --- Special handling for single passages larger than the chunk size ---
            if passage_size > self.max_chunk_size:
                # Flush what has accumulated, but keep trailing headers so they
                # lead the first piece instead of being stranded in the previous chunk.
                leading_headers = flush_pending(current_chunk_passages)
                current_chunk_passages, current_chunk_size = [], 0

                if is_header:
                    open_header(passage_level, text)

                num_sub_chunks = math.ceil(passage_size / self.max_chunk_size)
                for sub_text in self._split_text(text, num_sub_chunks):
                    piece = {'type': passage_type, 'content': sub_text}
                    finalize_chunk(leading_headers + [piece], current_hierarchy_texts)
                    leading_headers = []
                # Only reachable if the split produced no pieces; never drop headers.
                finalize_chunk(leading_headers, current_hierarchy_texts)
                continue

            # --- Decide whether the pending chunk must be closed first ---
            if current_chunk_passages:
                # Condition 1: New header at same or higher level creates a new chunk.
                last_header_level = current_hierarchy_levels[-1] if current_hierarchy_levels else len(self.hierarchy)
                is_new_section = is_header and passage_level <= last_header_level

                # Condition 2: Chunk size overflow.
                exceeds_size = (current_chunk_size + passage_size) > self.max_chunk_size

                if is_new_section or exceeds_size:
                    # The carried-over headers start the new chunk.
                    current_chunk_passages = flush_pending(current_chunk_passages)
                    current_chunk_size = sum(len(p['content'].split()) for p in current_chunk_passages)

            # Add the current passage to the chunk and update hierarchy
            if is_header:
                open_header(passage_level, text)

            current_chunk_passages.append(passage)
            current_chunk_size += passage_size

        # Finalize any remaining passages in the last chunk.
        finalize_chunk(current_chunk_passages, current_hierarchy_texts)

        return chunks

# --- Example Usage ---
if __name__ == '__main__':
    # Small hand-written document covering: a same-level header change, a
    # higher-level header change, a size-driven split, and a trailing header.
    sample_document = [
        {'type': 'header_2', 'content': 'Section 1: The First Topic'},
        {'type': 'paragraph', 'content': 'This is content within the first section. It introduces the main ideas.'},
        {'type': 'header_3', 'content': 'Subsection 1.1'},
        {'type': 'paragraph', 'content': 'Content for subsection 1.1. This provides more detail.'},
        {'type': 'header_3', 'content': 'Subsection 1.2'},
        {'type': 'paragraph', 'content': 'Content for subsection 1.2. A new chunk should start here because we have a new header at the same level as the previous one.'},
        {'type': 'header_2', 'content': 'Section 2: A Higher-Level Change'},
        {'type': 'paragraph', 'content': 'This content belongs to Section 2. A new chunk should have been created for this section because its header is a higher level than "Subsection 1.2".'},
        {'type': 'paragraph', 'content': 'This is a very long paragraph designed to test the chunking mechanism when a single passage might be larger than the chunk size or push the current chunk over the limit. It continues to describe the topic in great detail, forcing a split based on size rather than on a header change. The hierarchy should be preserved across the split.'},
        {'type': 'header_3', 'content': 'Subsection 2.1'}
    ]

    # A deliberately small word budget so the short sample still gets split.
    parser = HierarchicalParser(
        sample_document,
        base_citation="TestDoc",
        max_chunk_size=75,
    )
    document_chunks = parser.parse()

    # Report: one block per chunk with its size, hierarchy and cited passages.
    print(f"Document was split into {len(document_chunks)} chunks.\n")
    for i, chunk_data in enumerate(document_chunks):
        content, hierarchy = chunk_data['content'], chunk_data['hierarchy']
        chunk_size = sum(map(len, (entry['content'].split() for entry in content)))

        print(f"--- Chunk {i+1} (Size: {chunk_size} words) ---")
        print(f"  Hierarchy: {hierarchy}")
        for passage in content:
            # Only the first 80 characters of each passage are shown.
            print(f"    {passage['citation']} [{passage['type']}] {passage['content'][:80]}...")
        print("\n")


In [None]:
# Inspect the fourth chunk (index 3) produced by the example run above.
document_chunks[3]