In [None]:
import json
import re
from bs4 import BeautifulSoup
from collections import Counter
from tqdm.notebook import tqdm

In [None]:
from pathlib import Path
# Filesystem locations used by the rest of the notebook.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
# Folder that the sample-HTML read, the `*.json` glob, and load_file() calls below are resolved against.
report_folder = Path('/Users/jameslittiebrant/Data/crs_reports/reports')
# Only displayed (via .absolute()) in the cells visible here; not otherwise used.
file_folder = Path('/Users/jameslittiebrant/Data/crs_reports/files')
# Destination of the sections/chunks/files/errors JSON dumps written near the end.
parsed_folder = Path('/Users/jameslittiebrant/Data/crs_reports/parsed')
parsed_folder.mkdir(exist_ok=True, parents=True)
# Relative scratch directory, created under the current working directory.
testing_folder = Path('testing')
testing_folder.mkdir(parents=True, exist_ok=True)

In [None]:
import hashlib
from datetime import datetime

def hash_text_to_id(source_string: str, n_digits: int = 7) -> str:
    """
    Derive a fixed-width, zero-padded decimal identifier from a string.

    The SHA-256 hex digest of the encoded text is read as one large integer and
    reduced modulo 10**n_digits, so equal inputs always yield equal ids.
    Distinct inputs can share an id because only n_digits digits are kept.

    Args:
        source_string (str): Text to hash.
        n_digits (int): Width of the returned identifier.

    Returns:
        str: A string of exactly n_digits decimal digits.
    """
    digest_as_int = int(hashlib.sha256(source_string.encode()).hexdigest(), 16)
    return format(digest_as_int % (10**n_digits), f"0{n_digits}d")

In [None]:
from bs4 import BeautifulSoup

def convert_to_markdown(element):
    """
    Render the children of an element as a Markdown string.

    Bold (<strong>/<b>), italic (<em>/<i>) and link (<a>) children are turned
    into their Markdown equivalents, <span> children are unwrapped, and any
    other tag contributes only its stripped text.

    Args:
        element: A text node (any str) or an object exposing `.contents`.

    Returns:
        str: The text itself for a str input, '' for an object without
        `.contents`, otherwise the concatenated Markdown of the children.
    """
    # Text nodes are str subclasses and are handed back untouched.
    if isinstance(element, str):
        return element

    # Anything that cannot hold children has nothing to render.
    if not hasattr(element, 'contents'):
        return ''

    pieces = []
    for node in element.contents:
        if isinstance(node, str):
            pieces.append(node)
            continue

        tag = node.name
        if tag in ('strong', 'b'):
            pieces.append(f"**{convert_to_markdown(node)}**")
        elif tag in ('em', 'i'):
            pieces.append(f"*{convert_to_markdown(node)}*")
        elif tag == 'a':
            label = convert_to_markdown(node).strip()
            target = node.get('href', '')
            pieces.append(f"[{label}]({target})")
        elif tag == 'span':
            # Styling-only wrapper: keep the inner content as is.
            pieces.append(convert_to_markdown(node))
        else:
            # Unhandled tags lose their markup; only their text survives.
            pieces.append(node.get_text(strip=True))
    return ''.join(pieces)

# Tags that parse_html_content emits as stand-alone elements.
_HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
_BLOCK_TAGS = _HEADING_TAGS + ['p', 'div', 'table', 'ul', 'ol']
# An element nested inside any of these is skipped; its container renders it.
_CONTAINER_TAGS = _HEADING_TAGS + ['p', 'table', 'ul', 'ol', 'li']


def _make_element(element_type, content):
    """Build one structured element; the id is derived from the content text."""
    return {
        'type': element_type,
        'content': content,
        'id': f"element-{hash_text_to_id(content)}"
    }


def _table_to_markdown(table):
    """Render every <tr> of a table as a Markdown row, with a separator after the first row."""
    lines = []
    for row_index, row in enumerate(table.find_all('tr')):
        cells = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        lines.append("| " + " | ".join(cells) + " |\n")
        # The first row is treated as the header row.
        if row_index == 0:
            lines.append("| " + " | ".join(['---'] * len(cells)) + " |\n")
    return "".join(lines)


def _list_to_markdown(list_element, ordered):
    """Render the direct <li> children of a list as '- ' or 'N. ' Markdown lines."""
    lines = []
    for position, item in enumerate(list_element.find_all('li', recursive=False), 1):
        marker = f"{position}." if ordered else "-"
        # Inline formatting inside the item is kept via convert_to_markdown.
        lines.append(f"{marker} {convert_to_markdown(item).strip()}\n")
    return "".join(lines)


def parse_html_content(html_string):
    """
    Split an HTML document into a flat list of structured elements.

    Headings, paragraphs, text-only divs, tables and lists are collected in
    the order BeautifulSoup's find_all yields them. Elements nested inside a
    heading, paragraph, table or list are skipped because their container
    already renders them.

    A <div> is emitted as 'div_text' only when it contains none of the tags
    that are emitted on their own (headings h1-h6, p, div, table, ul, ol).
    Previously h4-h6 and nested divs were missing from that check, so a
    wrapper div around them was emitted and then its children were emitted
    again, duplicating the text.

    Args:
        html_string (str): Raw HTML markup.

    Returns:
        list[dict]: One dict per element with keys 'type' ('heading_N',
        'paragraph_markdown', 'div_text', 'table_markdown' or
        'list_markdown'), 'content' and 'id' ('element-' plus a hash of the
        content). Empty paragraphs and empty divs are omitted.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    structured_content = []

    for element in soup.find_all(_BLOCK_TAGS):
        if element.find_parent(_CONTAINER_TAGS):
            continue

        tag = element.name
        if tag in _HEADING_TAGS:
            # 'h2' -> 'heading_2'
            structured_content.append(
                _make_element(f'heading_{tag[1]}', element.get_text(strip=True))
            )
        elif tag == 'p':
            markdown_content = convert_to_markdown(element).strip()
            # Only add non-empty paragraphs
            if markdown_content:
                structured_content.append(_make_element('paragraph_markdown', markdown_content))
        elif tag == 'div':
            div_text = element.get_text(strip=True)
            # Skip wrappers: any block-level descendant is emitted by its own iteration.
            if div_text and not element.find(_BLOCK_TAGS):
                structured_content.append(_make_element('div_text', div_text))
        elif tag == 'table':
            structured_content.append(_make_element('table_markdown', _table_to_markdown(element)))
        elif tag == 'ul':
            structured_content.append(
                _make_element('list_markdown', _list_to_markdown(element, ordered=False))
            )
        elif tag == 'ol':
            structured_content.append(
                _make_element('list_markdown', _list_to_markdown(element, ordered=True))
            )

    return structured_content

In [None]:
# Show the absolute form of file_folder as the cell output.
file_folder.absolute()

In [None]:
# Scratch cell: parse one sample report and dump the result to test.json.
# The bare string literals below are expression statements with no runtime
# effect; they only record sample filenames, grouped by how well they parse.
# Good
'2025-03-18_RL31572_6c7a262e8ca1313647f6c2e2139deb25c85d49ab.html'
"2025-04-01_R48478_6a4277ac5f781f3f0363a6f76e64089ee5fc0041.html"
'2025-05-02_IF10349_8374304583e98220e31d7b89c3913fc5fdc93bef.html'

# Odd
"2024-11-05_IN12456_e089abfacd467aa6fdad2e1c4a49fab3273496ca.html"

# simple
'2024-11-18_LSB11249_4b1d7f66b0477bf585dbf0b394badaf425629f51.html'

# Read the chosen sample; the path resolves to report_folder/files/<name>.
# NOTE(review): no encoding is given, so the platform default is used.
with open(report_folder / "files/2025-03-18_RL31572_6c7a262e8ca1313647f6c2e2139deb25c85d49ab.html", 'r') as file:
    html_doc = file.read()
parsed_data = parse_html_content(html_doc)

# Print the structured data in a readable format
# (json is already imported in the first cell; this import is redundant).
import json
print(json.dumps(parsed_data, indent=2))
# Drop elements with empty content. This happens after the print above, so the
# printed output can still contain them.
parsed_data = [x for x in parsed_data if x['content']]
# Written relative to the working directory; the chunking demo further down
# loads 'test.json' as its input.
with open('test.json', 'w') as file:
    json.dump(parsed_data, file)

In [None]:
# Every *.json file directly inside report_folder (non-recursive glob).
# NOTE(review): all_reports is not referenced by any later cell visible here.
all_reports = list(report_folder.glob('*.json'))

In [None]:
# Show the absolute form of report_folder as the cell output.
report_folder.absolute()

In [None]:
def load_report(report):
    """
    Return the filename of the HTML rendition of a report's first version.

    Args:
        report (dict): Report metadata with a 'versions' list whose entries
            carry a 'formats' list of {'format', 'filename'} dicts.

    Returns:
        str: Filename of the first 'HTML' format entry, or '' if there is none.
    """
    html_entries = [
        entry for entry in report['versions'][0]['formats']
        if entry['format'] == 'HTML'
    ]
    return html_entries[0]['filename'] if html_entries else ''

def get_report_type(version):
    """
    Tell whether the first version in a versions list offers an HTML format.

    Args:
        version (list): List of version dicts; only element 0 is inspected,
            and it must carry a 'formats' list of dicts with a 'format' key.

    Returns:
        bool: True if any format entry is 'HTML', otherwise False.
    """
    available_formats = [entry['format'] for entry in version[0]['formats']]
    return 'HTML' in available_formats

In [None]:
import json

def count_words_in_chunk(chunk):
    """
    Simple "word" counter that splits each item's content on whitespace.

    Uses str.split() with no argument, so any run of spaces, tabs or newlines
    is one separator and empty content counts as zero words. The previous
    split(' ') counted an empty string as one word, counted an extra word per
    repeated space, and did not treat newlines or tabs as separators.

    Args:
        chunk (list): A list of content items (dictionaries). Items whose
            'content' is missing, None or empty contribute 0.

    Returns:
        int: The total word count in the chunk.
    """
    return sum(len((item.get('content') or '').split()) for item in chunk)

def hierarchical_chunker_recursive(current_chunk, level, max_words, buffer):
    """
    Split a list of elements at every heading of the given level.

    Each resulting section whose word count exceeds max_words + buffer is
    split again at the next heading level (level + 1). The recursion stops at
    the first level that has no headings in the section, even if deeper
    heading levels exist.

    Args:
        current_chunk (list): Elements (dicts with a 'type' key) to split.
        level (int): Heading level to split on; matches type 'heading_<level>'.
        max_words (int): Target upper bound on words per section.
        buffer (int): Extra words tolerated on top of max_words.

    Returns:
        list: A list of sections, each a list of elements. If the level has
        no headings, the input is returned as the single section.
    """
    wanted_type = f'heading_{level}'
    cut_points = [
        position for position, element in enumerate(current_chunk)
        if element['type'] == wanted_type
    ]

    # Nothing to split on at this level: hand the input back as one section.
    if len(cut_points) == 0:
        return [current_chunk]

    # Section edges: the content before the first heading (if any), every
    # heading position, and the end of the input.
    edges = list(cut_points)
    if edges[0] > 0:
        edges.insert(0, 0)
    edges.append(len(current_chunk))

    word_limit = max_words + buffer
    sections = []
    for begin, finish in zip(edges[:-1], edges[1:]):
        section = current_chunk[begin:finish]
        if not section:
            continue

        if count_words_in_chunk(section) > word_limit:
            # Too large: look for sub-sections one heading level deeper.
            sections.extend(
                hierarchical_chunker_recursive(section, level + 1, max_words, buffer)
            )
        else:
            sections.append(section)

    return sections

def chunk_document(data, max_words, buffer, target_level=1):
    """
    Entry point for hierarchical chunking of a whole document.

    Delegates to hierarchical_chunker_recursive, starting at target_level.

    Args:
        data (list): The document's elements.
        max_words (int): Target upper bound on words per chunk.
        buffer (int): Extra words tolerated on top of max_words.
        target_level (int): Heading level at which splitting starts.

    Returns:
        list: The chunks produced by hierarchical_chunker_recursive.
    """
    return hierarchical_chunker_recursive(
        current_chunk=data,
        level=target_level,
        max_words=max_words,
        buffer=buffer,
    )


In [None]:
# --- Chunk Merging Logic ---
def merge_chunks(chunks, target_word_count, last_section_buffer=200):
    """
    Greedily combine consecutive chunks up to a target word count.

    Chunks are appended to the running chunk while the combined word count
    stays at or below target_word_count; otherwise the running chunk is closed
    and a new one is started. Afterwards, a final chunk with fewer than
    last_section_buffer words is folded into the chunk before it.

    Args:
        chunks (list): A list of chunks, where each chunk is a list of content items.
        target_word_count (int): Word count a merged chunk may reach through merging.
        last_section_buffer (int): Minimum word count for the final chunk to
            stay on its own.

    Returns:
        list: A new list of merged chunks (the input lists are not modified).
    """
    if not chunks:
        return []

    combined = []
    # Copy so that extending the running chunk never touches the caller's list.
    pending = list(chunks[0])
    pending_words = count_words_in_chunk(pending)

    for candidate in chunks[1:]:
        candidate_words = count_words_in_chunk(candidate)

        if pending_words + candidate_words > target_word_count:
            # Would overshoot: close the running chunk and start afresh.
            combined.append(pending)
            pending = list(candidate)
            pending_words = candidate_words
            continue

        pending.extend(candidate)
        pending_words += candidate_words

    # Flush whatever is still being accumulated.
    if pending:
        combined.append(pending)

    # A short trailing chunk is absorbed by its predecessor.
    has_predecessor = len(combined) >= 2
    if has_predecessor and count_words_in_chunk(combined[-1]) < last_section_buffer:
        trailing = combined.pop()
        combined[-1].extend(trailing)

    return combined

In [None]:
# --- Citation Logic ---

def add_citation_positional_references(chunks):
    """
    Tag every element with its position in its chunk and in the document.

    Each element is shallow-copied and given three 1-based counters:
    'chunk_position' (index of the enclosing chunk), 'intra_chunk_position'
    (index inside that chunk) and 'document_position' (running index across
    all chunks). The counters allow citing either a whole chunk or a single
    element within it.

    Args:
        chunks (list): A list of chunks, each a list of element dicts.

    Returns:
        list: New chunks holding the annotated copies; the input is untouched.
    """
    annotated_chunks = []
    running_position = 0
    for chunk_number, chunk in enumerate(chunks, start=1):
        annotated = []
        for offset, passage in enumerate(chunk, start=1):
            running_position += 1
            # Work on a copy so the caller's element is not modified.
            tagged = passage.copy()
            tagged.update(
                chunk_position=chunk_number,
                intra_chunk_position=offset,
                document_position=running_position,
            )
            annotated.append(tagged)
        annotated_chunks.append(annotated)
    return annotated_chunks

def find_highest_heading(passages, min_necessary=1):
    """
    Find the top-most heading level that is usable as a split point.

    Documents do not always start at heading level 1, so the split level is
    chosen from the data: the smallest N for which 'heading_N' occurs strictly
    more than min_necessary times. With the default of 1, a level needs at
    least two headings, which avoids splitting on a lone title.

    Args:
        passages (list): Element dicts with a 'type' key.
        min_necessary (int): A level qualifies only if its count exceeds this.

    Returns:
        int: The smallest qualifying heading level, or -1 if none qualifies.
    """
    type_counts = Counter(passage['type'] for passage in passages)
    qualifying_levels = [
        int(type_name.split('_')[1])
        for type_name, occurrences in type_counts.items()
        if occurrences > min_necessary and 'heading' in type_name
    ]
    return min(qualifying_levels, default=-1)

# Demo run of the chunking pipeline on the sample written to test.json:
# split by headings, merge small chunks, add positional references, then
# print a per-chunk summary for manual verification.
if __name__ == '__main__':
    # --- Configuration ---
    JSON_FILE_PATH = 'test.json'
    BASE_CITATION = "RL31572"
    # Config for the initial splitting phase
    INITIAL_MAX_WORDS = 500
    INITIAL_BUFFER = 250
    MINIMUM_NECESSARY_HEADINGS = 1
    # Config for the merging phase
    # This can be set to be higher if the actual retrieval text is desired to be bigger
    # This can help offset small chunks with context around it
    TARGET_MERGE_WORD_COUNT = 500
    TARGET_MERGE_BUFFER = 250

    try:
        # Load the document from the JSON file
        with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            document_data_raw = json.load(f)

        # Tag every passage with the citation of the document it belongs to.
        # The passages are modified in place and collected in the same order.
        document_data = []
        for passage in document_data_raw:
            passage['document_citation'] = BASE_CITATION
            document_data.append(passage)

        # --- Step 1: Perform the initial hierarchical chunking ---
        highest_heading = find_highest_heading(document_data, min_necessary=MINIMUM_NECESSARY_HEADINGS)
        initial_chunks = chunk_document(document_data, INITIAL_MAX_WORDS, INITIAL_BUFFER, target_level=highest_heading)
        print(f"Step 1: Document initially split into {len(initial_chunks)} chunks.\n")

        # --- Step 2: Perform the merging post-processing ---
        # TARGET_MERGE_BUFFER is passed as merge_chunks' last_section_buffer.
        final_merged_chunks = merge_chunks(initial_chunks, TARGET_MERGE_WORD_COUNT, TARGET_MERGE_BUFFER)
        print("---" * 15)
        print(f"\nStep 2: Merged into {len(final_merged_chunks)} final chunks.")
        print(f"Target Merge Size: {TARGET_MERGE_WORD_COUNT} words\n")

        # --- Step 3: Add citations to the final chunks ---
        final_cited_chunks = add_citation_positional_references(final_merged_chunks)
        print("---" * 15)
        print(f"\nStep 3: Added citations to {len(final_cited_chunks)} chunks.")
        print(f"Base Citation: '{BASE_CITATION}'\n")

        # --- Output and Verification ---
        print("---" * 15)
        print("\nFinal Output Verification:\n")
        total_word_count = 0
        for i, chunk in enumerate(final_cited_chunks):
            word_count = count_words_in_chunk(chunk)
            total_word_count += word_count

            first_item_type = chunk[0].get('type', 'N/A')
            first_item_content = chunk[0].get('content', '').replace('\n', ' ')[:70]
            # Read the keys the pipeline actually sets: 'document_citation'
            # (added above) and 'document_position' (added in step 3). The
            # former 'citation' / 'doc_index' keys never exist, so the summary
            # always showed the fallbacks 'N/A' and -1.
            first_item_citation = chunk[0].get('document_citation', 'N/A')
            first_item_doc_index = chunk[0].get('document_position', -1)

            print(f"--- Final Chunk {i+1} ---")
            print(f"  Word Count: {word_count}")
            print(f"  Items: {len(chunk)}")
            print(f"  Starts with '{first_item_type}': \"{first_item_content}...\"")
            print(f"  First Citation: {first_item_citation}")
            print(f"  First Doc Index: {first_item_doc_index}")
            print()

        # Both totals should match: merging only regroups elements.
        print(f"Total word count across all final chunks: {total_word_count}")
        print(f"Total word count in original document: {count_words_in_chunk(document_data)}")

    except FileNotFoundError:
        print(f"Error: The file '{JSON_FILE_PATH}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from the file '{JSON_FILE_PATH}'.")


In [None]:
# Inspect the second-to-last chunk from the demo run above.
# Requires that the demo cell ran successfully and produced at least two chunks.
final_cited_chunks[-2]

In [None]:
# Relative output folder, created under the current working directory.
# NOTE(review): the save cells visible here write to parsed_folder, not save_folder.
save_folder = Path('parsed')
save_folder.mkdir(parents=True, exist_ok=True)

# Folder for error artifacts; created here.
# NOTE(review): nothing in the visible cells writes into it (errors are dumped to parsed_folder).
errors_folder = Path('/Users/jameslittiebrant/Data/crs_reports/errors')
errors_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Collect the report metadata JSON files to process.
# The list is sorted because the glob order is not guaranteed, and the main
# processing loop resumes by slicing this list from an offset, which needs a
# stable order. The folder gets its own name so that json_files is only ever
# a list of paths.
json_folder = Path('/Users/jameslittiebrant/Data/crs_reports/reports/reports')
json_files = sorted(json_folder.glob('*.json'))

In [None]:
# Load the first report's metadata as a sample for the cells below.
# Raises IndexError if json_files is empty.
with open(json_files[0], 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
def parse_report_metadata(metadata):
    """
    Flatten a report's metadata into a single-level dict.

    Report-level fields are copied as they are; version-level fields are taken
    from the first entry of metadata['versions'].

    Args:
        metadata (dict): Report metadata with keys 'id', 'type', 'typeId',
            'number', 'active', 'source', 'topics' and 'versions'.

    Returns:
        dict: The report-level fields plus 'version_id', 'date',
        'retrieved_date', 'title', 'summary' and 'source_file' (the filename
        of the first 'HTML' format entry).

    Raises:
        KeyError: If an expected key is missing.
        IndexError: If there are no versions or the first one has no HTML format.
    """
    report_level_keys = ('id', 'type', 'typeId', 'number', 'active', 'source', 'topics')
    flattened = {key: metadata[key] for key in report_level_keys}

    first_version = metadata['versions'][0]
    flattened.update({
        'version_id': first_version['id'],
        'date': first_version['date'],
        'retrieved_date': first_version['retrieved'],
        'title': first_version['title'],
        'summary': first_version['summary'],
    })

    html_formats = [fmt for fmt in first_version['formats'] if fmt['format'] == 'HTML']
    flattened['source_file'] = html_formats[0]['filename']
    return flattened

In [None]:
# Show the flattened metadata of the sample report loaded above.
parse_report_metadata(data)

In [None]:
def load_file(json_file, source_folder):
    """
    Load a report's metadata JSON together with its HTML document.

    Args:
        json_file (Path | str): Metadata file. It is joined onto source_folder,
            so an absolute path is used as is.
        source_folder (Path): Folder against which json_file and the HTML
            filename listed in the metadata are resolved.

    Returns:
        tuple: (True, html_text, metadata_dict) on success, or
        (False, json_file_name, '') when the metadata has no first version or
        lists no 'HTML' format.

    Raises:
        FileNotFoundError: If the JSON or the referenced HTML file is missing.
        UnicodeDecodeError: If either file is not valid UTF-8.
    """
    with open(source_folder.joinpath(f"{json_file}"), 'r', encoding='utf-8') as f:
        data = json.load(f)
    try:
        # IndexError covers both an empty 'versions' list and no HTML entry.
        filename = [x for x in data['versions'][0]['formats'] if x['format'] == 'HTML'][0]['filename']
    except IndexError:
        # Path() lets plain string paths work here, as they do for open() above.
        return False, Path(json_file).name, ''

    # Read the HTML as UTF-8, like the JSON, instead of the platform default.
    with open(source_folder.joinpath(f'{filename}'), 'r', encoding='utf-8') as f:
        doc = f.read()
    return True, doc, data

In [None]:
# Chunk-size configuration for the batch run below.
# The target is around 500 words, because sections quite often run over that.
# It is a baseline for the initial split; merging can roughly double a chunk.
# For instance, an initial max of 500 words + 250 buffer = 750.
# If one chunk has 400 words and the next has 750, that is 1150 words; if it is
# the last chunk, another + 250 gives 1400 words.
# Chunk sizes therefore vary, but each chunk stays closer to a meaningful
# segment of the document.
# Word limit for a section before it is split further ...
INITIAL_MAX_WORDS = 500
# ... plus this tolerance on top of it.
INITIAL_BUFFER = 250
# A heading level is used for splitting only if it occurs more often than this.
MINIMUM_NECESSARY_HEADINGS = 1
# Word count that merged chunks may reach through merging.
TARGET_MERGE_WORD_COUNT = 500
# Passed to merge_chunks as last_section_buffer: a shorter final chunk is merged back.
TARGET_MERGE_BUFFER = 250

In [None]:
def format_chunks(chunk, join_separator='\n\n'):
    """
    Collapse a chunk (list of annotated elements) into one chunk record.

    Args:
        chunk (list): Non-empty list of element dicts, each with a 'content' key.
        join_separator (str): Text placed between the stripped element contents.

    Returns:
        dict: 'content' (joined text), 'type', 'document_citation' and
        'chunk_position' (all taken from the first element), 'element_ids'
        (the 'id' of every element), and 'chunk_start' / 'chunk_end' (the
        'document_position' of the first and last element).

    Raises:
        IndexError: If chunk is empty.
    """
    joined_text = join_separator.join(element['content'].strip() for element in chunk)
    first, last = chunk[0], chunk[-1]
    return {
        'content': joined_text,
        'type': first.get('type', ''),
        'document_citation': first.get('document_citation'),
        'chunk_position': first.get('chunk_position'),
        'element_ids': [element.get('id') for element in chunk],
        'chunk_start': first.get('document_position'),
        'chunk_end': last.get('document_position'),
    }

In [None]:
# Inspect the first chunk from the demo run (requires that cell to have run).
final_cited_chunks[0]

In [None]:
# Accumulators filled by the batch-processing loop below.
# Re-running this cell resets them, which also resets the loop's resume offset.
errored_files = []            # one record per file that could not be processed
re_parse_files = []           # not filled by any cell visible here
parsed_reports_metadata = []  # flattened metadata, one dict per parsed report
parsed_sections = []          # every annotated element of every report
parsed_chunks = []            # one formatted record per chunk

In [None]:
# Batch-process every report: load, parse HTML, chunk, annotate, collect.
#
# Resume support: each processed file adds exactly one entry to either
# errored_files or parsed_reports_metadata, so their combined length is the
# number of files already handled. Previously the second term was
# len(parse_report_metadata(data)), which is the number of keys in one
# metadata dict and made a fresh run silently skip the first files.
start_point = len(errored_files) + len(parsed_reports_metadata)
for json_file in tqdm(json_files[start_point:]):
    did_load, html_document, metadata = load_file(json_file, report_folder)
    if not did_load:
        # load_file found no HTML rendition: keep the raw JSON for inspection.
        with open(json_file, 'r', encoding='utf-8') as f:
            _error_data = json.load(f)
        errored_files.append({'filename': json_file.name, 'json':_error_data, 'error':'error_loading'})
        continue

    if len(html_document.strip()) == 0:
        # The HTML file exists but is blank.
        print('no html')
        with open(json_file, 'r', encoding='utf-8') as f:
            _error_data = json.load(f)
        errored_files.append({'filename': json_file.name, 'json':_error_data, 'error':'no_html'})
        continue

    metadata = parse_report_metadata(metadata)
    document_data_raw = parse_html_content(html_document)
    if len(document_data_raw) == 0:
        # The parser found no headings, paragraphs, divs, tables or lists.
        errored_files.append({'filename': json_file.name, 'json':metadata, 'error':'html_did_not_parse'})
        continue

    # Tag every element with the id of the report it belongs to (in place).
    document_data = []
    for passage in document_data_raw:
        passage['document_citation'] = metadata['id']
        document_data.append(passage)

    # Split on the top-most repeated heading level, merge small chunks,
    # then add positional references and build the per-chunk records.
    highest_heading = find_highest_heading(document_data, min_necessary=MINIMUM_NECESSARY_HEADINGS)
    initial_chunks = chunk_document(document_data, INITIAL_MAX_WORDS, INITIAL_BUFFER, target_level=highest_heading)
    merged_chunks = merge_chunks(initial_chunks, TARGET_MERGE_WORD_COUNT, TARGET_MERGE_BUFFER)
    cited_chunks = add_citation_positional_references(merged_chunks)
    formatted_chunks = [format_chunks(chunk, '\n\n') for chunk in cited_chunks]

    parsed_reports_metadata.append(metadata)
    parsed_chunks.extend(formatted_chunks)
    for chunk in cited_chunks:
        parsed_sections.extend(chunk)

In [None]:
# Totals after the batch run: (elements, chunks, reports).
len(parsed_sections), len(parsed_chunks), len(parsed_reports_metadata)

In [None]:
# Give every chunk an id derived from its start/end positions and its text,
# and mirror 'document_citation' under the key 'document_id'.
for _chunk in parsed_chunks:
    _hash_string = "_".join(
        str(_chunk[field]) for field in ('chunk_start', 'chunk_end', 'content')
    )
    _chunk['id'] = "chunk-" + hash_text_to_id(_hash_string, n_digits=10)
    _chunk['document_id'] = _chunk['document_citation']

In [None]:
# Mirror 'document_citation' under the key 'document_id' on every section.
for section in parsed_sections:
    section.update(document_id=section['document_citation'])

In [None]:
# Rename the camelCase key 'typeId' to 'type_id' on every report record.
# The membership check makes the cell safe to re-run: without it a second run
# raised KeyError because 'typeId' had already been removed.
for _report_metadata in parsed_reports_metadata:
    if 'typeId' in _report_metadata:
        _report_metadata['type_id'] = _report_metadata.pop('typeId')

In [None]:
# Store 'version_id' as a string on every report record.
for _report_metadata in parsed_reports_metadata:
    _report_metadata.update(version_id=str(_report_metadata['version_id']))

In [None]:
from datetime import datetime

# Prefix shared by all output files of this run.
# RUN_TIME_OVERRIDE pins the prefix so the files of that earlier run are
# overwritten; set it to None to stamp the outputs with the current time.
# (Previously the timestamp was computed and then immediately overwritten by
# the same hard-coded value.)
RUN_TIME_OVERRIDE = '20250723_1319'
run_time = RUN_TIME_OVERRIDE or datetime.now().strftime("%Y%m%d_%H%M")

# File-name suffix -> object to serialize; written in this order.
_outputs = {
    'sections': parsed_sections,
    'chunks': parsed_chunks,
    'files': parsed_reports_metadata,
    'errors': errored_files,
}
for _suffix, _payload in _outputs.items():
    with open(parsed_folder.joinpath(f'{run_time}_{_suffix}.json'), 'w', encoding='utf-8') as f:
        json.dump(_payload, f)

In [None]:
# Write the report metadata again, to the same '<run_time>_files.json' path
# that the save cell above already wrote.
_files_path = parsed_folder / f'{run_time}_files.json'
with _files_path.open('w') as f:
    json.dump(parsed_reports_metadata, f)

In [None]:
# Print a rough schema (field name and type label) taken from the first section.
# The checks run in this order, and the first match wins; any list is labelled
# 'List[str]' without looking at its items.
_type_labels = ((str, 'str'), (int, 'int'), (list, 'List[str]'), (float, 'float'))
for key, value in parsed_sections[0].items():
    value_str = next(
        (label for kind, label in _type_labels if isinstance(value, kind)),
        str(type(value)),
    )
    print(f"{key}: {value_str}")