In [1]:
from sentence_transformers import SentenceTransformer
import lancedb
from pathlib import Path
import torch
from copy import deepcopy

In [2]:
from tqdm.notebook import tqdm

In [3]:
encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device='mps',trust_remote_code=True)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
<All keys matched successfully>


In [4]:
index_folder = Path('../wonky_data/indexes')
index = lancedb.connect(index_folder)
tbl = index.open_table('sections_fts')
result = tbl.search('africa').limit(1).to_list()[0]
result.keys()

dict_keys(['id', 'type', 'typeId', 'number', 'active', 'topics', 'date', 'title', 'summary', 'doc_id', 'filename', 'source_file', 'text', 'vector', 'section_id', '_score'])

In [5]:
source_files = Path('../wonky_data/parsed_reports/sections')

In [6]:
list(source_files.glob('*.*'))[0]

PosixPath('../wonky_data/parsed_reports/sections/96-404.json')

In [7]:
import json
import gzip

In [8]:
data_files = source_files.glob('*.json')
data_files = list(data_files)
documents = list()
for file in tqdm(data_files):
    with open(file, 'r') as f:
        data = json.load(f)
        documents.append(data)

  0%|          | 0/21658 [00:00<?, ?it/s]

In [9]:
import math # Used for ceiling division if needed, though simple len(split()) is fine here

def combine_sections_by_word_count(sections, report_id, max_word_count):
    """
    Combines text sections into chunks, ensuring each chunk does not exceed
    the specified maximum word count.

    Args:
        sections (list): A list of strings, where each string is a text section.
        max_word_count (int): The maximum number of words allowed in a chunk.

    Returns:
        list: A list of strings, where each string is a combined chunk.
    """
    combined_chunks = []
    current_chunk = ""
    current_word_count = 0
    current_sections = list()
    combined_sections = list()
    combined_subsection_text = list()
    current_subsection_text = list()
    for section_id, section in sections.items():
        # Basic word count by splitting on space
        # Note: This is a simple approach and might not perfectly handle
        # all punctuation or multiple spaces.
        section_words = section.split(' ')
        # Filter out empty strings that can result from multiple spaces
        section_words = [word for word in section_words if word]
        section_word_count = len(section_words)

        # If the section itself is larger than the max count, add it as its own chunk
        if section_word_count > max_word_count and current_word_count == 0:
            # print(f"Warning: Section starting with '{section[:50]}...' exceeds max_word_count ({section_word_count} > {max_word_count}) and will be added as its own chunk.")
            combined_chunks.append(section + f" ({report_id}({section_id}))")
            combined_sections.append([int(section_id)])
            # Reset for the next potential chunk (although this chunk is done)
            current_chunk = ""
            current_word_count = 0
            current_sections = list()
            current_subsection_text = list()
            # print(current_sections)
            continue # Move to the next section

        # Check if adding the new section would exceed the limit
        if current_word_count + section_word_count <= max_word_count:
            # Add section to the current chunk
            if current_chunk: # Add a space if the chunk isn't empty
                current_chunk += "\n" + section + f" ({report_id}({section_id}))"
                current_sections.append(int(section_id))
                current_subsection_text.append({int(section_id):section})
            else:
                current_chunk = section + f" ({report_id}({section_id}))"
                current_sections.append(int(section_id))
                current_subsection_text.append({int(section_id):section})
            current_word_count += section_word_count
        else:
            # Current chunk is full, add it to the list
            if current_chunk:
                combined_chunks.append(current_chunk)
                combined_sections.append(current_sections)
                current_sections = list()
                current_chunk = ""
                # current_subsection_text = list()

            # Start a new chunk with the current section
            # Check again if this new section *alone* exceeds the limit
            if section_word_count <= max_word_count:
                current_chunk = section + f" ({report_id}({section_id}))"
                current_sections.append(int(section_id))
                current_subsection_text.append({int(section_id):section})
                current_word_count = section_word_count
            else:
                # This section is too large even on its own
                # print(f"Warning: Section starting with '{section[:50]}...' exceeds max_word_count ({section_word_count} > {max_word_count}) and will be added as its own chunk.")
                combined_chunks.append(section + f" ({report_id}({section_id}))")
                current_sections.append(int(section_id))
                current_subsection_text.append({int(section_id):section})
                combined_sections.append(current_sections)
                # Reset for the next potential chunk
                current_chunk = ""
                current_word_count = 0
                current_sections = list()
                current_subsection_text = list()
                # print(current_sections)


    # Add the last remaining chunk if it's not empty
    if current_chunk:
        combined_chunks.append(current_chunk)
        combined_sections.append(current_sections)

    return combined_chunks, combined_sections

for document in tqdm(documents):
    combined_text, combined_sections = combine_sections_by_word_count(document['sections'], document['id'], 2500)
    combined_vectors = encoder.encode(combined_text, padding=False, batch_size=3, show_progress_bar=True).tolist()
    torch.mps.empty_cache()
    for section_text, section_ids, vector in zip(combined_text, combined_sections, combined_vectors):
        section_data = deepcopy(document)
        section_data['section_ids'] = section_ids
        section_data['text'] = section_text
        section_data['section_start'] = section_ids[0] if isinstance(section_ids, list) else section_ids
        section_data['section_end'] = section_ids[-1] if isinstance(section_ids, list) else section_ids
        section_data['vector'] = vector
        split_sections.append(section_data)
        break
    break

section_ids

section_text

In [10]:
def vectorize_sections(documents, max_word_count=2500, batch_size=3):
    all_sections = list()
    for document in tqdm(documents):
        combined_text, combined_sections = combine_sections_by_word_count(document['sections'], document['id'], max_word_count)
        combined_vectors = encoder.encode(combined_text, padding=False, batch_size=batch_size, show_progress_bar=False).tolist()
        torch.mps.empty_cache()
        split_sections = list()
        for section_text, section_ids, vector in zip(combined_text, combined_sections, combined_vectors):
            section_data = deepcopy(document)
            section_data['section_ids'] = section_ids
            section_data['text'] = section_text
            section_data['section_start'] = section_ids[0] if isinstance(section_ids, list) else section_ids
            section_data['section_end'] = section_ids[-1] if isinstance(section_ids, list) else section_ids
            section_data['vector'] = vector
            section_data['sections'] = {int(subsection_id):subsection_text for subsection_id, subsection_text in document['sections'].items() if int(subsection_id) in section_ids}
            split_sections.append(section_data)
        all_sections.extend(split_sections)
    return all_sections

In [11]:
batch_size = 200
vector_batch_size = 3
max_word_count = 1000
save_folder = Path('../wonky_data/index_data/sections')
save_folder.mkdir(parents=True, exist_ok=True)

In [None]:
for start_idx in tqdm(range(800, len(documents), batch_size)):
    batch_data = documents[start_idx:start_idx+batch_size]
    batch_sections = vectorize_sections(batch_data, max_word_count=max_word_count, batch_size=vector_batch_size)
    with gzip.open(save_folder.joinpath(f"vectorized_sections_{start_idx:04}_{start_idx + batch_size:04}.json.gz"), 'wt') as f:
        json.dump(batch_sections, f)

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [13]:
len(batch_sections)

1761