In [None]:
from sentence_transformers import SentenceTransformer
import lancedb
from pathlib import Path
import torch
from copy import deepcopy
import json
import gzip
import re

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Choose the best available accelerator instead of assuming Apple-silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    encoder_device = 'mps'
elif torch.cuda.is_available():
    encoder_device = 'cuda'
else:
    encoder_device = 'cpu'

# Sentence encoder used further down to embed every passage.
encoder = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', device=encoder_device, trust_remote_code=True)

In [None]:
def format_chunks(chunks):
    """Flatten one group of parsed chunks into a single plain-text passage.

    Each chunk is a dict with a ``'type'`` and a ``'content'`` key. Chunks are
    handled by type:

    * types containing ``'heading'`` (expected form ``heading_<level>``) become
      a markdown heading line with ``<level>`` leading ``#`` characters;
    * types containing ``'table'`` are dropped;
    * everything else is treated as body text: table-of-contents entries
      (content containing ``'#_Toc'``) are dropped, URLs and markdown link
      punctuation are removed, and whitespace is collapsed.

    Args:
        chunks: Iterable of chunk dicts belonging to one passage.

    Returns:
        The formatted pieces joined with newlines (``''`` for no chunks).

    Raises:
        IndexError / ValueError: If a heading type does not carry a numeric
            level after the first underscore.
    """
    markdown_text = []
    for chunk in chunks:
        chunk_type = chunk['type']
        if 'heading' in chunk_type:
            heading_strength = int(chunk_type.split('_')[1])
            markdown_text.append("#" * heading_strength + ' ' + chunk['content'].strip() + '\n')
        elif 'table' in chunk_type:
            # `elif` matters: with a plain `if`, heading chunks fell through to
            # the `else` branch below and were emitted a second time as body text.
            continue
        else:
            raw_text = chunk['content'].strip()
            if '#_Toc' in raw_text:
                continue
            # Drop the URL (http or https) up to the next ')' but keep that
            # parenthesis, so a markdown link "[text](url)" becomes "[text]()"
            # and the empty-parentheses cleanup below can remove it.
            raw_text = re.sub(r'https?:[\w\W]+?\)', ')', raw_text)
            raw_text = re.sub(r'(\[|\])', ' ', raw_text)
            raw_text = re.sub(r'(\(\))', ' ', raw_text)
            raw_text = re.sub(r'(\s+)', ' ', raw_text.strip())
            markdown_text.append(raw_text.strip())
    return '\n'.join(markdown_text)

In [None]:
# Folder that is scanned for the parsed input files.
source_folder = Path('reports/parsed')
# Presumably the destination for the computed vectors; not used in the cells
# visible here -- TODO confirm.
vector_file = Path('reports/vectors')

In [None]:
# Collect the input files in a stable (sorted) order so that source_files[0]
# is the same file on every run. Files ending in '.gz' are skipped because the
# expanded output is written as a '.gz' file into this same folder; on a re-run
# the glob would otherwise pick it up and json.load() could fail on it.
source_files = sorted(
    path for path in source_folder.glob('*.*') if path.suffix != '.gz'
)
if not source_files:
    raise FileNotFoundError(f"No input files found in {source_folder}")
source_files[0]

In [None]:
# Show just the file name (without its folder) of the first input file.
source_files[0].name

In [None]:
# Load the first input file. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's locale default.
with open(source_files[0], 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Number of top-level records loaded from the file.
len(data)

In [None]:
# Peek at the first record to see its structure.
data[0]

In [None]:
# Document-level keys that are carried over onto every passage.
metadata_fields = [
    'id',
    'type',
    'typeId',
    'number',
    'active',
    'source',
    'topics',
    'version_id',
    'date',
    'retrieved_date',
    'title',
    'summary',
    'source_file',
]

In [None]:
def split_chunks(document, metadata_fields):
    """Turn one document into a list of passage records.

    Args:
        document: Dict holding document-level fields plus a ``'chunks'`` entry,
            which is a sequence of chunk groups; every chunk in a group has a
            ``'doc_index'`` key.
        metadata_fields: Names of the document-level keys to copy onto each
            passage.

    Returns:
        One dict per chunk group, containing a deep copy of the selected
        metadata plus ``'passage_text'`` (the group formatted by
        ``format_chunks``), ``'sections'`` (a deep copy of the group) and the
        smallest / largest ``'doc_index'`` as ``'start_index'`` /
        ``'end_index'``.
    """
    shared_meta = {key: document[key] for key in document if key in metadata_fields}
    records = []
    for group in document['chunks']:
        positions = [section['doc_index'] for section in group]
        first_index, last_index = min(positions), max(positions)
        text = format_chunks(group)
        # Deep copies keep each passage independent of the source document
        # and of the other passages.
        record = deepcopy(shared_meta)
        record.update(
            passage_text=text,
            sections=deepcopy(group),
            start_index=first_index,
            end_index=last_index,
        )
        records.append(record)
    return records

In [None]:
# Spot check: show the second passage of the 100th record from the end.
# This needs at least 100 records in `data` and at least two passages in that
# record; otherwise the indexing raises IndexError.
split_chunks(data[-100], metadata_fields)[1]

In [None]:
# Expand every document into its passages and gather them in one flat list.
all_chunks = []
for document in tqdm(data):
    passages = split_chunks(document, metadata_fields)
    all_chunks += passages

In [None]:
# Derive the output name from the input name and write all passages as
# gzip-compressed JSON. Note that the file lands in the input folder itself.
extended_name = source_files[0].name.replace('parsed','expanded')
with gzip.open(source_folder / (extended_name + '.gz'), 'wt') as f:
    json.dump(all_chunks, f)

In [None]:
# Show the derived output file name (before the '.gz' suffix is appended).
extended_name

In [None]:
# Embed every passage of every document and collect the results.
# Passages are re-derived with split_chunks() so the 'vector' key is added to
# fresh dicts rather than to the records already stored in all_chunks.
#
# NOTE(review): the nomic-embed-text-v1.5 model card describes task prefixes
# (e.g. "search_document: ") for inputs -- confirm whether passage_text should
# carry one before these vectors are used for retrieval.
vectorized_chunks = []
# Only clear the MPS allocator cache when the MPS backend actually exists;
# the check is hoisted because it cannot change while the loop runs.
mps_available = torch.backends.mps.is_available()
for document in tqdm(data):
    passages = split_chunks(document, metadata_fields)
    for passage in passages:
        passage['vector'] = encoder.encode(passage['passage_text'])
        if mps_available:
            torch.mps.empty_cache()
    vectorized_chunks.extend(passages)

In [None]:
# Inspect the last passage handled by the vectorization loop; the loop
# variable is still in scope after the loop finishes.
passage