In [None]:
import gzip
import json
import lancedb
import pandas as pd
from lancedb.pydantic import Vector, LanceModel, List
from pathlib import Path

from pydantic import BaseModel
# from typing import List
import pydantic

from tqdm.notebook import tqdm

In [None]:
import hashlib
def hash_text_to_id(source_string: str, n_digits: int = 10) -> str:
    """Derive a deterministic, fixed-width decimal ID from a piece of text.

    The text is encoded with ``str.encode()`` (UTF-8 by default) and hashed
    with SHA-256. The digest, read as one large integer, is reduced modulo
    ``10**n_digits`` and rendered as a zero-padded decimal string.

    Args:
        source_string: Text to derive the ID from.
        n_digits: Width of the returned ID in decimal digits.

    Returns:
        A string of ``n_digits`` decimal digits. Because the hash is reduced
        to ``n_digits`` digits, different inputs can map to the same ID.
    """
    digest_bytes = hashlib.sha256(source_string.encode()).digest()
    # Big-endian interpretation of the raw digest equals parsing its hex form.
    digest_value = int.from_bytes(digest_bytes, byteorder="big")
    return format(digest_value % (10 ** n_digits), f"0{n_digits}d")

In [None]:
# Identifier of the pipeline run to index; both input files are prefixed with it.
run_time = "20250723_1319"

# Input locations: the gzipped chunk vectors and the parsed document metadata.
vector_folder = Path('/Users/jameslittiebrant/Data/crs_reports/vectors')
parsed_folder = Path('/Users/jameslittiebrant/Data/crs_reports/parsed')
vector_data_path = vector_folder / f"{run_time}_vectors.json.gz"
metadata_file = parsed_folder / f"{run_time}_files.json"

# Output location: the LanceDB directory and the table to (re)build inside it.
index_folder = Path('/Users/jameslittiebrant/Data/Mycroft/indexes')
index_table_name = 'crs_reports'

In [None]:
# Read the per-document metadata JSON into a DataFrame. convert_dates=False
# switches off pandas' automatic date parsing, so date-like columns keep the
# values stored in the file (ChunkLanceModel declares its date fields as str).
metadata = pd.read_json(metadata_file, convert_dates=False)

In [None]:
# Load the chunk records from the gzip-compressed JSON file. Later cells
# iterate over `data` and treat every element as a dict.
# The encoding is given explicitly: in text mode gzip.open otherwise decodes
# with the platform's locale encoding, which is not UTF-8 everywhere.
with gzip.open(vector_data_path, 'rt', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Show the metadata frame as this cell's output for a quick visual check.
metadata

In [None]:
# Replace missing values in the 'summary' column with the empty string;
# ChunkLanceModel declares `summary` as a plain str.
metadata['summary'] = metadata['summary'].fillna('')

In [None]:
def look_up_metadata(document_citation, metadata):
    """Return the metadata of one document as a plain dict.

    Args:
        document_citation: Value to match against the ``id`` column.
        metadata: DataFrame of document metadata with an ``id`` column.

    Returns:
        A ``{column: value}`` dict for the first row whose ``id`` equals
        ``document_citation``. The ``id`` column itself is left out.

    Raises:
        IndexError: If no row matches. The exception type is unchanged from
            the previous behaviour, but the message now names the missing id
            instead of the bare "list index out of range".
    """
    matches = metadata[metadata['id'] == document_citation]
    if matches.empty:
        raise IndexError(
            f"no metadata row found for document id {document_citation!r}"
        )
    # Only the first match is returned, so only that row needs converting.
    return matches.drop(columns='id').head(1).to_dict(orient='records')[0]

In [None]:
# Enrich each chunk record in place with the metadata of the document it
# cites. Metadata keys overwrite same-named keys already on the record.
for chunk_record in tqdm(data):
    chunk_record.update(
        look_up_metadata(chunk_record['document_citation'], metadata)
    )

In [None]:
def _type_label(value):
    """Return a short, schema-style type name for ``value``."""
    # bool is checked before int because bool is a subclass of int.
    if isinstance(value, bool):
        return 'bool'
    if isinstance(value, str):
        return 'str'
    if isinstance(value, int):
        return 'int'
    if isinstance(value, float):
        return 'float'
    if isinstance(value, list):
        # Label the list by what it actually holds rather than assuming
        # strings, so a list of floats is reported as List[float].
        element_labels = sorted({_type_label(item) for item in value})
        if not element_labels:
            return 'List[?]'  # empty list: element type cannot be inferred
        return f"List[{' | '.join(element_labels)}]"
    return str(type(value))


# Print "<field>: <type>" for every key of the first record, as a starting
# point for writing the table schema by hand.
for key, value in data[0].items():
    print(f"{key}: {_type_label(value)}")

In [None]:
class ChunkLanceModel(BaseModel):
    """Row schema for the LanceDB table of document chunks.

    The model is converted with ``lancedb.pydantic.pydantic_to_schema`` when
    the table is created, so the field order and annotations below define the
    table's columns. Field meanings are not defined in this notebook; the
    grouping comments are assumptions to confirm against the input files.
    """
    # NOTE(review): these look like chunk-level fields coming from the
    # vectors file -- confirm.
    content: str
    type: str
    document_citation: str  # matched against the metadata `id` column during enrichment
    chunk_position: int
    element_ids: List[str]
    chunk_start: int
    chunk_end: int
    vector: Vector(768)  # fixed-size vector with 768 components
    # The metadata lookup drops its own `id` column before merging, so this
    # `id` is not overwritten by the document metadata.
    id: str
    # NOTE(review): the remaining fields are presumably document-level
    # metadata merged into every record -- verify against the metadata file.
    document_id: str
    number: str
    active: int
    source: str
    topics: List[str]
    version_id: str
    date: str
    retrieved_date: str
    title: str
    summary: str  # missing values are replaced with '' before the table is built
    source_file: str
    type_id: str

In [None]:
# Connect to the LanceDB database located at index_folder; the table is
# created through this connection.
index = lancedb.connect(index_folder)

In [None]:
# Convert the pydantic model to a table schema first, then build the table
# from the enriched records. mode='overwrite' replaces an existing table of
# the same name.
chunk_schema = lancedb.pydantic.pydantic_to_schema(ChunkLanceModel)
table = index.create_table(
    index_table_name,
    data=data,
    schema=chunk_schema,
    mode='overwrite',
)