In [1]:
# Scrape, parse, load (initial to db), embed/preprocess, package, chroma load, retrieve
# Query, preprocess, retrieve, rerank, final retrieve
# TODO: go back and add logging to every step

# TODO: introduce separate concepts for a vector loader vs. a text loader
import os
from pathlib import Path
from typing import Dict, List

from icesrag.utils.embed.strategy_pattern import EmbeddingEngine
from icesrag.utils.embed.base_embedders import SentenceTransformEmbedder

from icesrag.utils.text_preprocess.strategy_pattern import TextPreprocessingEngine
from icesrag.utils.text_preprocess.bm25_preprocess import BM25PreProcess

from icesrag.load.package.strategy_pattern import PackageEngine
from icesrag.load.package.chroma_packager import PackageChroma
from icesrag.load.package.sqlite_packager import PackageSQLite

from icesrag.load.store.strategy_pattern import DatabaseEngine
from icesrag.load.store.chromadb import ChromaDBStore
from icesrag.load.store.sqlitedb import SQLiteDBStore

from icesrag.load.pipeline.loader import CompositeLoader

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Smoke test for the composite load pipeline: two sample chunks are handed to
# a CompositeLoader configured with a vector strategy (ChromaDB store) and a
# BM25 text strategy (SQLite store).

# --- Sample input ---
chunks = ['This is the third test sentence.', 'Here goes another one.']
metadatas = [{'test_number': 3}, {'test_number': 4}]

# --- Storage location ---
# The database files were previously written to a path hard-coded to one
# user's Windows profile, which tied the notebook to that machine. The
# directory now comes from the ICESRAG_DB_DIR environment variable and falls
# back to the current user's Downloads folder (the same location as before
# for the original author).
DB_DIR = Path(os.environ.get("ICESRAG_DB_DIR", Path.home() / "Downloads"))
DB_DIR.mkdir(parents=True, exist_ok=True)

# One clearly named path per store instead of reassigning a shared `dbpath`.
# Paths are passed as plain strings, exactly as the original cell did.
vanilla_dbpath = str(DB_DIR / "vanillatest.db")
bm25_dbpath = str(DB_DIR / "ragtest.db")
collection_name = 'test'

# --- Vector ("vanilla") strategy components ---
vanilla_embedder = EmbeddingEngine(SentenceTransformEmbedder())
vanilla_packager = PackageEngine(PackageChroma())
vanilla_store = DatabaseEngine(ChromaDBStore())
vanilla_store.connect(vanilla_dbpath, collection_name)

# --- BM25 text strategy components ---
bm25_preprocessor = TextPreprocessingEngine(BM25PreProcess())
bm25_packager = PackageEngine(PackageSQLite())
bm25_store = DatabaseEngine(SQLiteDBStore())
bm25_store.connect(bm25_dbpath, collection_name)

# Backward compatibility: the original cell left `dbpath` bound to the BM25
# database path, so keep that binding in case later cells read it.
dbpath = bm25_dbpath

# --- Strategy configuration consumed by CompositeLoader ---
strategies = [
    {
        'name': 'vanilla',
        'type': 'vector',
        'embed': vanilla_embedder,
        'package': vanilla_packager,
        'store': vanilla_store,
    },
    {
        'name': 'bm25',
        'type': 'text',
        # TODO: allow preprocessing to be a list of steps
        'preprocess': bm25_preprocessor,
        'package': bm25_packager,
        'store': bm25_store,
    },
]

loader = CompositeLoader(strategies=strategies)
loader.prepare_load(chunks, metadatas)