In [None]:
# Overall flow: scrape -> parse -> load (initial, to db) -> embed/preprocess -> package -> chroma load -> retrieve
# Query flow:   query -> preprocess -> retrieve -> rerank -> final retrieve
# TODO: go back and add logging to everything

from typing import Dict, List

from icesrag.utils.embed.strategy_pattern import EmbeddingEngine
from icesrag.utils.embed.base_embedders import SentenceTransformEmbedder

from icesrag.utils.text_preprocess.strategy_pattern import TextPreprocessingEngine
from icesrag.utils.text_preprocess.bm25_preprocess import BM25PreProcess

from icesrag.load.package.strategy_pattern import PackageEngine
from icesrag.load.package.chroma_packager import PackageChroma
from icesrag.load.package.sqlite_packager import PackageSQLite

from icesrag.load.store.strategy_pattern import DatabaseEngine
from icesrag.load.store.chromadb import ChromaDBStore
from icesrag.load.store.sqlitedb import SQLiteDBStore

from icesrag.load.pipeline.loader import CompositeLoader

#################

from icesrag.retrieve.rerank.rrf import ReciprocalRerankFusion
from icesrag.retrieve.rerank.strategy_pattern import ReRankEngine

from icesrag.retrieve.retrievers.chroma import ChromaRetriever
from icesrag.retrieve.retrievers.sqlite import SQLiteRetriever
from icesrag.retrieve.retrievers.strategy_pattern import RetrieverEngine

from icesrag.retrieve.pipeline.retriever import CompositeRetriever

  from tqdm.autonotebook import tqdm, trange


### Load

In [2]:
# --- Sample corpus: two text chunks, each paired positionally with a metadata dict ---
chunks = [
    'This is the third test sentence.',
    'Here goes another one.',
]
metadatas = [
    {'test_number': 3},
    {'test_number': 4},
]

# --- Per-strategy processing components ---
# "vanilla" strategy: sentence-transformer embedder + Chroma packager.
vanilla_embedder = EmbeddingEngine(SentenceTransformEmbedder())
vanilla_packager = PackageEngine(PackageChroma())

# "bm25" strategy: BM25 text preprocessor + SQLite packager.
bm25_preprocessor = TextPreprocessingEngine(BM25PreProcess())
bm25_packager = PackageEngine(PackageSQLite())

# --- Backing stores ---
# Both stores connect with the same collection name; only the database path differs.
collection_name = 'test'

dbpath = r"C:\Users\hunte\Downloads\vanillatest.db"
vanilla_store = DatabaseEngine(ChromaDBStore())
vanilla_store.connect(dbpath, collection_name)

dbpath = r"C:\Users\hunte\Downloads\ragtest.db"
bm25_store = DatabaseEngine(SQLiteDBStore())
bm25_store.connect(dbpath, collection_name)

# --- Strategy descriptions handed to the composite loader ---
# NOTE(review): the 'type' entries are believed to be unused and could probably
# be removed -- confirm against CompositeLoader before dropping them.
vanilla_strategy = {
    'name': 'vanilla',
    'type': 'vector',
    'embed': vanilla_embedder,
    'package': vanilla_packager,
    'store': vanilla_store,
}
bm25_strategy = {
    'name': 'bm25',
    'type': 'text',
    # Idea: allow 'preprocess' to be a list of steps rather than a single engine.
    'preprocess': bm25_preprocessor,
    'package': bm25_packager,
    'store': bm25_store,
}
strategies = [vanilla_strategy, bm25_strategy]

# --- Run the load over the sample chunks and their metadata ---
loader = CompositeLoader(strategies=strategies)
loader.prepare_load(chunks, metadatas)

### Retrieve

In [38]:
# Both retrievers connect with the same collection name; only the database path differs.
collection_name = 'test'

# --- "vanilla" strategy: sentence-transformer embedder + Chroma retriever ---
vanilla_embedder = EmbeddingEngine(SentenceTransformEmbedder())
vanilla_retriever = RetrieverEngine(ChromaRetriever())
dbpath = r"C:\Users\hunte\Downloads\vanillatest.db"
vanilla_retriever.connect(dbpath, collection_name)

# --- "bm25" strategy: BM25 text preprocessor + SQLite retriever ---
bm25_preprocessor = TextPreprocessingEngine(BM25PreProcess())
bm25_retriever = RetrieverEngine(SQLiteRetriever())
dbpath = r"C:\Users\hunte\Downloads\ragtest.db"
bm25_retriever.connect(dbpath, collection_name)

# --- Reranker built on the reciprocal-rank-fusion strategy ---
reranker = ReRankEngine(ReciprocalRerankFusion())

# --- Strategy descriptions handed to the composite retriever ---
vanilla_strategy = {
    'name': 'vanilla',
    'embed': vanilla_embedder,
    'retriever': vanilla_retriever,
}
bm25_strategy = {
    'name': 'bm25',
    'preprocess': bm25_preprocessor,
    'retriever': bm25_retriever,
}
strategies = [vanilla_strategy, bm25_strategy]

retriever = CompositeRetriever(strategies, reranker)

[{'document_id': '1_2025-01-30 13:27:52',
  'fusion_score': 0.031754032258064516,
  'document': 'Here goes another one.',
  'metadata': {'test_number': 4}},
 {'document_id': '0_2025-01-30 13:27:52',
  'fusion_score': 0.03149801587301587,
  'document': 'This is the third test sentence.',
  'metadata': {'test_number': 3}}]

In [None]:
# Free-text question to run through the composite retriever.
query = "is there another one in here by chance?"

# Second positional argument is presumably the number of results to return -- TODO confirm.
# Left as the trailing expression so the notebook displays the returned value.
n_results = 2
retriever.retrieve(query, n_results)

In [3]:
import chromadb

# Persistent Chroma client rooted at the relative path "test"
# (chromadb.HttpClient() is the alternative noted by the author).
client = chromadb.PersistentClient(path="test")

# Author's note on return type: chromadb < 0.6.0 returns collection objects,
# chromadb >= 0.6.0 returns collection names.
collections = client.list_collections()
print(collections)

[]


In [1]:
import os
import streamlit as st

# Debug helper: report the working directory and whether Streamlit's
# project-level config file can be found relative to it.
# NOTE(review): st.write only renders inside an app started with `streamlit run`;
# confirm this cell is meant to live in a notebook rather than in the app script.
config_path = os.path.join(".streamlit", "config.toml")

st.write("Working dir:", os.getcwd())
# Show the resolved location being checked; previously this label was followed by
# the boolean from os.path.exists, which read as if True/False were a path.
st.write("Looking for config.toml at:", os.path.abspath(config_path))
st.write("config.toml exists:", os.path.exists(config_path))


2025-04-07 21:31:02.280 
  command:

    streamlit run c:\Users\samue\OneDrive\Desktop\Local Folder\icesrag\.venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
