# Vector store example 
from https://bravecourses.circle.so/c/lekcje-programu/c03l03-wyszukiwanie-i-bazy-przetwarzanie-dlugich-dokumentow 
& https://github.com/i-am-alice/2nd-devs/blob/main/21_similarity/helpers.ts

# Similarity Search 

## Step 1 
- load data & 
- chunk it 
- create store 
or load previously created store 

This is an example of how to load a simple text document into a Chroma DB and save it to disk. 
Then you can retrieve it later or create a new DB. 

#### what is OpenAIEmbeddings class ? 
wrapper around OpenAI Embeddings API 
purpose:  OpenAIEmbeddings class in LangChain uses OpenAI's API to generate embeddings for the input text
https://github.com/langchain-ai/langchain/issues/12314

- first encodes the texts into tokens using the tiktoken package
- tokens are then split into chunks of a maximum length specified by the embedding_ctx_length attribute
- chunks are sent to the OpenAI API in batches of a size specified by the chunk_size attribute
- API response is then processed to extract the embeddings
    - If the skip_empty attribute is set to True, any empty embeddings are skipped
    - Otherwise an error is thrown
- Finally, the method averages the embeddings for each text, normalizes them, and returns them as a list.


In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import dotenv
import codecs
from os import environ

# Prepare the OpenAI API key: load variables from the local .env file.
env_file = './.env'
# override=True lets values from the .env file replace variables already set in the environment.
dotenv.load_dotenv(env_file, override=True)
# Presumably a sanity check that the .env file was actually read (MY_VAR is not used elsewhere here).
print('MY_VAR = ', environ.get('MY_VAR'))
# Embeddings client built with the key read from the environment.
embedding_function=OpenAIEmbeddings(api_key=environ.get('OPENAI_API_KEY'))


def prepare_documents(file_path):
    """Read a UTF-8 text file and split it into paragraph chunks.

    Paragraphs are separated by blank lines. Chunks that are empty or contain
    only whitespace (e.g. produced by a trailing blank line) are dropped, since
    they carry no content to embed.

    Args:
        file_path: Path to the document; only ``.txt`` files are handled.

    Returns:
        A list of paragraph strings, or ``None`` when ``file_path`` does not
        end with ``.txt`` (unsupported file type).
    """
    if not file_path.endswith(".txt"):  # assuming the documents are text files
        return None

    # Built-in open() translates "\r\n" to "\n" in text mode; codecs.open() does
    # not, so files with Windows line endings were never split on "\n\n".
    with open(file_path, 'r', encoding='utf8') as f:
        raw_document = f.read()

    return [chunk for chunk in raw_document.split("\n\n") if chunk.strip()]

def get_vector_store(load_new_chroma: bool=True,
                     persist_directory: str='various/vector_store_db',
                     source_path: str='various/vector-store-example.txt') -> Chroma:
    """Return a Chroma vector store, reusing the persisted one when possible.

    Args:
        load_new_chroma: When True, always rebuild the store from ``source_path``.
            When False, reuse the persisted store if it already holds documents
            and rebuild only if it is empty.
        persist_directory: Directory where Chroma keeps its data on disk.
        source_path: Text file whose paragraphs are embedded when the store is
            (re)built.

    Returns:
        The loaded or newly created ``Chroma`` store.
    """
    existing_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)

    # Chroma.get() returns a dict (keys such as 'ids', 'documents', ...), so len()
    # of the dict itself is never 0; count the stored ids instead.
    if not load_new_chroma and len(existing_db.get()['ids']) > 0:
        print('Loaded existing vector store')
        return existing_db

    # Drop whatever is already persisted; otherwise from_texts() appends to the
    # existing collection and every rebuild duplicates the documents.
    existing_db.delete_collection()

    # NOTE: chunks come from prepare_documents() (paragraph split on blank lines).
    # An earlier attempt with TextLoader + RecursiveCharacterTextSplitter(chunk_size=10000)
    # returned the file as a single chunk: that splitter only splits text that is
    # longer than chunk_size, so a much smaller chunk_size would be needed here.
    documents = prepare_documents(source_path)
    db = Chroma.from_texts(documents, embedding_function, persist_directory=persist_directory)
    print('Created new vector store')
    return db


#db = get_vector_store(True)
    

In [None]:

### DOESN'T WORK ########

### ANY ATTEMPT TO WORK WITH THE LANGCHAIN SPLITTER FAILS - DON'T USE IT

## leaving it here to remember that it doesn't work :)


# from langchain.text_splitter import RecursiveCharacterTextSplitter

# r_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
#     chunk_overlap=0,separators=["\n"])

# test = """a\nbcefg\nhij\nk"""
# print(len(test))
# tmp = r_splitter.split_text(test)
# print(tmp)


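### That doesn't work: it only splits the document when chunk_size is small (e.g. 100); otherwise the separators seem to be ignored (likely because the splitter only splits text that is longer than chunk_size).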

## Step 2 Now we need to search Chroma db with some queries

- similarity_search takes `k` (top-K), which defines how many elements are returned 
- it can be helpful to return more and then filter / group them, e.g. by tags 
- filtering can be done right in the search, because most vector DBs allow us to pass an object that defines the metadata to match 

In [None]:
query = 'Do you know the name of Adams dog?'
vector_store = get_vector_store(False)
# Example metadata filter. It is not applied below: the texts stored by
# get_vector_store() are added without metadata, so filtering on "page_title"
# would match nothing.
filters = {"page_title": "Adam"}
# The Python API names these parameters `k` and `filter` (`topK` / `filters` are
# not accepted), and k must be at least 1 to get any hit back.
found_docs = vector_store.similarity_search_with_score(query, k=1, filter=None)
print(found_docs)
# Each hit is a (Document, score) tuple, hence the double index:
#print(f"This will be my context for AI {found_docs[0][0].page_content}")

## Another example: loading documents from code, not from a text file 

#### Something I don't get about Chroma 
is that it seems to persist its objects even after a restart, 
and when a new run is done the documents are doubled...  

This is a simple example where the data is in documents and is not partitioned. 

In [None]:
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import dotenv
from os import environ
# Prepare the OpenAI API key: load variables from the local .env file.
env_file = './.env'
dotenv.load_dotenv(env_file, override=True)

# Minimal Document: page_content holds the text, metadata holds arbitrary tags.
doc = Document(
    page_content="text",
    metadata={"source": "local"},
)
print(f" this is doc:  {doc.page_content}")

my_personal_docs = [
    Document(page_content="Charles is a xmen."),
    Document(page_content="Charles has a Cerebro czy jakos tak."),
    Document(page_content="Charles is also a proper wheeler."),
]

# persist_directory expects an optional path string; pass None (not False)
# when no on-disk location should be configured.
vector_store_3 = Chroma.from_documents(
    my_personal_docs,
    embedding=OpenAIEmbeddings(api_key=environ.get('OPENAI_API_KEY')),
    persist_directory=None,
)

# k=2: ask for the two most similar documents.
result = vector_store_3.similarity_search("What does Charles do?", k=2)

print(f"vector store result: {result}")

## This is an example where the data is partitioned

- the 2nd chunk of data is not returned despite extending the result count to 3 
- that needs to be addressed (to be done in the next lessons)


In [1]:
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from os import environ

# One sentence is deliberately split across the first two documents to show how
# similarity search treats partitioned data.
my_personal_docs = [
    Document(page_content="Charles is a mutant who specializes in mind controll"),
    Document(page_content="with a particular focus on blowing those minds."),
    Document(page_content="Charles has an nice spaceship."),
    Document(page_content="Charles is also a proper wheeler."),
]

# persist_directory expects an optional path string; pass None (not False)
# when no on-disk location should be configured.
vector_store_3 = Chroma.from_documents(
    my_personal_docs,
    embedding=OpenAIEmbeddings(api_key=environ.get('OPENAI_API_KEY')),
    persist_directory=None,
)

# k=3: ask for the three most similar documents.
result = vector_store_3.similarity_search("What does Charles do?", k=3)

print(f"vector store result: {result}")
# vector store result: [
    # Document(page_content='Charles is a mutant who specializes in mind controll'),
    # Document(page_content='Charles is also a proper wheeler.'),
    # Document(page_content='Charles has an nice spaceship.')]
# The "blowing those minds" chunk is missing, although it continues the first
# document's sentence and should be returned together with it.

 this is doc:  text
vector store result: [Document(page_content='Charles is a mutant who specializes in mind controll'), Document(page_content='Charles is also a proper wheeler.'), Document(page_content='Charles has an nice spaceship.')]


# Wyszukiwanie hybrydowe

- HSRAG Hybrid Search and Retrieval Augmented Generation.
    - this joins normal DB searching with vector DB searching 
- 
