In [1]:
from pathlib import Path
from dotenv import load_dotenv
import openai

from rag_diary.config import Config

# `_dh` is IPython's directory-history list; its first entry is used here as
# the folder of this notebook.
current_folder = Path(globals()["_dh"][0])

# The .env file is expected one level above the notebook folder.
load_dotenv(current_folder.parent.joinpath(".env"))

# Build the project configuration (after the .env file has been loaded) and
# hand its API key to the openai module.
config = Config()
openai.api_key = config.OPENAI_API_KEY

In [2]:
from rag_diary.vector_store_chromadb import VectorStoreChromadb, get_chromadb_collection
from langchain_openai.embeddings import OpenAIEmbeddings
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction


# On-disk Chroma database, addressed relative to the notebook folder.
# `current_folder` is already a Path, so it is used directly.
db_path = current_folder.joinpath("../private.db.chroma")

# Single embeddings client used by both stores built in this notebook; the API
# key and model name come from the project config.
embedding_function = OpenAIEmbeddings(
    api_key=config.OPENAI_API_KEY,
    model=config.OPENAI_MODEL_NAME,
)

# Collection name used further down by the LangChain-managed Chroma store.
collection_name = "langchain_retrival"

# Persistent Chroma client plus the project's own collection / store wrappers.
# Note that this collection is named after `config.DEFAULT_DB_NAME`, not
# `collection_name`.
client = chromadb.PersistentClient(path=str(db_path.absolute()))
collection = get_chromadb_collection(
    client,
    collection_name=config.DEFAULT_DB_NAME,
    embedding_function=embedding_function,
)
vector_store = VectorStoreChromadb(client=client, collection=collection)

In [3]:
# Toy corpus: a handful of short, unrelated sentences to index.
docs = [
    "Basquetball is a great sport.",
    "Fly me to the moon is one of my favourite songs.",
    "The Celtics are my favourite team.",
    "This is a document about the Boston Celtics",
    "I simply love going to the movies",
    "The Boston Celtics won the game by 20 points",
    "This is just a random text.",
    "Elden Ring is one of the best games in the last 15 years.",
    "L. Kornet is one of the best Celtics players.",
    "Larry Bird was an iconic NBA player.",
]

# One metadata dict per sentence, carrying its position in the list as "id".
doc_metadata = [{"id": position} for position, _ in enumerate(docs)]

# NOTE(review): the query output recorded below contains repeated entries,
# which suggests re-running this cell inserts the sentences again — confirm
# whether `add_multiple` de-duplicates.
vector_store.add_multiple(docs, doc_metadata)

In [4]:
# Query the project's vector store with a plain question string.
query = "what is the best game"
data = vector_store.query_by_str(query)

# Keep only the "document" field of each returned entry. This rebinds `docs`,
# which until now held the raw input sentences.
docs = []
for hit in data:
    docs.append(hit["document"])

docs

['Elden Ring is one of the best games in the last 15 years.',
 'Elden Ring is one of the best games in the last 15 years.',
 'Elden Ring is one of the best games in the last 15 years.',
 'Basquetball is a great sport.',
 'Basquetball is a great sport.',
 'Basquetball is a great sport.',
 'The Celtics are my favourite team.',
 'The Celtics are my favourite team.',
 'The Celtics are my favourite team.',
 'The Boston Celtics won the game by 20 points']

In [5]:
from langchain_community.vectorstores import Chroma

# Same toy corpus as the custom-store example; it is declared again because
# `docs` has since been rebound to query results.
texts = [
    "Basquetball is a great sport.",
    "Fly me to the moon is one of my favourite songs.",
    "The Celtics are my favourite team.",
    "This is a document about the Boston Celtics",
    "I simply love going to the movies",
    "The Boston Celtics won the game by 20 points",
    "This is just a random text.",
    "Elden Ring is one of the best games in the last 15 years.",
    "L. Kornet is one of the best Celtics players.",
    "Larry Bird was an iconic NBA player.",
]

# LangChain's Chroma wrapper, persisted in the same on-disk database directory
# as the custom store but under its own collection name.
db = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_function,
    persist_directory=str(db_path.absolute()),
)

# NOTE(review): no ids are passed, so re-running this cell presumably stores
# the texts a second time — confirm before relying on repeated runs.
db.add_texts(texts=texts)

retriever = db.as_retriever()
query = "What is the best game"

# Retrievers implement the Runnable interface; `invoke` is the supported
# replacement for the deprecated `get_relevant_documents`.
docs = retriever.invoke(query)
print(docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Document(page_content='Elden Ring is one of the best games in the last 15 years.'), Document(page_content='Basquetball is a great sport.'), Document(page_content='The Celtics are my favourite team.'), Document(page_content='I simply love going to the movies')]


In [6]:
from langchain_community.document_transformers import LongContextReorder

# Reorder the documents:
# Less relevant documents will be in the middle of the list and more
# relevant elements at the beginning / end.
reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

# Plain loop instead of a list comprehension: the prints are wanted only for
# their side effect, so there is no reason to build a list of None values.
for doc in reordered_docs:
    print(type(doc))

# Confirm that the 4 relevant documents are at beginning and end.
reordered_docs

<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>


[Document(page_content='Basquetball is a great sport.'),
 Document(page_content='I simply love going to the movies'),
 Document(page_content='The Celtics are my favourite team.'),
 Document(page_content='Elden Ring is one of the best games in the last 15 years.')]

In [9]:
from langchain.chains import LLMChain, StuffDocumentsChain
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate

# Name of the prompt slot that will receive the combined document text.
document_variable_name = "context"

# Prompt body with two placeholders: {context} and {query}.
# NOTE(review): the leading "# " on the inner lines is part of the string that
# gets formatted into the prompt — confirm this is intentional and not a
# leftover from commenting the block out.
template = """Given this text extracts:
# -----
# {context}
# -----
# Please answer the following question:
# {query}"""

prompt = PromptTemplate(
    input_variables=["query", document_variable_name],
    template=template,
)

  

In [10]:
# Each document is rendered into the combined context as just its text.
document_prompt = PromptTemplate(
    input_variables=["page_content"],
    template="{page_content}",
)

llm = OpenAI()

# The prompt given to the LLM chain must take `document_variable_name` as one
# of its input variables; the same name is passed to StuffDocumentsChain so the
# two cannot drift apart.
llm_chain = LLMChain(llm=llm, prompt=prompt)
chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name,
)

# All chain inputs travel in the single `input` mapping. The extra
# `query=query` keyword was not a chain input, so it is no longer passed.
chain.invoke(input={"query": "what is the best game", "input_documents": reordered_docs})

{'query': 'what is the best game',
 'input_documents': [Document(page_content='Basquetball is a great sport.'),
  Document(page_content='I simply love going to the movies'),
  Document(page_content='The Celtics are my favourite team.'),
  Document(page_content='Elden Ring is one of the best games in the last 15 years.')],
 'output_text': ' in the last 15 years?\n\nElden Ring is one of the best games in the last 15 years.'}

ValueError: Collection langchain_retrival does not exist.