## Get local db

In [2]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import openai
from dotenv import dotenv_values


env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [3]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import utils.chroma as chom

In [15]:
file_path = '../data/sec.pdf'
pdftexts = chom.pdf_reader(file_path)
# pdftexts[0]

In [41]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# embedded_documents = []
# for code_content in pdftexts:
#     embedded_document = embeddings.embed_documents([code_content])
#     embedded_documents.append(embedded_document)

# embeddings
vectordb= Chroma(embedding_function=embeddings, collection_name="core")

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7208ec2999c0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7208eebb9ff0>, model='text-embedding-3-small', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

## We are going to do a Naive RAG.

## Remember:

- R -> Retrieval
- A -> Augmented
- G -> Generation

# Retrieval

In [33]:
naive_retriever = vectordb.as_retriever(search_kwargs={ "k" : 10})

# Similarity score threshold retrieval
# naive_retriever = db.as_retriever(search_kwargs={"score_threshold": 0.8}, search_type="similarity_score_threshold")

# Maximum marginal relevance retrieval
# naive_retriever = db.as_retriever(search_type="mmr")
naive_retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7208ef1ff220>, search_kwargs={'k': 10})

# Augmented

In [34]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, in one line of sentence. 

If you do not know the answer, or are unsure, say you don't know.

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

# Generation

In [35]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

## Finally, we are going to create a Rag Chain. For that, we are going to use LCEL (LangChain Expression Language)
If you want to learn more about LCEL, check this good tutorial: https://www.youtube.com/watch?v=O0dUOtOIrfs

### The simplest way to use LCEL

In [39]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": naive_retriever })
output_parser = StrOutputParser()


naive_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


naive_retrieval_chain.invoke("Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?")


'Sellers are typically responsible for a breach of representations and warranties to the extent outlined in the sales agreement.'

In [27]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

question = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
retrieval_input = [{'context': 'Your retrieval context goes here'}, {'context': 'Another retrieval context goes here'}]

# Define a retriever function that accepts a list as input
def retrieve_fn(inputs):
    return inputs  # Return the input list as is

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": retrieve_fn})
output_parser = StrOutputParser()

retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser

output = retrieval_chain.invoke({"question": question, "context": retrieval_input})
print(output)

I don't know.


In [None]:
naive_retrieval_chain.invoke("In what situation does the Sellers have liability for breach of representations and warranties?")

### A little more complex to use LCEL

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": itemgetter("question") |  RunnablePassthrough(), "context": itemgetter("question") |  naive_retriever })
output_parser = StrOutputParser()


naive_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


naive_retrieval_chain.invoke({"question" : "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"})

'Sellers are responsible for breach of representations and warranties if the representations are untrue or warranties are not fulfilled; extent of responsibility depends on the specific terms of the agreement.'

### A little X2 more complex to use LCEL

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": itemgetter("question") |  RunnablePassthrough(), "context": itemgetter("question") }) | RunnablePassthrough.assign(context=itemgetter("context"))
output_parser = StrOutputParser()


naive_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


naive_retrieval_chain.invoke({"question" : "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"})

'The Sellers are responsible for a breach of representations and warranties when the statements made are false or misleading, to the extent outlined in the contract.'

### A little X3 more complex to use LCEL

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

setup_and_retrieval = RunnableParallel({"question": itemgetter("question") |  RunnablePassthrough(), "context": itemgetter("question") | naive_retriever }) | RunnablePassthrough.assign(context=itemgetter("context"))

naive_retrieval_chain = setup_and_retrieval | {"response": rag_prompt | chat_model, "context": itemgetter("context")}


naive_retrieval_chain.invoke({"question" : "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"})

{'response': AIMessage(content='Sellers are responsible for breach of representations and warranties if they are proven to have knowingly provided false information, and the extent of their responsibility can vary based on the specifics of the situation.', response_metadata={'token_usage': {'completion_tokens': 37, 'prompt_tokens': 61, 'total_tokens': 98}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3b956da36b', 'finish_reason': 'stop', 'logprobs': None}, id='run-aef70a5a-03b0-4e70-b2d9-ce32d59442de-0'),
 'context': []}

In [None]:
naive_retrieval_chain.invoke({"question" : "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"})

{'response': AIMessage(content='Sellers are responsible for a breach of representations and warranties when they provide inaccurate or false information about the product or service being sold. The extent of their responsibility depends on the terms outlined in the contract.', response_metadata={'token_usage': {'completion_tokens': 40, 'prompt_tokens': 61, 'total_tokens': 101}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3b956da36b', 'finish_reason': 'stop', 'logprobs': None}, id='run-7aeaaf76-8da5-48e6-82c0-72d703c66575-0'),
 'context': []}