In [22]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain_community.vectorstores import Chroma
import chromadb

from dotenv import load_dotenv
import os

In [23]:
# Connect to the Chroma server over HTTP on port 8000.
# 'localhost' is used as the destination: '0.0.0.0' is a bind-all listening
# address rather than a host to connect to, and 'localhost' matches the
# client created in the Chroma section of this notebook.
c_client = chromadb.HttpClient(host='localhost',port=8000)
# One-off collection setup, intentionally left disabled:
# c_client.get_or_create_collection('anpd', metadata={"hnsw:space": "cosine"})
# Display the collections the server reports.
c_client.list_collections()

[Collection(name=anpd)]

In [24]:
# Look up the 'anpd' collection and display its record count.
anpd_collection = c_client.get_collection('anpd')
anpd_collection.count()

111

In [4]:
# Load environment variables from the data-ingestion .env file; verbose=True
# asks python-dotenv to report problems with the file.
dotenv_file = "src/data_ingestion/.env"
load_dotenv(dotenv_file, verbose=True)

True

In [7]:
from glob import glob

In [9]:
# List the PDF files available for ingestion.
pdf_pattern = "src/data_ingestion/docs/*.pdf"
glob(pdf_pattern)

['src/data_ingestion/docs/web-guia-anpd-tratamento-de-dados-para-fins-academicos.pdf']

In [19]:
# Copy the project-specific OPEN_AI_KEY setting into OPENAI_API_KEY.
# os.getenv returns None when the variable is missing, and assigning None
# into os.environ fails with an opaque TypeError, so fail early with a
# message that names the missing setting instead.
open_ai_key = os.getenv("OPEN_AI_KEY")
if not open_ai_key:
    raise RuntimeError(
        "OPEN_AI_KEY is not set; define it in the environment or in the "
        ".env file before running this cell."
    )
os.environ["OPENAI_API_KEY"] = open_ai_key

In [20]:
# Split text into chunks of at most 1000 characters (measured with len),
# with 200 characters of overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
)

# Use the same working-directory-relative location as the .env and glob
# cells (which resolved successfully under src/data_ingestion/), rather than
# a bare "docs/..." path that only works from a different working directory.
pdf_path = "src/data_ingestion/docs/web-guia-anpd-tratamento-de-dados-para-fins-academicos.pdf"
loader = PyPDFLoader(pdf_path)
# Load the PDF and split it into chunked documents in one step.
docs = loader.load_and_split(text_splitter=splitter)

In [21]:
# Embedding function shared by the vector store below.
embedding_model_name = 'text-embedding-3-large'
embedding = OpenAIEmbeddings(model=embedding_model_name)

## Chroma

In [10]:
# HTTP client for the Chroma server listening on localhost:8000.
chroma_endpoint = {"host": 'localhost', "port": 8000}
c_client = chromadb.HttpClient(**chroma_endpoint)
# Optional maintenance steps, intentionally left disabled:
# c_client.reset()
# c_client.get_or_create_collection('anpd', metadata={"hnsw:space": "cosine"})
# Display the collections the server reports.
c_client.list_collections()

[Collection(name=anpd)]

In [7]:
# LangChain vector-store wrapper over the 'anpd' collection, using the
# HTTP client and embedding function defined in earlier cells.
db = Chroma(
    client=c_client,
    collection_name='anpd',
    embedding_function=embedding,
)

In [8]:
# Record count of the 'anpd' collection before adding documents.
anpd_collection = c_client.get_collection('anpd')
anpd_collection.count()

0

In [9]:
# Give every chunk a deterministic id (source path + position in `docs`).
# Without explicit ids each run stores the chunks again under fresh random
# ids, so re-running this cell would duplicate the whole document; with
# stable ids a re-run writes to the same records instead.
# NOTE(review): entries already stored under random ids are not removed.
chunk_ids = [
    f"{doc.metadata.get('source', 'doc')}-{position}"
    for position, doc in enumerate(docs)
]
_ = db.add_documents(docs, ids=chunk_ids)

In [10]:
# Display the collections the server reports after ingestion.
collections = c_client.list_collections()
collections

[Collection(name=anpd)]

In [11]:
# Record count of the 'anpd' collection after adding documents.
anpd_collection = c_client.get_collection('anpd')
anpd_collection.count()

116

## Chain

In [12]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [13]:
# Prompt text with two placeholders: {context} for the retrieved passages
# and {question} for the user's query.
template = (
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

In [14]:
# Chat prompt built from the template string defined above.
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to answer; the sampling temperature is set to 0.4.
sampling_temperature = 0.4
model = ChatOpenAI(temperature=sampling_temperature)

In [15]:
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents concatenated using "\\n\\n";
        an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = (doc.page_content for doc in docs)
    return separator.join(page_texts)

In [16]:
# Retrieval step: fetch documents from the vector store and flatten them
# into one context string; the incoming question is passed through as-is.
retriever = db.as_retriever()
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: inputs -> prompt -> chat model -> plain string output.
chain = chain_inputs | prompt | model | StrOutputParser()

question = "No caso de órgãos públicos a disponibilização de acesso a dados pessoais pode acontecer em docorrência do que?"
answer = chain.invoke(question)
print(answer)

No caso de órgãos públicos, a disponibilização de acesso a dados pessoais pode acontecer em decorrência do cumprimento de obrigação legal ou quando necessário à execução de políticas públicas.
