In [1]:

%pip install -U langchain-text-splitters langchain-openai langchain-community chromadb pypdf

# Método do langchain para ajudar a fazer a separação dos documentos em chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Embeddings e LLM
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
# Banco vetorial
from langchain_community.vectorstores import Chroma
# Carregar arquivo PDF
from langchain_community.document_loaders import PyPDFLoader
# Prompt para juntar a pergunta relevante com o Chunk
from langchain.chains.question_answering import load_qa_chain



Collecting certifi (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.126->langchain-core<1.0.0,>=0.3.51->langchain-text-splitters)
  Using cached certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Collecting idna (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.126->langchain-core<1.0.0,>=0.3.51->langchain-text-splitters)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests<3,>=2->langsmith<0.4,>=0.1.126->langchain-core<1.0.0,>=0.3.51->langchain-text-splitters)
  Using cached urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=1.68.2->langchain-openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Using cached urllib3-2.4.0-py3-none-any.whl (128 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Using cached certifi-2025.4.26-py3-none-any.whl (159 kB)
Installing collected packages: urllib3, idna, distro, certifi
[2K   [90m━━━━━━━

In [2]:
import os

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# Fail early with an actionable message: when the key is missing, os.getenv returns None
# and assigning None into os.environ raises an opaque TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


In [4]:
# Instantiate the two OpenAI-backed models used by later cells:
# one for embeddings and one chat LLM.
embeddings_model = OpenAIEmbeddings()
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    max_tokens=200,
)


In [5]:
# Load the PDF
pdf_link = "EntendendoAlgoritmos.pdf"
loader = PyPDFLoader(pdf_link, extract_images=False)
# Use load() rather than load_and_split(): the latter already chunks the text with a
# default splitter, so the documents would be split twice (here and in the chunking cell)
# and the `start_index` metadata added there would be relative to those pre-chunks
# instead of the loaded page documents.
pages = loader.load()


In [6]:
# Split the loaded documents into chunks (pieces of documents).
# Chunk length is measured with `len`, and each chunk records its start index.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)

chunks = text_splitter.split_documents(pages)

In [7]:
# Embed the chunks and store them in a Chroma vector DB. This creates a folder named
# "text_index". It may take a while.
# No explicit db.persist() call: manual persistence is deprecated (it only produced a
# deprecation warning here) because Chroma 0.4+ writes to persist_directory automatically.
# NOTE(review): re-running this cell against an existing "text_index" presumably adds the
# same chunks again — confirm, and delete the folder before rebuilding if so.
db = Chroma.from_documents(chunks, embedding=embeddings_model, persist_directory="text_index")


  db.persist()


In [8]:
# Open the vector store persisted under "text_index".
vector_db = Chroma(
    persist_directory="text_index",
    embedding_function=embeddings_model,
)
# Retriever that fetches the most relevant chunks from vector_db (here, the top 3).
retriever = vector_db.as_retriever(search_kwargs={"k": 3})
# Build the prompt chain used to call the LLM.
chain = load_qa_chain(llm, chain_type="stuff")

  vector_db = Chroma(persist_directory="text_index", embedding_function=embeddings_model)
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [9]:
def ask(question: str) -> str:
    """Answer a question using chunks retrieved from the vector store.

    Fetches the relevant documents for `question` from the module-level `retriever`,
    passes them together with the question to the module-level QA `chain`, and
    returns the chain's "output_text" value.

    Args:
        question: The user's question.

    Returns:
        The text answer produced by the chain.
    """
    # `invoke` replaces the deprecated `get_relevant_documents` and `chain(...)` call styles.
    context = retriever.invoke(question)
    result = chain.invoke(
        {"input_documents": context, "question": question},
        return_only_outputs=True,
    )
    return result["output_text"]

In [12]:
# Read one question from the user, answer it with ask(), and print the answer.
user_question = input("User: ")
answer = ask(user_question)
print("Answer: ", answer)

Answer:  Sim, o nome do segundo capítulo deste livro é "Você aprenderá duas estruturas de dados fundamentais: arrays e listas encadeadas".
