# Carregando chaves de api

In [3]:
from dotenv import load_dotenv
import os

def carregar_chaves_api():
    load_dotenv()

    groq_api_key = os.getenv('GROQ_API_KEY')
    lang_smith_tracing = os.getenv('LANGSMITH_TRACING')
    lang_smith_api_key = os.getenv('LANGSMITH_TRACING')


def imprimir_chaves_api():
    print(f"GROQ_API_KEY: {groq_api_key}")
    print(f'LANGSMITH_TRACING: {lang_smith_tracing}')
    print(f'LANGSMITH_API_KEY: {lang_smith_api_key}')


carregar_chaves_api()

# Definindo e chamando modelos que vão ser usados

In [5]:
from langchain_groq import ChatGroq

class Modelos:
    def __init__(self):
        self.models = {
            'google_gemma': 'gemma2-9b-it',
            'meta_llama': 'llama-3.3-70b-versatile',
            'deep_seek': 'deepseek-r1-distill-llama-70b-specdec'
        }

        self.google_llm = ChatGroq(model=self.models['google_gemma'])
        self.meta_llm = ChatGroq(model=self.models['meta_llama'])
        self.deep_seek_llm = ChatGroq(model=self.models['deep_seek'])

    def imprimir_llms_carregadas(self):
        print(f'GOOGLE_LLM: {self.google_llm}\n')
        print(f'META_LLM: {self.meta_llm}\n')
        print(f'DEEP_SEEK_LLM: {self.deep_seek_llm}\n')

new_obj = Modelos()
new_obj.imprimir_llms_carregadas()

GOOGLE_LLM: client=<groq.resources.chat.completions.Completions object at 0x7fda1903c470> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7fda1903d520> model_name='gemma2-9b-it' model_kwargs={} groq_api_key=SecretStr('**********')

META_LLM: client=<groq.resources.chat.completions.Completions object at 0x7fda1903e6f0> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7fda1903f8c0> model_name='llama-3.3-70b-versatile' model_kwargs={} groq_api_key=SecretStr('**********')

DEEP_SEEK_LLM: client=<groq.resources.chat.completions.Completions object at 0x7fda19058b00> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7fda19059ca0> model_name='deepseek-r1-distill-llama-70b-specdec' model_kwargs={} groq_api_key=SecretStr('**********')



# Carregar Lista de PDFS dos Prospectos

In [7]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Lista para armazenar os documentos carregados
diretorio_base='../data/minuta/'
docs_list = []
i = 1

# Itera sobre os arquivos no diretório e carrega os PDFs
for nome_arquivo in os.listdir(diretorio_base):
    if nome_arquivo.endswith(".pdf"):  # Verifica se o arquivo é um PDF
        caminho_pdf = os.path.join(diretorio_base, nome_arquivo)

        loader = PyPDFLoader(caminho_pdf)

        print(f'loader: {loader}\n\n')
                
        docs = await loader.aload()

        print(f'docs_loader: {docs[0].page_content[:100]}\n\n')
        print(docs[0].metadata)
        
        docs_list.append(docs)

        print(i)
        # i += 1
    if i == 1:
        break


loader: <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x7fda1907b3e0>


docs_loader: ESTE DOCUMENTO É UMA MINUTA INICIAL SUJEITA A ALTERAÇÕES E COMP LEMENTAÇÕES, TENDO SIDO ARQUIVADO NA


{'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2020-10-21T17:41:47-03:00', 'author': '', 'moddate': '2020-10-21T17:44:00-03:00', 'title': '', 'source': '../data/minuta/uni_co_20201021_Minuta%20do%20Prospecto%20Preliminar.pdf', 'total_pages': 667, 'page': 0, 'page_label': '1'}
1


# Dividir Texto em partes menores

In [82]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100000,  # chunk size (characters)
    chunk_overlap=20000,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs_list[0])

print(len(all_splits))  # ler tamanho total de divisões
len(all_splits[0].page_content)  # ler tamanho total de divisões da pagina 0

all_splits[10].metadata   # metadados do doc

667


{'producer': 'Acrobat Distiller 10.0.0 (Windows)',
 'creator': 'PScript5.dll Version 5.2.2',
 'creationdate': '2020-10-21T17:41:47-03:00',
 'author': '',
 'moddate': '2020-10-21T17:44:00-03:00',
 'title': '',
 'source': '../data/minuta/uni_co_20201021_Minuta%20do%20Prospecto%20Preliminar.pdf',
 'total_pages': 667,
 'page': 10,
 'page_label': '11',
 'start_index': 0}

# Instanciando Embedding

In [38]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

print(embeddings)

model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False


# Usando FAISS para armazenar as incorporacoes

In [None]:
from langchain_community.vectorstores import FAISS

vector_store = FAISS.from_documents(all_splits, embeddings)

print(vector_store[:1])

# Usando InMemoryVectorStore para armazenar os embeddings

In [168]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

# print(all_splits[0])
document_ids = vector_store.add_documents(documents=all_splits[:100])

print(document_ids)

['b1727ccc-37a0-454a-b37b-74082e20cc20', '79b23d89-2ad0-4b0c-a3e3-a0a60fba4588', '26f6cd78-49ca-4928-9585-cd264a49b880', 'f414b9c0-76bf-47ee-9f32-b178d9d5229c', 'a78e5938-9ba4-4cb5-bfd8-494ba756296d', '04967d9e-d592-4569-a2db-47c48f4b0ca3', '08aaa934-e38a-4f05-a0db-db2e193c0cf5', 'fbbbb4a5-5fe3-4f1f-9c11-40a171afb04e', '2c0cd26a-d505-4b87-a4f3-f4ff1f55142d', '3ff3b87d-0143-4e1b-9ce3-fe3717e41035', '409d34bf-e9e7-47cd-91ad-2e7584350bf5', '3021a789-0a3d-477a-94e4-3dcbffba741e', '8ad07003-dafa-4695-a93c-6c375dd3b946', 'a1def3c1-57c9-44ec-9330-c21c35d17af0', '0eccfd57-cbdf-4347-a4fe-e695a934b074', '9b8ee828-1ed0-4465-add3-623831c2d4c7', 'e67288d0-8e1a-4ba8-a0f2-6ad58bb1ec8a', '68be18ba-ee42-4869-adde-1266dd36df23', '36e9a31c-ed65-4cf7-bb00-d031fa84177f', '93669d78-9c88-4ab7-8fd3-aa92f75347e2', '4a7aafbb-70d1-4738-a208-7ee59745cc05', '8d8f3879-7df1-49c2-99bc-4542d868c484', '79fa3052-5eb6-43ac-b0df-7c62e0d863b7', 'f0135224-65a8-47ca-9ab3-f86b7606bd15', 'b9b2d157-b55b-47e6-b228-0502684fa163',

In [170]:
from langchain import hub

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")



In [172]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = new_obj.google_llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [174]:
response = graph.invoke({"question": "que tipo de documento é este?"})
print(response["answer"])

This document is a prospectus for a company's initial public offering (IPO).  
It provides detailed information about the company, its financials, and the terms of the offering.  
Prospectuses are legal documents that must be provided to potential investors before they can buy shares in a company. 





In [176]:
response = graph.invoke({"question": "me diga de que empresa pertence"})
print(response["answer"])

The provided context doesn't state which company the information pertains to.  



In [178]:
response = graph.invoke({"question": "identifique alguma informação sobre a empresa ou algo semelhante"})
print(response["answer"])

A Uni.co S.A. é uma empresa que possui uma plataforma para operar franquias, com foco em preservar o DNA de cada marca.  

A plataforma oferece serviços como tecnologia para operação de franquias, rede global de sourcing, plataforma omnichannel e suporte de gestão. 



