<a href="https://colab.research.google.com/github/ThaysonScript/projeto_rag_llama/blob/main/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carregando chaves de API

In [3]:
!pip install langchain-groq
!pip install langchain_community
!pip install langchain_text_splitter
!pip install langchain_huggingface
!pip install langgraph
!pip install python-dotenv
!pip install pypdf

Collecting langchain-groq
  Downloading langchain_groq-0.2.4-py3-none-any.whl.metadata (3.0 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading langchain_groq-0.2.4-py3-none-any.whl (14 kB)
Downloading groq-0.18.0-py3-none-any.whl (121 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain-groq
Successfully installed groq-0.18.0 langchain-groq-0.2.4
Collecting langchain_community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain_community)
  Downloading langchain_core-0.3.35-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.18 (from langchain_community)
  Downloading langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloadin

In [4]:
from google.colab import drive

# Mount Google Drive so files under /content/drive are readable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
import getpass
import os

from dotenv import load_dotenv

# SECURITY: the API key must never be written in the notebook. Any key that
# was previously committed here should be treated as leaked and revoked.

# Pick up GROQ_API_KEY from a local .env file when one exists
# (load_dotenv does not override variables that are already set).
load_dotenv()

# Fall back to an interactive, hidden prompt so the secret is neither stored
# in the notebook source nor echoed into a cell output.
if not os.environ.get('GROQ_API_KEY'):
    os.environ['GROQ_API_KEY'] = getpass.getpass('GROQ_API_KEY: ')

# Definindo e chamando modelos que vão ser usados

In [6]:
from langchain_groq import ChatGroq

class Modelos:
    """Holds one ChatGroq client per model used in this notebook.

    Attributes:
        models: mapping from a short alias to the Groq model identifier.
        google_llm: ChatGroq client built for the 'google_gemma' entry.
        meta_llm: ChatGroq client built for the 'meta_llama' entry.
        deep_seek_llm: ChatGroq client built for the 'deep_seek' entry.
    """

    def __init__(self):
        """Register the model identifiers and create a client for each one."""
        self.models = dict(
            google_gemma='gemma2-9b-it',
            meta_llama='llama-3.3-70b-versatile',
            deep_seek='deepseek-r1-distill-qwen-32b',
        )

        model_ids = self.models
        self.google_llm = ChatGroq(model=model_ids['google_gemma'])
        self.meta_llm = ChatGroq(model=model_ids['meta_llama'])
        self.deep_seek_llm = ChatGroq(model=model_ids['deep_seek'])

    def imprimir_llms_carregadas(self):
        """Print each loaded client, labelled, followed by a blank line."""
        labelled_clients = (
            ('GOOGLE_LLM', self.google_llm),
            ('META_LLM', self.meta_llm),
            ('DEEP_SEEK_LLM', self.deep_seek_llm),
        )
        for label, client in labelled_clients:
            print(f'{label}: {client}\n')

# Create the three ChatGroq clients once; `new_obj` is read again later by the
# `generate` step of the graph.
new_obj = Modelos()
# Show the configured clients as a quick sanity check.
new_obj.imprimir_llms_carregadas()

GOOGLE_LLM: client=<groq.resources.chat.completions.Completions object at 0x7c338fa91fd0> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7c338faa3210> model_name='gemma2-9b-it' model_kwargs={} groq_api_key=SecretStr('**********')

META_LLM: client=<groq.resources.chat.completions.Completions object at 0x7c33a49ed690> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7c33a44ae850> model_name='llama-3.3-70b-versatile' model_kwargs={} groq_api_key=SecretStr('**********')

DEEP_SEEK_LLM: client=<groq.resources.chat.completions.Completions object at 0x7c338fabc190> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7c338fae6fd0> model_name='deepseek-r1-distill-qwen-32b' model_kwargs={} groq_api_key=SecretStr('**********')



# Carregar lista de PDFs dos prospectos

In [None]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Lista para armazenar os documentos carregados
diretorio_base='/content/drive/MyDrive/topicos_ia/data/minuta'
docs_list = []
i = 1

# Itera sobre os arquivos no diretório e carrega os PDFs
for nome_arquivo in os.listdir(diretorio_base):
    if nome_arquivo.endswith(".pdf"):  # Verifica se o arquivo é um PDF
        caminho_pdf = os.path.join(diretorio_base, nome_arquivo)

        loader = PyPDFLoader(caminho_pdf)

        print(f'loader: {loader}\n\n')

        docs = await loader.aload()

        print(f'docs_loader: {docs[0].page_content[:100]}\n\n')
        print(docs[0].metadata)

        docs_list.append(docs)

        print(i)
        i += 1

    if i == 15:
      break


loader: <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x7c338fd14d10>




# Dividir Texto em partes menores

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Flat list with the chunks of every loaded PDF.
spliters = []

for pdf_pages in docs_list:
    # 1024-character chunks with a 150-character overlap; add_start_index is
    # enabled on the splitter.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=150,
        add_start_index=True,
    )
    pdf_chunks = splitter.split_documents(pdf_pages)
    spliters += pdf_chunks

print(f'Quantidade de partes: {len(spliters)}')

# Instanciando Embedding

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used to embed the chunks and the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print(embeddings)

# Usando InMemoryVectorStore para armazenar os embeddings

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# In-memory vector store backed by the embedding model (no FAISS install needed).
vector_store = InMemoryVectorStore(embeddings)

# Batch size and number of worker threads.
batch_size = 1000  # Chunks sent to the vector store per call; lower it to reduce peak RAM
num_threads = 5  # Number of batches processed concurrently
document_ids = []  # Flat list with the IDs of every stored chunk


def process_batch(start_idx):
    """Embed and store one batch of chunks.

    Args:
        start_idx: index in `spliters` where the batch starts; the batch spans
            `batch_size` items (fewer for the last batch).

    Returns:
        Whatever `vector_store.add_documents` returns for the batch
        (expected to be the list of IDs of the stored chunks).
    """
    batch = spliters[start_idx : start_idx + batch_size]
    return vector_store.add_documents(documents=batch)


# NOTE(review): several threads write into the same `vector_store`; confirm
# that concurrent `add_documents` calls are safe for InMemoryVectorStore.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = {executor.submit(process_batch, i): i for i in range(0, len(spliters), batch_size)}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processando lotes em paralelo"):
        # extend (not append): each result is itself a list of IDs, and
        # appending would leave `document_ids` as a list of lists.
        # Batches finish in completion order, so IDs are not in chunk order.
        document_ids.extend(future.result())

print("✅ Processamento concluído com paralelismo!")

# Prompt

In [None]:
from langchain import hub

# Question-answering prompt fetched from the hub by its identifier.
RAG_PROMPT_ID = "rlm/rag-prompt"
prompt = hub.pull(RAG_PROMPT_ID)

In [None]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

In [None]:
# Define state for application
class State(TypedDict):
    """State dictionary shared by the graph steps `retrieve` and `generate`."""

    # Question text; read by both `retrieve` and `generate`.
    question: str
    # Documents returned by `retrieve`; `generate` joins their page_content.
    context: List[Document]
    # Content of the model response, produced by `generate`.
    answer: str


# Define application steps
def retrieve(state: State):
    """Run a similarity search for the question and return it as the context.

    Args:
        state: graph state; only the "question" entry is read.

    Returns:
        A dict with the key "context" holding the documents found in
        `vector_store`.
    """
    query = state["question"]
    matches = vector_store.similarity_search(query)
    return dict(context=matches)


def generate(state: State):
    """Build the prompt from the retrieved context and ask the DeepSeek client.

    Args:
        state: graph state; the "question" and "context" entries are read.

    Returns:
        A dict with the key "answer" holding the content of the model response.
    """
    # Concatenate the text of every retrieved document, separated by blank lines.
    page_texts = [chunk.page_content for chunk in state["context"]]
    context_text = "\n\n".join(page_texts)

    prompt_input = {"question": state["question"], "context": context_text}
    prompt_messages = prompt.invoke(prompt_input)

    llm_reply = new_obj.deep_seek_llm.invoke(prompt_messages)
    return {"answer": llm_reply.content}


# Wire the two steps in order (retrieve, then generate), mark "retrieve" as the
# entry point and compile the graph.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
graph_png = graph.get_graph().draw_mermaid_png()
display(Image(graph_png))

In [None]:
# Run the whole graph for a sample question.
question = "me de informações gerais e resumidas do documento"
result = graph.invoke({"question": question})

# Print the full state first, then the answer on its own after some blank lines.
print(result, '\n\n', result["answer"], sep='\n')

# STREAMLIT

In [None]:
!pip install streamlit pyngrok

In [None]:
%%writefile app.py
# Written to app.py by the %%writefile cell magic: a minimal Streamlit page
# that shows a title and a short text. It does not call the RAG graph.
import streamlit as st

# Page title.
st.title("Minha App Streamlit no Colab")
# Body text shown under the title.
st.write("Olá, mundo! Essa aplicação está rodando no Colab via ngrok.")
