In [5]:
import os
import json
from bs4 import BeautifulSoup

# LangChain e FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

import faiss  # Biblioteca FAISS

In [6]:
# ---------------------------------------------------------
# 1. Funções de extração de texto para cada tipo de arquivo
# ---------------------------------------------------------

def extract_text_from_txt(filepath):
    """Lê arquivo .txt e retorna o conteúdo como string."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        print(f"Erro ao ler TXT {filepath}: {e}")
        return ""

def extract_text_from_html(filepath):
    """Lê arquivo .html e retorna todo o texto (sem tags) como string."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, 'html.parser')
            return soup.get_text(strip=True)
    except Exception as e:
        print(f"Erro ao ler HTML {filepath}: {e}")
        return ""

def extract_text_general(filepath):
    """
    Lê arquivos texto como .py, .md, .yml e retorna seu conteúdo.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        print(f"Erro ao ler arquivo texto {filepath}: {e}")
        return ""

def extract_text_from_ipynb(filepath):
    """
    Lê um notebook Jupyter (.ipynb), que é JSON, e extrai o texto de cada célula.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        extracted_text = []
        for cell in data.get("cells", []):
            cell_type = cell.get("cell_type", "")
            source_lines = cell.get("source", [])
            # Junta as linhas em uma só string
            cell_text = "".join(source_lines)

            # Exemplo: pegamos tanto markdown quanto código
            extracted_text.append(f"({cell_type}):\n{cell_text}")

        # Retorna tudo como uma string só, separado por quebras de linha
        return "\n\n".join(extracted_text)
    except Exception as e:
        print(f"Erro ao ler IPYNB {filepath}: {e}")
        return ""

In [None]:
# ---------------------------------------------------------
# 2. Carrega documentos a partir de uma pasta
# ---------------------------------------------------------

def load_documents_from_folder(folder_path):
    """
    Percorre recursivamente a pasta, extraindo texto de cada arquivo
    suportado e retorna uma lista de dicionários:
    [{ "filepath": <caminho>, "text": <conteúdo> }, ...]
    """
    # Extensões que você deseja tratar
    supported_extensions = [".txt", ".html", ".py", ".md", ".yml", ".ipynb", ".json"]
    docs = []

    for root, _, files in os.walk(folder_path):
        for filename in files:
            filepath = os.path.join(root, filename)
            ext = os.path.splitext(filename)[1].lower()

            text = ""
            if ext == ".txt":
                text = extract_text_from_txt(filepath)
            elif ext == ".html":
                text = extract_text_from_html(filepath)
            elif ext in [".py", ".md", ".yml", ".json"]:
                text = extract_text_general(filepath)
            elif ext == ".ipynb":
                text = extract_text_from_ipynb(filepath)
            else:
                # Se não é nenhuma das extensões suportadas, ignora.
                continue

            if text.strip():
                docs.append({"filepath": filepath, "text": text})

    return docs

In [8]:
# ---------------------------------------------------------
# 3. Função para dividir texto em chunks
# ---------------------------------------------------------

def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """
    Utiliza o CharacterTextSplitter para dividir um texto grande
    em pedaços (chunks) de tamanho definido.
    """
    splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = splitter.split_text(text)
    return chunks

In [10]:
# ---------------------------------------------------------
# 4. Construção do índice e chain de perguntas/respostas
# ---------------------------------------------------------

if __name__ == "__main__":
    # Caminhos para as duas pastas
    folder_path = "/home/vicrrs/projetos/meus_projetos/metropolis-rag-assistant/docs_scraper/text_output"
    folder_path01 = "/home/vicrrs/projetos/meus_projetos/metropolis-rag-assistant/deepstream"

    # Carrega os documentos de ambas as pastas
    documents_folder0 = load_documents_from_folder(folder_path)
    documents_folder1 = load_documents_from_folder(folder_path01)

    # Concatena todos os documentos em uma só lista
    documents = documents_folder0 + documents_folder1

    print(f"Total de documentos carregados: {len(documents)}")

    # Cria a lista de chunks
    all_chunks = []
    for doc in documents:
        chunks = chunk_text(doc["text"], chunk_size=1000, chunk_overlap=100)
        for ch in chunks:
            all_chunks.append({
                "text": ch,
                "metadata": {"source": doc["filepath"]}
            })

    print(f"Total de chunks: {len(all_chunks)}")

    # Carregar embeddings locais (pode ser outro modelo)
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # Extrai o texto e metadados para cada chunk
    texts = [ch["text"] for ch in all_chunks]
    metadatas = [ch["metadata"] for ch in all_chunks]

    # Cria base vetorial usando FAISS
    db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

    # Salva índice FAISS em disco (opcional, para reuso)
    faiss_index_path = "meu_indice.faiss"
    faiss.write_index(db.index, faiss_index_path)
    print(f"Índice FAISS salvo em {faiss_index_path}")

    # ---------------------------------------------------------
    # 5. Configuração do LLM (LlamaCpp) e criação do Retriever
    # ---------------------------------------------------------

    # Ajuste para o caminho do seu modelo local (.gguf ou .bin)
    MODEL_PATH = "/home/vicrrs/.lmstudio/models/lmstudio-community/DeepSeek-R1-Distill-Llama-8B-GGUF/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"

    llm = LlamaCpp(
        model_path=MODEL_PATH,
        n_ctx=2048,             # tamanho do contexto (dependendo do modelo)
        temperature=0.1,
        max_tokens=100_000_000_000_000_000_000,         # limite de tokens na resposta
        verbose=False
    )

    # Cria um retriever
    retriever = db.as_retriever(search_kwargs={"k": 3})

    # ---------------------------------------------------------
    # 6. Exemplo de perguntas e respostas
    # ---------------------------------------------------------
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever
    )
    # ---------------------------------------------------------
    # 7. Usando PromptTemplate (para restringir a resposta)
    # ---------------------------------------------------------
    template = """
Você é um assistente que só pode usar as passagens de texto fornecidas abaixo para responder.
Se a resposta não estiver neles, responda "Não sei".

Passagens relevantes:
{context}

Pergunta: {question}
Resposta:
"""

    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

    qa_chain_custom = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT}
    )

    pergunta3 = "Como configurar múltiplas câmeras no pipeline DeepStream?"
    resposta3 = qa_chain_custom.run(pergunta3)
    print("\nPergunta 3 (PromptTemplate):", pergunta3)
    print("Resposta 3:", resposta3)

Created a chunk of size 1498, which is longer than the specified 1000


Created a chunk of size 1614, which is longer than the specified 1000
Created a chunk of size 1318, which is longer than the specified 1000
Created a chunk of size 1201, which is longer than the specified 1000
Created a chunk of size 1109, which is longer than the specified 1000
Created a chunk of size 1258, which is longer than the specified 1000
Created a chunk of size 1195, which is longer than the specified 1000
Created a chunk of size 2129, which is longer than the specified 1000
Created a chunk of size 1255, which is longer than the specified 1000
Created a chunk of size 1258, which is longer than the specified 1000
Created a chunk of size 1416, which is longer than the specified 1000
Created a chunk of size 2664, which is longer than the specified 1000
Created a chunk of size 1294, which is longer than the specified 1000
Created a chunk of size 2568, which is longer than the specified 1000


Total de documentos carregados: 228


Created a chunk of size 1146, which is longer than the specified 1000
Created a chunk of size 1017, which is longer than the specified 1000
Created a chunk of size 2163, which is longer than the specified 1000
Created a chunk of size 1756, which is longer than the specified 1000
Created a chunk of size 6586, which is longer than the specified 1000
Created a chunk of size 1268, which is longer than the specified 1000
Created a chunk of size 1267, which is longer than the specified 1000
Created a chunk of size 1910, which is longer than the specified 1000
Created a chunk of size 1981, which is longer than the specified 1000
Created a chunk of size 1006, which is longer than the specified 1000


Total de chunks: 3807
Índice FAISS salvo em meu_indice.faiss


llama_init_from_model: n_batch is less than GGML_KQ_MASK_PAD - increasing to 32
llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
  resposta3 = qa_chain_custom.run(pergunta3)


KeyboardInterrupt: 