In [None]:
import os
import re
from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import SystemMessage
from docling.document_converter import DocumentConverter
from langchain_core.callbacks import StdOutCallbackHandler
from langchain.vectorstores import Chroma


In [None]:
# OpenAI model name; presumably the chat model this notebook is meant to use.
MODEL: str = "gpt-4o-mini"
# Directory that is checked for existence and handed to Chroma as `persist_directory`.
db_name: str = "dadosSlidesProva"

In [None]:
# Load variables from a local .env file; override=True lets the .env values
# take precedence over variables already present in the process environment.
load_dotenv(override=True)
# Make sure OPENAI_API_KEY is always defined: an existing value is kept,
# otherwise the placeholder string is installed.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Instruction text (in Portuguese) describing an academic assistant specialised
# in databases; the `chat` handler concatenates it in front of every question.
system_prompt = """
Você é um assistente acadêmico especializado em banco de dados.
Responda de forma objetiva, estruturada e com linguagem clara.
- Dê a resposta com a maior quantidade de detalhes possível.
- Faça sua resposta de maneira didática.
- Evite repetir partes da pergunta.
- Mantenha a resposta direta ao ponto mas com o máximo de detalhes que puder fornecer.
- Revise sua resposta antes de fornece-la.
"""

In [None]:
# Convert every slide PDF in `source` to Markdown and merge the results into `full_corpus`.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
source = "/home/paulo/Documents/llmProjects/llm_engineering/week5/documents"
converter = DocumentConverter()

# Strings stripped from every converted deck. The combined "name + site" entry
# comes first so that removing its parts afterwards leaves no stray space.
boilerplate = (
    "Wladmir Cardoso Brandão www.wladmirbrandao.com",
    "www.wladmirbrandao.com",
    "Wladmir Cardoso Brandão",
    "<!-- image -->",
)

all_docs = []

for filename in os.listdir(source):
    filepath = os.path.join(source, filename)
    # Only regular files with a .pdf extension (case-insensitive) are converted.
    if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
        try:
            result = converter.convert(filepath)
            markdown_text = result.document.export_to_markdown()
            for fragment in boilerplate:
                markdown_text = markdown_text.replace(fragment, "")
            all_docs.append(markdown_text)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            # The f-prefix is required so the file name and error are interpolated.
            print(f"Erro ao processar {filename} : {e}")

# One blank line between documents.
full_corpus = "\n\n".join(all_docs)
print(full_corpus)

In [None]:
# Save the slide text to dataBase/texto_pdf.md (relative to the working directory).
if full_corpus != "":
    # Create the target folder when it is missing; open() in "w" mode does not
    # create parent directories and would raise FileNotFoundError.
    os.makedirs("dataBase", exist_ok=True)
    with open("dataBase/texto_pdf.md", "w", encoding="utf-8") as arquivo:
        arquivo.write(full_corpus)
else:
    # Nothing was converted, so no file is written.
    print("Variável vazia!")

In [None]:
# Convert every book PDF in `source` to Markdown and merge the results into `full_corpus`.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
source = "/home/paulo/Documents/llmProjects/llm_engineering/week5/livro"
converter = DocumentConverter()

all_docs = []

for filename in os.listdir(source):
    filepath = os.path.join(source, filename)
    # Only regular files with a .pdf extension (case-insensitive) are converted.
    if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
        try:
            result = converter.convert(filepath)
            markdown_text = result.document.export_to_markdown()
            # Drop the image placeholder comments from the exported Markdown.
            markdown_text = markdown_text.replace("<!-- image -->", "")
            all_docs.append(markdown_text)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            # The f-prefix is required so the file name and error are interpolated.
            print(f"Erro ao processar {filename} : {e}")

# One blank line between documents.
full_corpus = "\n\n".join(all_docs)
print(full_corpus)

In [None]:
# Save the book text to dataBase/livro_pdf.md (relative to the working directory).
if full_corpus != "":
    # Create the target folder when it is missing; open() in "w" mode does not
    # create parent directories and would raise FileNotFoundError.
    os.makedirs("dataBase", exist_ok=True)
    with open("dataBase/livro_pdf.md", "w", encoding="utf-8") as arquivo:
        arquivo.write(full_corpus)
else:
    # Nothing was converted, so no file is written.
    print("Variável vazia!")

In [None]:
# Read the saved texts, split them into sections by "##" heading, and cut each
# section into chunks with per-chunk metadata (`chunks` and `metadados` stay aligned).
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
source = "/home/paulo/Documents/llmProjects/llm_engineering/week5/dataBase/"

textos_lidos = []
for file_name in os.listdir(source):
    try:
        # The text files in this notebook are written as UTF-8, so decode them
        # explicitly instead of relying on the platform default encoding.
        with open(os.path.join(source, file_name), "r", encoding="utf-8") as arquivo:
            textos_lidos.append(arquivo.read())
    except Exception as e:
        # Unreadable entries (e.g. sub-directories) are reported and skipped.
        print(f"Erro ao ler {file_name}: {e}")

# Separate files with a blank line: without it, a file that does not end in a
# newline would have the next file's first "##" heading glued to its last line,
# and the section pattern below would merge the two sections.
documentos = "\n\n".join(textos_lidos)

# Group 1: heading text after "##"; group 2: everything up to the next "\n##" or the end.
padrao = re.compile(r"##\s*(.+?)\n(.*?)(?=(?:\n##\s*)|\Z)", re.DOTALL)
secoes_extraidas = padrao.findall(documentos)

text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=50)

chunks = []
metadados = []

# Running 1-based identifier across all sections.
chunk_id_global = 1

for titulo, conteudo in secoes_extraidas:
    partes = text_splitter.split_text(conteudo.strip())
    print(f"\nDividindo seção '{titulo}' em {len(partes)} chunk(s)")
    # `chunk_local` is the 1-based position of the chunk inside its section.
    for chunk_local, chunk in enumerate(partes, start=1):
        chunks.append(chunk)
        metadados.append({
            "titulo" : titulo.strip(),
            "chunk_local" : chunk_local,
            "chunk_id" : chunk_id_global
        })
        chunk_id_global += 1

In [None]:
embeddings = OpenAIEmbeddings()

if os.path.exists(db_name):
    print("Vetores encontrados. Carregando da base existente...")
else:
    print("Nenhuma base encontrada. Criando uma nova...")

# Bind `vectorstore` in both cases. Previously it was only assigned when the
# directory already existed, so a first run failed later with NameError.
# Chroma opens the store under `persist_directory`, creating it when absent.
vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

In [None]:
# Insert the chunks into the vector store in batches.
batch_size = 200  # adjust as needed

for i in range(0, len(chunks), batch_size):
    batch_chunks = chunks[i:i+batch_size]
    batch_metadatas = metadados[i:i+batch_size]
    # Deterministic ids derived from the metadata's `chunk_id`: re-running this
    # cell then overwrites the same records instead of appending a second copy
    # of the whole corpus under fresh random ids.
    # NOTE(review): records left over from an earlier, larger corpus are not removed.
    batch_ids = [str(meta["chunk_id"]) for meta in batch_metadatas]
    vectorstore.add_texts(texts=batch_chunks, metadatas=batch_metadatas, ids=batch_ids)

In [None]:
# Streaming chat model. `model=MODEL` applies the model chosen in the config
# cell; without it ChatOpenAI falls back to its own default model.
llm = ChatOpenAI(
    model=MODEL,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0.3,
)
# Conversation history kept as message objects under the "chat_history" key.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
# Retrieve the 10 closest chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()],
)

In [None]:
# One-off check of the chain: ask a fixed question and print the answer.
query = "quais sao as propriedades acid e oque cada uma significa?"
# The chain takes its input under the "question" key.
result = conversation_chain.invoke(dict(question=query))
# The generated text is returned under the "answer" key.
answer = result["answer"]
print("\nAnswer:", answer)

In [None]:
def chat(question, history):
    """Answer one chat message through the conversational retrieval chain.

    Args:
        question: Text typed by the user.
        history: Conversation history passed in by the caller; not used here.

    Returns:
        The value stored under the "answer" key of the chain result.

    NOTE(review): the system prompt is concatenated in front of the question,
    so it presumably also ends up in the retrieval query and in the stored
    history. Confirm that this is intended.
    """
    prompted_question = system_prompt + question
    response = conversation_chain.invoke({"question": prompted_question})
    return response["answer"]

In [None]:
# Build the Gradio chat UI around `chat` and start it with inbrowser=True.
# `view` holds whatever `launch()` returns, not the ChatInterface object itself.
view = gr.ChatInterface(fn=chat, type="messages").launch(inbrowser=True)