## Expert Knowledge Worker

This project demonstrates how to use RAG (Retrieval-Augmented Generation) to build a knowledge worker over personal documents and boost productivity.
The flow is as follows:
- Assemble personal files in one place (the personal knowledge base)
- Vectorize everything in Chroma (the vector datastore)
- Build a conversational AI and ask questions!

In [None]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr

# imports for langchain and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.docstore.document import Document

# Directory of the persisted Chroma store, and the chat model name
db_name = "vector_db"
MODEL = "gpt-4o-mini"

# Read settings from a local .env file (override=True is passed so that
# values from .env take precedence over variables already set)
load_dotenv(override=True)

# Leave an existing key untouched; otherwise fall back to the placeholder string
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Read in documents using LangChain's loaders

# Configuration
MAX_FILE_SIZE_MB = 4  # per-file cap, expressed in megabytes
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2  # the same cap in bytes

# Glob pattern for the entries directly inside the course directory;
# each matching sub-folder is later treated as one document type
knowledgebase_dir = r"C:\Users\Documents\Syllabi\Georgia Tech\Spring 22\Microwave Design\*"
folders = glob.glob(knowledgebase_dir)

def add_metadata(doc, doc_type, file_path):
    """Tag a document with its type and source file, then return it.

    Writes three keys into ``doc.metadata`` in place: ``doc_type``,
    ``file_path`` and ``file_name`` (the last path component of
    ``file_path``). The same ``doc`` object is returned.
    """
    doc.metadata.update(
        {
            "doc_type": doc_type,
            "file_path": file_path,
            "file_name": os.path.basename(file_path),
        }
    )
    return doc

def check_file_size(file_path, max_size_bytes):
    """Report whether a file fits under a byte limit.

    Returns a ``(fits, size)`` tuple: ``size`` is the file size in bytes and
    ``fits`` is True when ``size`` does not exceed ``max_size_bytes``.
    When the size cannot be read (``OSError``), ``(False, 0)`` is returned.
    """
    try:
        size_in_bytes = os.path.getsize(file_path)
    except OSError:
        return False, 0
    return size_in_bytes <= max_size_bytes, size_in_bytes

def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):
    """Load every PDF under a folder, skipping files above a size limit.

    The folder is searched recursively for ``*.pdf`` files. Each file that
    passes ``check_file_size`` is loaded with ``PyPDFLoader`` and every
    resulting document is tagged through ``add_metadata``. Progress and skip
    messages are printed as files are handled.

    Args:
        folder_path: Directory to search.
        doc_type: Label stored in each document's ``doc_type`` metadata.
        max_size_bytes: Largest file size, in bytes, that will be loaded.

    Returns:
        A ``(loaded_docs, skipped_files)`` tuple. ``loaded_docs`` is the list
        of loaded documents; ``skipped_files`` is a list of
        ``(path, reason)`` tuples for files that were too large, whose size
        could not be read, or that failed to load.
    """
    # Report the limit that is actually enforced (the argument), not a global.
    limit_mb = max_size_bytes / 1024 / 1024
    pdf_files = glob.glob(os.path.join(folder_path, "**/*.pdf"), recursive=True)
    loaded_docs = []
    skipped_files = []

    for pdf_file in pdf_files:
        is_valid_size, file_size = check_file_size(pdf_file, max_size_bytes)

        if not is_valid_size:
            if file_size > max_size_bytes:
                file_size_mb = file_size / 1024 / 1024
                print(f"Skipped: {pdf_file} ({file_size_mb:.2f} MB - exceeds {limit_mb:g} MB limit)")
                skipped_files.append((pdf_file, f"File too large: {file_size_mb:.2f} MB"))
            else:
                # check_file_size answers (False, 0) when the size could not be
                # read, so a rejected file that is not over the limit was
                # unreadable rather than too large.
                print(f"Skipped: {pdf_file} (could not read file size)")
                skipped_files.append((pdf_file, "Could not read file size"))
            continue

        try:
            # Load individual PDF file
            docs = PyPDFLoader(pdf_file).load()
        except Exception as e:
            # Best effort: one broken PDF must not abort the whole folder.
            print(f"Error loading {pdf_file}: {str(e)}")
            skipped_files.append((pdf_file, f"Loading error: {str(e)}"))
        else:
            # Add metadata to each document the loader produced for this PDF
            loaded_docs.extend(add_metadata(doc, doc_type, pdf_file) for doc in docs)
            print(f"Loaded: {pdf_file} ({file_size / 1024 / 1024:.2f} MB)")

    return loaded_docs, skipped_files

# Main processing: load the PDFs of every sub-folder, then split them into chunks
documents = []
all_skipped_files = []
# Bind `chunks` before the split so that later cells can still reference it
# when no PDF could be loaded (it then stays an empty list).
chunks = []

print(f"Processing folders with {MAX_FILE_SIZE_MB} MB file size limit...")
print("-" * 60)

for folder in folders:
    if not os.path.isdir(folder):  # Only process actual directories
        continue
    doc_type = os.path.basename(folder)  # the folder name doubles as the document type
    print(f"\nProcessing folder: {doc_type}")

    folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)
    documents.extend(folder_docs)
    all_skipped_files.extend(skipped_files)

# Text splitting
print("\n" + "=" * 60)
print("TEXT SPLITTING")
print("=" * 60)

if documents:
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)

    # Summary statistics
    print(f"Total PDFs processed successfully: {len(set(doc.metadata['file_path'] for doc in documents))}")
    print(f"Total document pages/sections: {len(documents)}")
    print(f"Total text chunks after splitting: {len(chunks)}")
    print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")
    skipped_header = f"\nSkipped files: {len(all_skipped_files)}"
else:
    print("No PDF documents were loaded successfully.")
    skipped_header = "All files were skipped:"

# The skipped-file listing is the same in both cases; only its header differs.
if all_skipped_files:
    print(skipped_header)
    for file_path, reason in all_skipped_files:
        print(f"  - {os.path.basename(file_path)}: {reason}")

### We will map each chunk of text to a vector that represents its meaning, known as an embedding.

In [None]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# `chunks` is produced by the text-splitting cell above, so run that cell first.
# `embeddings` is the embedding function handed to Chroma in both calls below.
embeddings = OpenAIEmbeddings()

# Delete if already exists: when the store directory is present on disk, open it
# and drop its collection (presumably so a rebuild does not mix in vectors from
# an earlier run - confirm against the Chroma docs)
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore from the chunks, persisted under the `db_name` directory
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# NOTE: `_collection` is a private attribute (leading underscore) of the store object
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Let's investigate the vectors
# NOTE: `_collection` is a private attribute (leading underscore) of the store object
collection = vectorstore._collection
count = collection.count()

# Fetch a single stored entry together with its embedding and use its length
# as the dimensionality; the [0] index assumes the collection holds at least one entry
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

### Use LangChain to bring it all together

In [None]:
# create a new Chat with OpenAI, using the model named by MODEL
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat, stored under the 'chat_history' key
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the LLM, the vector store retriever and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

### Bring this up in Gradio using the Chat interface

In [None]:
# Wrapping that in a function

def chat(question, history):
    """Answer one user message through the conversation chain.

    ``history`` is part of the two-argument callback signature handed to
    ``gr.ChatInterface`` but is not read here; the chain was built with its
    own memory object. ``conversation_chain`` is looked up as a global on
    every call, so rebinding it changes which chain answers.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Start a Gradio chat UI whose replies come from chat(); `view` receives the return value of launch()
view = gr.ChatInterface(chat, type="messages").launch()

In [None]:
# Let's investigate what gets sent behind the scenes
from langchain_core.callbacks import StdOutCallbackHandler

# Fresh memory, so this test query does not reuse the history of an earlier chain
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

retriever = vectorstore.as_retriever()

# Same chain as before, plus a callback handler that writes to standard output
# NOTE: this rebinds the global `conversation_chain`, which chat() reads at call time
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

# Run one sample question through the chain and print its answer
query = "Who is the professor for the course ECE 6360 – Microwave Design?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)

In [None]:
# Rebuild the chain with fresh memory and a retriever configured with k=25
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
# k is how many chunks to use (passed to the retriever through search_kwargs)
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
# Rebinding the global here also changes the chain that chat() uses, and drops the callback handler
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [None]:
# Launch the chat UI again; when cells are run in order, chat() now answers through the chain rebuilt above
view = gr.ChatInterface(chat, type="messages").launch()