In [16]:
import os
import queue
import tkinter as tk
from threading import Thread
from tkinter import scrolledtext, simpledialog, filedialog

from langchain.chat_models import init_chat_model
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, PDFPlumberLoader, TextLoader, UnstructuredMarkdownLoader, Docx2txtLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_unstructured import UnstructuredLoader
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [17]:
def chatbot():
    """Build a PDF question-answering chatbot and return its public API.

    All mutable state (documents, vector store, LLM, compiled graph, counters)
    lives in a closure-private ``state`` dict shared by the nested helpers.

    Returns:
        dict: three callables --
            ``"initialize"(folder_path=None) -> (bool, str)`` loads the PDFs
            under the folder, builds the vector store and the chat graph;
            ``"chat"(question) -> str`` answers a question (or returns an
            explanatory message when not initialized / on error);
            ``"get_state"() -> dict`` exposes ``initialized``,
            ``processed_count`` and ``file_count``.
    """
    state = {
        "folder_path": None,
        "docs": [],
        "vector_store": None,
        "chat_graph": None,
        "llm": None,
        "prompt": None,
        "initialized": False,
        "embeddings": None,
        "file_count": 0,
        "processed_count": 0
    }

    def hasText(docs):
        """Return True when at least one document carries non-blank text."""
        return bool(docs) and any((doc.page_content or "").strip() for doc in docs)

    def processPDF(pdf_path):
        """Extract documents from one PDF, trying several loaders in order.

        Each loader is attempted independently, so an exception raised by one
        loader (including the first) no longer prevents the remaining ones
        from being tried. The first result containing non-blank text wins.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            list: Loaded documents, or ``[]`` when every loader failed or
            produced only blank content.
        """
        # Wrapped in lambdas so constructor failures are caught per loader too.
        loader_attempts = [
            ("PyPDFLoader", lambda: PyPDFLoader(pdf_path).load()),
            (
                "UnstructuredLoader",
                lambda: list(
                    UnstructuredLoader(
                        file_path=pdf_path,
                        strategy="hi_res",
                        partition_via_api=True,
                        coordinates=True,
                    ).lazy_load()
                ),
            ),
            ("PDFPlumberLoader", lambda: PDFPlumberLoader(pdf_path).load()),
        ]

        for loader_name, load in loader_attempts:
            try:
                docs = load()
            except Exception as e:
                # Report instead of silently swallowing, then try the next loader.
                print(f"{loader_name} failed for {pdf_path}: {str(e)}")
                continue
            if hasText(docs):
                return docs

        print(f"Failed to extract content from {pdf_path} with any loader")
        return []

    def processFiles():
        """Load every PDF under ``state["folder_path"]`` and split it into chunks.

        Walks the folder recursively, loads each ``.pdf`` (case-insensitive
        extension), tags every document with the file's base name as
        ``metadata["source"]`` and stores the split chunks in ``state["docs"]``.
        Also updates ``state["file_count"]`` and ``state["processed_count"]``.

        Returns:
            tuple[bool, str]: ``(True, message)`` on success, otherwise
            ``(False, reason)``.
        """
        try:
            folder = state["folder_path"]
            # isdir (not exists): a path pointing at a regular file is not a folder.
            if not folder or not os.path.isdir(folder):
                return False, f"Folder not found: {folder}"

            # Filter PDFs
            pdf_extension = '.pdf'
            files_to_process = [
                os.path.join(root, file)
                for root, _, files in os.walk(folder)
                for file in files
                if os.path.splitext(file)[1].lower() == pdf_extension
            ]

            state["file_count"] = len(files_to_process)
            if state["file_count"] == 0:
                return False, f"No PDF files found in {folder}"

            print(f"Found {state['file_count']} PDF files to process")

            state["docs"] = []
            state["processed_count"] = 0
            for file_path in files_to_process:
                try:
                    file_docs = processPDF(file_path)
                    if file_docs:
                        # Add source metadata to documents
                        for doc in file_docs:
                            if not hasattr(doc, 'metadata') or doc.metadata is None:
                                doc.metadata = {}
                            doc.metadata['source'] = os.path.basename(file_path)

                        state["docs"].extend(file_docs)
                        state["processed_count"] += 1
                        print(f"Processed {os.path.basename(file_path)}: {len(file_docs)} segments")
                except Exception as e:
                    print(f"Error processing file {file_path}: {str(e)}")

            if not state["docs"]:
                return False, "No documents were successfully processed"

            print(f"Successfully processed {state['processed_count']}/{state['file_count']} files")
            print(f"Total document segments: {len(state['docs'])}")

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=2000,
                chunk_overlap=400,
                separators=["\n\n", "\n", ". ", " ", ""]
            )

            state["docs"] = text_splitter.split_documents(state["docs"])
            print(f"Split into {len(state['docs'])} chunks for processing")

            # Guard the vector-store build against an empty corpus.
            if not state["docs"]:
                return False, "No text chunks were produced from the documents"

            return True, "Files processed successfully"
        except Exception as e:
            error_msg = f"Error loading files: {str(e)}"
            print(error_msg)
            return False, error_msg

    def vectorStore():
        """Index ``state["docs"]`` into ``state["vector_store"]``.

        Builds a FAISS store; if that raises ``ImportError`` (e.g. the faiss
        package is not installed) an ``InMemoryVectorStore`` is built instead.
        Other exceptions propagate to the caller.
        """
        try:
            print("Creating vector store...")
            state["vector_store"] = FAISS.from_documents(state["docs"], state["embeddings"])
        except ImportError:
            print("Creating in-memory vector store...")
            state["vector_store"] = InMemoryVectorStore.from_documents(state["docs"], state["embeddings"])
        print("Vector store created successfully")

    def buildChatGraph():
        """Create the prompt and compile the retrieve -> generate graph.

        Stores the prompt in ``state["prompt"]`` and the compiled graph in
        ``state["chat_graph"]``.
        """
        print("Building chat graph...")

        class State(TypedDict):
            question: str
            context: List[Document]
            answer: str

        # Create a better prompt template for more precise answers
        system_template = """You are an AI assistant specialized in providing accurate answers based on the provided PDF documents.
        
When answering questions:
1. Only use information from the provided context.
2. If the exact answer is in the context, quote it directly.
3. Clearly indicate when information comes from the context.
4. If you don't know or the context doesn't contain the answer, say so directly.
5. Always cite the source PDF document name when providing information.
6. Never make up information that isn't in the context.

Context:
{context}

Question: {question}"""

        state["prompt"] = ChatPromptTemplate.from_messages([
            ("system", system_template),
            ("human", "{question}")
        ])

        def retrieve(graph_state: State) -> dict:
            """Fetch context documents for the question from the vector store."""
            try:
                retrieved_docs = state["vector_store"].max_marginal_relevance_search(
                    graph_state["question"],
                    k=6,
                    fetch_k=10
                )
            except Exception as e:
                # Fallback to regular similarity search when MMR is unavailable or fails.
                print(f"MMR search failed ({str(e)}); falling back to similarity search")
                retrieved_docs = state["vector_store"].similarity_search(
                    graph_state["question"],
                    k=5
                )

            print(f"Retrieved {len(retrieved_docs)} documents")
            return {"context": retrieved_docs}

        def generate(graph_state: State) -> dict:
            """Generate an answer based on the retrieved context"""
            # Prefix every chunk with its source file so the model can cite it.
            context_texts = [
                f"[PDF: {doc.metadata.get('source', 'Unknown source')}]\n{doc.page_content}"
                for doc in graph_state["context"]
            ]
            docs_content = "\n\n".join(context_texts)

            messages = state["prompt"].invoke({"question": graph_state["question"], "context": docs_content})
            response = state["llm"].invoke(messages)
            return {"answer": response.content}

        # Build the graph
        graph_builder = StateGraph(State)
        graph_builder.add_node("retrieve", retrieve)
        graph_builder.add_node("generate", generate)
        graph_builder.add_edge(START, "retrieve")
        graph_builder.add_edge("retrieve", "generate")

        state["chat_graph"] = graph_builder.compile()

    def initialize(folder_path=None):
        """Prepare the chatbot for the given folder of PDFs.

        Args:
            folder_path: Folder to index. When omitted, the previously stored
                folder path is reused.

        Returns:
            tuple[bool, str]: ``(True, "Initialization successful")`` or
            ``(False, reason)``. Exceptions are caught and reported in the
            message rather than raised.
        """
        if folder_path:
            state["folder_path"] = folder_path

        if not state["folder_path"]:
            return False, "No folder path specified"

        # Cheap validation first: a bad path is reported without needing any
        # API credentials or model clients.
        if not os.path.isdir(state["folder_path"]):
            return False, f"Folder not found: {state['folder_path']}"

        # From here on shared state is being rebuilt; do not answer from a
        # half-replaced session if this run fails part-way.
        state["initialized"] = False

        try:
            state["llm"] = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
            state["embeddings"] = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

            success, message = processFiles()
            if not success:
                return False, message

            vectorStore()
            buildChatGraph()

            state["initialized"] = True
            return True, "Initialization successful"
        except Exception as e:
            error_msg = f"Initialization error: {str(e)}"
            print(error_msg)
            return False, error_msg

    def chat(question):
        """Answer ``question`` using the compiled graph.

        Returns:
            str: The generated answer, or a human-readable message when the
            chatbot is not initialized or the graph invocation fails.
        """
        if not state["initialized"] or not state["chat_graph"]:
            return "Chatbot not initialized. Please wait for initialization to complete."

        try:
            result = state["chat_graph"].invoke({
                "question": question,
                "context": [],
                "answer": ""
            })
            return result["answer"]
        except Exception as e:
            return f"Error processing question: {str(e)}"

    # Return the public API
    return {
        "initialize": initialize,
        "chat": chat,
        "get_state": lambda: {
            "initialized": state["initialized"],
            "processed_count": state["processed_count"],
            "file_count": state["file_count"]
        }
    }


In [18]:
def create_chatbot_ui():
    """Create the UI for the chatbot using closures for state management.

    Long-running work (initialization, answering) runs on daemon worker
    threads. Workers never touch Tk widgets directly: they enqueue callbacks
    that the Tk main thread drains periodically, so multi-step widget updates
    cannot interleave with main-thread event handlers.

    Returns:
        dict: ``{"setup_ui": setup_ui}`` where ``setup_ui(root)`` builds the
        widgets inside the given Tk root window.
    """
    # How often (ms) the main thread drains callbacks queued by workers.
    UI_POLL_MS = 100

    # UI state
    ui_state = {
        "chatbot": chatbot(),
        "folder_path": None,
        "root": None,
        # (callback, args) pairs produced by worker threads.
        "ui_queue": queue.Queue(),
    }

    def display_message(sender, message):
        """Display a message in the chat window (call from the Tk thread only)."""
        ui_state["chat_display"].config(state="normal")
        ui_state["chat_display"].insert(tk.END, f"\n{sender}: ", "sender")
        ui_state["chat_display"].insert(tk.END, f"{message}\n")
        ui_state["chat_display"].config(state="disabled")
        ui_state["chat_display"].see(tk.END)

    def set_status(text):
        """Update the status bar (call from the Tk thread only)."""
        ui_state["status_var"].set(text)

    def run_on_ui_thread(callback, *args):
        """Queue ``callback(*args)`` for execution on the Tk main thread."""
        ui_state["ui_queue"].put((callback, args))

    def drain_ui_queue():
        """Run every queued UI callback, then reschedule itself."""
        pending = ui_state["ui_queue"]
        try:
            while True:
                callback, args = pending.get_nowait()
                try:
                    callback(*args)
                except Exception as e:
                    # One failing update must not stop the polling loop.
                    print(f"UI update failed: {e}")
        except queue.Empty:
            pass
        ui_state["root"].after(UI_POLL_MS, drain_ui_queue)

    def ask_api_keys():
        """Prompt the user for required API keys.

        A cancelled or blank dialog leaves the corresponding environment
        variable untouched (``askstring`` returns ``None`` on cancel, which
        cannot be stored in ``os.environ``). Input is masked.
        """
        key_prompts = [
            ("LANGSMITH_API_KEY", "Enter your LangSmith API Key:"),
            ("GOOGLE_API_KEY", "Enter your Google API Key:"),
            ("UNSTRUCTURED_API_KEY", "Enter your Unstructured API Key (optional):"),
        ]
        for env_name, prompt_text in key_prompts:
            value = simpledialog.askstring("API Key", prompt_text, show="*", parent=ui_state["root"])
            if value and value.strip():
                os.environ[env_name] = value.strip()

    def browse_folder():
        """Open folder browser dialog"""
        folder = filedialog.askdirectory(title="Select Folder with PDF Documents")
        if folder:
            ui_state["folder_path"] = folder
            ui_state["folder_path_var"].set(folder)
            set_status(f"Selected folder: {folder}")

    def initialize_chatbot(folder_path):
        """Initialize the chatbot with the selected folder (worker thread)."""
        try:
            success, message = ui_state["chatbot"]["initialize"](folder_path)
            chatbot_state = ui_state["chatbot"]["get_state"]()
            if success:
                run_on_ui_thread(set_status, "Ready")
                run_on_ui_thread(display_message, "System", f"Chatbot initialized successfully. Processed {chatbot_state['processed_count']} PDF files.")
            else:
                run_on_ui_thread(set_status, "Initialization failed")
                run_on_ui_thread(display_message, "System", f"Failed to initialize chatbot: {message}")
        except Exception as e:
            run_on_ui_thread(set_status, "Error")
            run_on_ui_thread(display_message, "System", f"Error initializing chatbot: {e}")

    def start_initialization():
        """Start the chatbot initialization in a separate thread"""
        # The entry box is editable, so it is the source of truth for the
        # path (Browse writes into it as well).
        ui_state["folder_path"] = ui_state["folder_path_var"].get().strip() or None
        if not ui_state["folder_path"]:
            display_message("System", "Please select a folder containing PDF documents first")
            return

        set_status("Initializing chatbot...")
        display_message("System", f"Initializing chatbot with folder: {ui_state['folder_path']}")
        # daemon=True: a still-running worker must not keep the process alive
        # after the window is closed.
        Thread(target=initialize_chatbot, args=(ui_state["folder_path"],), daemon=True).start()

    def process_question(question):
        """Process a user question (worker thread)."""
        chatbot_state = ui_state["chatbot"]["get_state"]()
        if not chatbot_state["initialized"]:
            run_on_ui_thread(display_message, "System", "Chatbot not initialized. Please initialize first.")
            run_on_ui_thread(set_status, "Ready")
            return

        try:
            answer = ui_state["chatbot"]["chat"](question)
            run_on_ui_thread(display_message, "Bot", answer)
            run_on_ui_thread(set_status, "Ready")
        except Exception as e:
            run_on_ui_thread(display_message, "Bot", f"Error processing question: {e}")
            run_on_ui_thread(set_status, "Error occurred")

    def send_message(event=None):
        """Process and send user message"""
        user_input = ui_state["entry"].get().strip()
        if not user_input:
            return
        display_message("You", user_input)
        ui_state["entry"].delete(0, tk.END)
        set_status("Processing question...")
        Thread(target=process_question, args=(user_input,), daemon=True).start()

    def setup_ui(root):
        """Set up the UI components inside ``root`` and start UI-queue polling."""
        ui_state["root"] = root
        root.title("CTSE Chatbot")
        root.geometry("800x600")  # Set a reasonable starting size

        # Ask for API keys first
        ask_api_keys()

        # Create frames
        control_frame = tk.Frame(root)
        control_frame.pack(fill=tk.X, padx=10, pady=5)

        # Folder selection
        tk.Label(control_frame, text="PDF Document Folder:").pack(side=tk.LEFT, padx=(0, 5))
        ui_state["folder_path_var"] = tk.StringVar()
        tk.Entry(control_frame, textvariable=ui_state["folder_path_var"], width=40).pack(side=tk.LEFT, padx=(0, 5), fill=tk.X, expand=True)
        tk.Button(control_frame, text="Browse", command=browse_folder).pack(side=tk.LEFT, padx=(0, 5))
        tk.Button(control_frame, text="Initialize", command=start_initialization).pack(side=tk.LEFT)

        # Status bar
        ui_state["status_var"] = tk.StringVar(value="Ready")
        status_bar = tk.Label(root, textvariable=ui_state["status_var"], bd=1, relief=tk.SUNKEN, anchor=tk.W)
        status_bar.pack(side=tk.BOTTOM, fill=tk.X)

        # Chat display
        chat_frame = tk.Frame(root)
        chat_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)

        ui_state["chat_display"] = scrolledtext.ScrolledText(chat_frame, wrap=tk.WORD, state="disabled", height=25)
        ui_state["chat_display"].pack(fill=tk.BOTH, expand=True)

        # Entry field and send button
        input_frame = tk.Frame(root)
        input_frame.pack(fill=tk.X, padx=10, pady=5)

        ui_state["entry"] = tk.Entry(input_frame, width=50)
        ui_state["entry"].pack(side=tk.LEFT, padx=(0, 5), fill=tk.X, expand=True)
        ui_state["entry"].bind("<Return>", send_message)

        ui_state["send_button"] = tk.Button(input_frame, text="Send", command=send_message)
        ui_state["send_button"].pack(side=tk.RIGHT)

        # Add system welcome message
        display_message("System", "Welcome! Please select a folder containing PDF documents and initialize the chatbot.")

        # Start delivering worker-thread updates on the Tk main thread.
        root.after(UI_POLL_MS, drain_ui_queue)

    # Return the public API
    return {
        "setup_ui": setup_ui
    }

# Main function to run the app
def main():
    """Open the chatbot window and block until the user closes it."""
    window = tk.Tk()
    build_widgets = create_chatbot_ui()["setup_ui"]
    build_widgets(window)
    window.mainloop()


In [19]:
# Entry point: start the GUI only when this code runs as the main module
# (not when it is imported). main() blocks in the Tk event loop.
if __name__ == "__main__":
    main()

Found 15 PDF files to process
Processed AWS User Groups Colombo - Introduction to AWS Cloud Platform.pdf: 18 segments
Processed CAP Theorem.pdf: 22 segments
Processed Cloud Computing 101.pdf: 21 segments
Processed Cloud Design Patterns - 1.pdf: 21 segments
Processed Cloud Design Patterns - 2.pdf: 21 segments
Processed cloud-computing-concepts-technology-amp-architecture-by-thomas-erl.pdf: 558 segments
Processed Containers 101.pdf: 56 segments
Processed Intro to DevOps and Beyond.pdf: 25 segments
Processed Introduction to Microservices.pdf: 14 segments
Processed Key Essentials for Building Application in Cloud.pdf: 41 segments
Processed Lecture 2 - Part 1.pdf: 30 segments
Processed Lecture 2 - Part 2.pdf: 8 segments
Processed Microservice Design Patterns.pdf: 25 segments
Processed ML Lec 2 - Part 1.pdf: 37 segments
Processed ML Lec 2 - Part 2 LLM.pdf: 23 segments
Successfully processed 15/15 files
Total document segments: 920
Split into 961 chunks for processing
Creating vector store...