# Conversational Agent for Legal and Regulatory Information

This notebook sets up a Retrieval-Augmented Generation (RAG) chatbot using **LangChain**, **Hugging Face models**, and **ChromaDB**. The chatbot answers questions from a provided knowledge base of **PDF**, **DOCX**, and **TXT** documents containing internal security policies and regulations.

---

## Project Overview

This project develops a conversational agent (chatbot) designed to answer specific questions about regulations and laws. It uses a **Retrieval-Augmented Generation (RAG)** architecture: each answer is grounded in passages retrieved from a dedicated knowledge base of documents, which helps keep responses accurate and up to date.

---



The main code cell:

- Initializes the LLM and the QA chain
- Includes checks to ensure everything is correctly configured

---

In [None]:
import os
# Folder that holds the source documents; same value as DATA_PATH in the main cell.
folder_name = "knowledge"
# exist_ok=True makes re-running this cell harmless when the folder already exists.
os.makedirs(folder_name, exist_ok=True)

In [None]:
!pip install langchain langchain-community chromadb tiktoken pydantic==1.10.13 python-dotenv
!pip install transformers accelerate bitsandbytes torch sentence-transformers
!pip install pypdf
!pip install python-docx
!pip install gradio
!pip install docx2txt

In [None]:
from huggingface_hub import login
# Interactive Hugging Face authentication.
# NOTE(review): presumably required because the LLM used below
# (google/gemma-2b-it) is access-gated on the Hub — confirm.
login()

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import gradio as gr


# Directory searched (recursively) for the PDF, DOCX and TXT source documents.
DATA_PATH = "knowledge"
# Directory in which the Chroma vector store is persisted.
CHROMA_DB_PATH = "chroma_db"

# Models used
# Embedding model used to vectorize document chunks for retrieval.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Causal language model that generates the answers.
LLM_MODEL_NAME = "google/gemma-2b-it"


# Module-level state filled in by initialize_chatbot(); both stay None until
# initialization succeeds.
qa_chain = None
vector_store = None

# --- 1. Load Documents ---
def load_documents(data_path: str):
    """Load every PDF, DOCX and TXT file found under ``data_path``.

    The directory is searched recursively, one file type at a time, in the
    order PDF, DOCX, TXT. A progress line is printed for each type.

    Args:
        data_path: Directory containing the knowledge-base files.

    Returns:
        A list of loaded LangChain documents. The list is empty when the
        directory does not exist or contains no supported files, so callers
        can treat both situations as "nothing to index".
    """
    print(f"Loading documents from: {data_path}")
    documents = []

    # DirectoryLoader raises when the directory is missing; report that case
    # through the same "no documents" path the caller already handles.
    if not os.path.isdir(data_path):
        print(f"DEBUG: Directory '{data_path}' does not exist or is not a directory. Please check DATA_PATH.")
        return documents

    # (label used in log output, recursive glob, loader class)
    loader_specs = (
        ("PDF", "**/*.pdf", PyPDFLoader),
        ("DOCX", "**/*.docx", Docx2txtLoader),
        ("TXT", "**/*.txt", TextLoader),
    )
    for label, pattern, loader_cls in loader_specs:
        loaded = DirectoryLoader(data_path, glob=pattern, loader_cls=loader_cls).load()
        print(f"Loaded {len(loaded)} {label} documents.")
        documents.extend(loaded)

    print(f"Total loaded documents: {len(documents)}")
    if not documents:
        print("DEBUG: No documents were loaded from any supported format. Please check DATA_PATH and file types.")
    else:
        print(f"DEBUG: First 200 chars of first loaded doc: {documents[0].page_content[:200]}...")
    return documents

# --- 2. Chunk Documents ---
def chunk_documents(documents, *, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        documents: Documents as returned by ``load_documents``.
        chunk_size: Maximum size of each chunk, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter. Defaults to 200, the value previously hard-coded here.

    Returns:
        The list of chunks produced by the splitter (empty when nothing
        could be split).
    """
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")
    if not chunks:
        print("DEBUG: No chunks were created. This means either no documents loaded, or documents were empty/too small.")
    return chunks

# --- 3. Create Embeddings & Store in Vector DB ---
def _build_vector_store(chunks, embeddings, db_path: str):
    """Embed ``chunks`` into a new Chroma store persisted at ``db_path``."""
    new_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=db_path,
    )
    print("New ChromaDB created and populated.")
    return new_store


def setup_vector_store(chunks, db_path: str):
    """Return a Chroma vector store for ``chunks``, reusing a persisted one if possible.

    If ``db_path`` already holds a non-empty collection, that collection is
    returned as-is and ``chunks`` are NOT added to it. Delete the directory
    to force re-indexing after the knowledge base changes. Otherwise a new
    store is built from ``chunks``.

    Args:
        chunks: Document chunks to embed when a new store must be created.
        db_path: Directory used to persist the Chroma database.

    Returns:
        The loaded or newly created Chroma vector store.
    """
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device},
    )

    # isdir (rather than exists) keeps os.listdir from raising when a plain
    # file happens to sit at db_path.
    if not (os.path.isdir(db_path) and os.listdir(db_path)):
        print("ChromaDB not found or empty. Creating new database.")
        return _build_vector_store(chunks, embeddings, db_path)

    print("ChromaDB already exists. Attempting to load existing collection.")
    try:
        existing_store = Chroma(persist_directory=db_path, embedding_function=embeddings)
        # _collection is a private attribute of the Chroma wrapper; it is
        # queried once here instead of once per use.
        embedding_count = existing_store._collection.count()
    except Exception as e:
        # Broad on purpose: any failure to open the persisted DB falls back to
        # a rebuild. The rebuild is outside the try so its own errors surface
        # instead of being retried with the same inputs.
        print(f"Error loading existing ChromaDB: {e}. Re-creating.")
        return _build_vector_store(chunks, embeddings, db_path)

    if embedding_count > 0:
        print(f"Loaded {embedding_count} embeddings from existing DB.")
        return existing_store

    print("Existing ChromaDB is empty or invalid. Re-creating.")
    return _build_vector_store(chunks, embeddings, db_path)

# --- 4. Setup LLM and RetrievalQA Chain ---
def setup_qa_chain(current_vector_store):
    """Load the LLM and wire it to the vector store as a RetrievalQA chain.

    Args:
        current_vector_store: Vector store whose retriever supplies context
            (the top 3 matches per query).

    Returns:
        A ``RetrievalQA`` chain of type "stuff" that also returns its source
        documents.
    """
    print(f"Loading LLM: {LLM_MODEL_NAME}. This may take a few minutes...")
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)

    # Model-loading options: bfloat16 weights, automatic device placement.
    model_options = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "low_cpu_mem_usage": True,
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME, **model_options)

    # Sampling configuration for text generation.
    generation_options = {
        "max_new_tokens": 500,
        "temperature": 0.7,
        "do_sample": True,
        "top_k": 50,
        "num_return_sequences": 1,
    }
    text_generator = pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=llm_tokenizer,
        **generation_options,
    )

    llm = HuggingFacePipeline(pipeline=text_generator)

    print("Setting up RetrievalQA chain...")
    top_k_retriever = current_vector_store.as_retriever(search_kwargs={"k": 3})
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
    )
    print("RetrievalQA chain ready.")
    return chain

# --- Initialize Chatbot Components ---
def initialize_chatbot():
    """Build the chatbot pipeline once and cache it in module globals.

    Runs load -> chunk -> vector store -> QA chain. On success the results
    are stored in the module-level ``vector_store`` and ``qa_chain``, so
    later calls return the cached chain without repeating the work.

    Returns:
        The ready QA chain, or ``None`` when no documents or no chunks were
        produced (a message explaining the failure is printed).
    """
    global qa_chain, vector_store

    if qa_chain is not None:
        print("Chatbot already initialized.")
        return qa_chain

    print("\n--- Initializing Chatbot Components ---")
    documents = load_documents(DATA_PATH)
    if not documents:
        # Name the directory actually read, so the hint cannot drift from DATA_PATH.
        print(f"Initialization failed: No documents loaded. Please check your '{DATA_PATH}' folder and files.")
        return None

    chunks = chunk_documents(documents)
    if not chunks:
        print("Initialization failed: No chunks created. Ensure documents are not empty.")
        return None

    vector_store = setup_vector_store(chunks, CHROMA_DB_PATH)
    qa_chain = setup_qa_chain(vector_store)
    print("--- Chatbot Initialization Complete ---")
    return qa_chain

# --- Gradio Interface Function---
# Text after which the model's own answer begins in the raw output.
# NOTE(review): presumably the tail of the default RetrievalQA "stuff" prompt,
# which the pipeline echoes back — confirm against the installed LangChain.
_HELPFUL_ANSWER_MARKER = "Helpful Answer:"

# Lower-cased fragments that mark a response as a non-answer.
_UNHELPFUL_PHRASES = (
    "i don't know",
    "i cannot answer",
    "no information found",
    "based on the provided information, i cannot answer",
    "the provided context does not contain information",
    "i am unable to answer",
    "i can't find the answer",
    "not mentioned in the provided text",
    "i am just a language model",
)


def _extract_answer(full_llm_response: str) -> str:
    """Return only the answer portion of the raw LLM output."""
    _, marker, tail = full_llm_response.partition(_HELPFUL_ANSWER_MARKER)
    if not marker:
        # No marker: fall back to the whole response minus the model-name preamble.
        return full_llm_response.replace(f"<bos>{LLM_MODEL_NAME.split('/')[-1]}\n", "").strip()
    # Drop anything from a following "**Answer:" section onwards.
    answer, _, _ = tail.strip().partition("**Answer:")
    return answer.strip()


def _is_unhelpful(answer: str) -> bool:
    """Report whether ``answer`` contains one of the known non-answer phrases."""
    lower_answer = answer.lower()
    return any(phrase in lower_answer for phrase in _UNHELPFUL_PHRASES)


def chat_interface(user_query):
    """Answer ``user_query`` with the QA chain; used as the Gradio callback.

    Args:
        user_query: Question text typed by the user.

    Returns:
        The extracted answer, or a fixed user-facing message when the chatbot
        is not initialized, the query is blank, the model gave a non-answer,
        or an error occurred while generating the response.
    """
    if qa_chain is None:
        return "Chatbot is still initializing. Please wait a moment and try again."

    # Do not spend an LLM call on an empty or whitespace-only question.
    if not user_query or not user_query.strip():
        return "Please enter a question."

    print(f"\nUser Query: {user_query}")
    try:
        result = qa_chain({"query": user_query})
        answer = _extract_answer(result["result"])
    except Exception as e:
        # UI boundary: log the failure and show a friendly message instead of a traceback.
        print(f"An internal error occurred during response generation: {e}")
        return "I'm sorry, an internal issue prevented me from answering your question. Please try again or ask a different question."

    if _is_unhelpful(answer):
        return "I apologize, but I couldn't find a helpful answer to your question in the documents I have access to."
    return answer


# --- Main Execution ---
if __name__ == "__main__":
    # Build the full pipeline first; the UI is only started when that succeeded.
    initialized_qa_chain = initialize_chatbot()

    if not initialized_qa_chain:
        print("\nGradio interface not launched due to chatbot initialization failure.")
    else:
        print("\n--- Launching Gradio Interface ---")
        question_box = gr.Textbox(lines=2, placeholder="Ask me a question about your documents...")
        iface = gr.Interface(
            fn=chat_interface,
            inputs=question_box,
            outputs="textbox",
            title="RAG Chatbot Demo (powered by Hugging Face & LangChain)",
            description="Ask questions based on the documents you provided. This demo uses a local Gemma-2b-it LLM.",
            live=False,
        )
        # NOTE(review): share=True requests a publicly reachable Gradio link;
        # confirm that is acceptable for a knowledge base of internal policies.
        iface.launch(share=True, debug=True)

### Summary

This notebook provides a full pipeline for building a **RAG-based chatbot** that:
- Loads and processes legal documents
- Converts them into embeddings
- Answers user queries through a web interface
- Provides helpful and context-aware legal/regulatory answers
