In [5]:
# Install required packages for the RAG system
%pip install -q langchain-ollama langchain langchain-community faiss-cpu langchain_huggingface rank_bm25 gradio nest_asyncio markdown2 ipywidgets langchain_openai

[0mNote: you may need to restart the kernel to use updated packages.


# CyberArk API Documentation Assistant
This notebook implements a Retrieval-Augmented Generation (RAG) system using LangChain components to answer questions about CyberArk API documentation.

## Steps Involved
This implementation includes:
1. **Document Storage**: Pre-processed CyberArk API documentation
2. **Embedding Model**: Converts text to vector representations
3. **Retriever Component**: Multiple retrieval methods combined in an ensemble
4. **Language Model**: Llama 3.2 via Ollama for generating responses
5. **User Interface**: Both Gradio web interface and Jupyter widget interface

In [None]:
# Initialize embedding model and load vector store
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS 

# Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load pre-existing vector store
loaded_faiss_store = FAISS.load_local(
    "/workspaces/RAG_BOT/LocalEmbeddings/Hugging_5k_Text_enriched_faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True
)
print("FAISS vector store loaded successfully.")

FAISS vector store loaded successfully.


## Vector Store and Embeddings
This section initializes the embedding model and loads the pre-processed vector store:

- **Embedding Model**: We use the `all-MiniLM-L6-v2` model from HuggingFace, which converts text into 384-dimensional vectors that capture semantic meaning.

- **FAISS Vector Store**: FAISS (Facebook AI Similarity Search) is a library for efficient similarity search. We load a pre-built index containing embedded CyberArk API documentation.

The vector store contains document chunks that have been preprocessed, embedded, and indexed for fast retrieval based on semantic similarity.

In [7]:
# Set up retrievers
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Extract documents from the docstore
try:
    all_docs = [loaded_faiss_store.docstore._dict[doc_id] for doc_id in loaded_faiss_store.index_to_docstore_id.values()]
except AttributeError:
    # Fallback for different docstore structure
    all_docs = [loaded_faiss_store.docstore.get(doc_id) for doc_id in loaded_faiss_store.index_to_docstore_id.values()]

# Configure different retrieval methods
bm25_retriever = BM25Retriever.from_documents(all_docs)
bm25_retriever.k = 2

# Vector similarity retrieval with MMR
basic_retriever = loaded_faiss_store.as_retriever(search_type="mmr", search_kwargs={"k": 2})

# Vector similarity with score threshold
sst_retriever = loaded_faiss_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.3, "k": 2}
)

# Create ensemble retriever combining multiple retrieval methods
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, basic_retriever, sst_retriever],
    weights=[0.4, 0.3, 0.3]
)

## Retrieval Methods
This section sets up multiple retrieval strategies and combines them into an ensemble:

1. **BM25 Retriever (40% weight)**: A keyword-based retrieval algorithm that excels at finding documents containing exact terms from the query.

2. **MMR Retriever (30% weight)**: Maximum Marginal Relevance balances relevance with diversity to avoid returning redundant documents. It uses vector similarity but ensures retrieved documents are diverse.

3. **Similarity Score Threshold Retriever (30% weight)**: Only retrieves documents with a similarity score above 0.3, ensuring a minimum level of relevance.

The **EnsembleRetriever** combines these approaches to leverage the strengths of each method. This hybrid approach often performs better than any single method alone, capturing both semantic similarity and keyword matches.

In [9]:
# Initialize the LLM with rate limiting
from langchain_ollama import ChatOllama
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_openai import ChatOpenAI
from getpass import getpass

# Configure rate limiting to prevent overloading the model
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.1,
    check_every_n_seconds=0.1,
    max_bucket_size=10,
)

openai_api_key = getpass("Enter your OpenAI API key: ")

# Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    openai_api_key = openai_api_key,
    temperature=0.1,
    rate_limiter=rate_limiter
)


## Language Model Configuration
This section initializes the Large Language Model (LLM) with rate limiting:

- **Model**: Using Llama 3.2 via Ollama, a powerful open-source model capable of following instructions and generating coherent responses.

- **Temperature**: Set to 0.1, which keeps outputs more deterministic and focused. Lower temperature values reduce creativity but increase reliability and consistency.

- **Rate Limiting**: Prevents overloading the model by limiting requests to 0.1 per second (1 request every 10 seconds). This is important when running locally to manage resource usage.

The rate limiter uses a token bucket algorithm, allowing for bursts of requests up to the max_bucket_size, but maintaining the average rate over time.

In [None]:
# Set up prompt templates
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Define system instructions for the model
SYSTEM_PROMPT = """
You are a highly knowledgeable CyberArk API documentation assistant. Your job is to answer developer questions accurately and clearly using only the provided API documentation context.

RESPONSE RULES:
IF the user input is a general greeting (like "hello", "hi", "hey"):
   - Respond politely as a friendly assistant
   - Example: "Hello! I'm your CyberArk API assistant. How can I help you with the CyberArk API today?"

ELSE IF the user is asking about a specific API endpoint:
   - Provide detailed endpoint information including:
     * Path and HTTP method (GET, POST, PUT, DELETE)
     * Required parameters (query, path, body)
     * Security requirements
     * Request body schema (in JSON if available)
     * Response body schema (in JSON if available)
     * Sample request and response if present
   - Format your response with markdown:
     * Use headers for sections
     * Use code blocks for JSON examples
     * Use bullet points for lists of properties

ELSE IF the user is asking a general question about CyberArk API functionality:
   - Answer based ONLY on the provided context
   - Be concise and direct
   - Use bullet points for clarity when appropriate
   - Include relevant code examples if available in the context

ELSE IF the question is outside the scope of CyberArk API documentation:
   - Politely state: "I'm specialized in CyberArk API documentation. I don't have information about that topic in my knowledge base."

For ALL responses related to CyberArk API:
   - NEVER invent or guess missing details
   - If the answer is not in the provided context, say "I don't have that specific information in the documentation I can access."
   - When multiple options exist, choose the most relevant to the user's query
   - Answer as if you are the official CyberArk API documentation chatbot, providing accurate and helpful information based on the context provided.
"""

system_message = SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT)

# Define how user questions and context are formatted
human_message = HumanMessagePromptTemplate.from_template(
    """
You are answering questions about CyberArk's API. Use the documentation context

Documentation Context:
----------------------
{context}

New User Question:
----------------------
{question}
"""
)

## Prompt Engineering
This section defines the instructions that guide the LLM's behavior. Well-crafted prompts are crucial for RAG systems to ensure the model uses the retrieved context correctly.

The prompt consists of two parts:

1. **System Prompt**: Contains detailed instructions for the model about its role and constraints. This prompt:
   - Defines the assistant's identity as a CyberArk API documentation expert
   - Sets strict boundaries to only use the provided context
   - Provides detailed formatting instructions for different types of API information
   - Prevents hallucination by instructing the model to admit when it doesn't know

2. **Human Message Template**: Structures how the retrieved context and user question are presented to the model. This maintains a clear separation between:
   - Documentation context (retrieved from the vector store)
   - The user's actual question

This structured approach helps ensure the model stays factual and provides answers directly from the documentation.

In [11]:
# Core answer generation function
def get_answer(query, description="", chat_history=[], retriever=ensemble_retriever):
    """
    Generate an answer to a user query using RAG with the provided retriever.

    Args:
        query (str): The user's question
        description (str): Optional label for debugging
        chat_history (list): Previous conversation turns
        retriever: The retrieval component to use

    Returns:
        dict: Contains the generated answer and source documents
    """
    from langchain_core.runnables import RunnablePassthrough

    # Retrieve relevant context
    sample_context = retriever.invoke(query)
    print(f"\n=== {description} ===")
    print(f"Question: {query}")
    print("Retrieved context:")
    print(sample_context)

    # Create the prompt with system and human messages
    CONVERSATIONAL_PROMPT = ChatPromptTemplate.from_messages([
        system_message,
        human_message
    ])

    # Create chain to process the prompt and generate response
    chain = CONVERSATIONAL_PROMPT | llm

    # Combine retrieved documents into context text
    context_text = "\n\n".join([doc.page_content for doc in sample_context])

    # Generate the answer
    result = chain.invoke({
        "context": context_text,
        "question": query
    })

    print("Answer:")
    print(result.content)

    # Return result in standard format
    return {
        "result": result.content,
        "source_documents": sample_context
    }

In [12]:
# Function to create QA chain and get answer
def get_answer_v2(query, description="", chat_history = [], retriever = ensemble_retriever):
    from langchain.chains import RetrievalQA

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )

    # Test the prompt with a sample invocation
    sample_question = query
    sample_context = retriever.invoke(query)
    print(f"\n=== {description} ===")
    print(f"sample_question: {sample_question}")
    print("SAMPLE_CONTEXT")
    print(sample_context)
    CONVERSATIONAL_PROMPT = ChatPromptTemplate.from_messages([
     system_message,
     human_message
    ])

    print(CONVERSATIONAL_PROMPT)

    # Format the prompt with context and question before invoking the chain
    formatted_prompt = CONVERSATIONAL_PROMPT.format(
        context="\n\n".join([doc.page_content for doc in sample_context]),
        question=sample_question
    )

    result = qa_chain.invoke({"query": formatted_prompt})

    print("Answer:")
    print(result["result"])
    print(f"Number of source documents: {len(result['source_documents'])}")

    return {
        "result": result["result"],
        "source_documents": result['source_documents']
    }

In [13]:
# Test the retrieval and answer generation
get_answer_v2("How to delete a policy?", "Test Query")


=== Test Query ===
sample_question: How to delete a policy?
SAMPLE_CONTEXT
input_variables=['context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='\nYou are a highly knowledgeable CyberArk API documentation assistant. Your job is to answer developer questions accurately and clearly using only the provided API documentation context.\n\nFor general greetings or small talk (like "hello", "hi"), respond politely as a friendly assistant.\n\nFor Cyberark API documentation related questions, Your answers must follow these rules:\n\n1. Use only the given context. If the answer is not in the context, say "I don\'t know based on the provided documentation."\n2. If the user asks about an endpoint, provide its details from the context including:\n   - Path and method\n   - Required parameters (query, path, body)\n   - Security requirements\n   - Request body schema (

{'result': 'To delete a policy, you can use the Delete Policy endpoint. Here are the details:\n\n### Endpoint Details\n- **Path:** `/Policy/DeletePolicyBlock`\n- **Method:** `POST`\n\n### Required Parameters\n- **Query Parameter:**\n  - `path` (string): The policy block to delete.\n\n### Security Requirements\n- Bearer authentication is required. A valid bearer token must be included in the request headers.\n\n### Request Body Schema\nYou need to send a JSON payload with the following structure:\n```json\n{\n  "path": "string_value"\n}\n```\n\n### Response Body Schema\nThe response will be in JSON format with the following structure:\n```json\n{\n  "Result": boolean,\n  "Error": {\n    // Error message text on failure, may be null\n  }\n}\n```\n\n### Sample Request\n```json\n{\n  "path": "your_policy_block_path"\n}\n```\n\n### Sample Response\n```json\n{\n  "Result": true,\n  "Error": {}\n}\n```\n\n### Error Handling\n- If the policy block does not exist, an error message will be retur

# UI Interface

In [14]:
def stream_answer(query, chat_history=[]):
    """
    Generate an answer and stream it in chunks for gradual display.

    Args:
        query (str): The user's question
        chat_history (list): Previous conversation turns

    Yields:
        str: Chunks of the answer text
    """
    # Incorporate chat history for context if available
    full_query = query
    if len(chat_history) > 0:
        history_context = ""
        for turn in chat_history[-5:]:  # Use last 5 turns
            history_context += f"User: {turn[0]}\nAssistant: {turn[1]}\n"
        full_query = history_context + f"User: {query}"

    # Generate the complete answer
    result = get_answer_v2(full_query, "Streaming Answer", chat_history)
    text = result["result"]

    # Stream the answer in chunks
    chunk_size = 50
    for i in range(0, len(text), chunk_size):
        yield text[i:i+chunk_size]

# Gradio web interface

In [None]:
import gradio as gr

def gradio_stream_fn(message, history):
    """
    Streaming function for Gradio chat interface.

    Args:
        message (str): Current user message
        history (list): Chat history as [(user1, bot1), (user2, bot2), ...]

    Yields:
        str: Incremental answer text for display
    """
    answer_chunks = stream_answer(message, history)
    partial_answer = ""
    for chunk in answer_chunks:
        partial_answer += chunk
        yield partial_answer

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 CyberArk API RAG ChatBot")
    chatbot = gr.ChatInterface(
        fn=gradio_stream_fn,
        chatbot=gr.Chatbot(),
        examples=[
            "Hi"
        ],
        title="CyberArk API Assistant",
        description="Ask questions about CyberArk API. The assistant will answer using the official documentation.",
        theme="default"
    )

# Launch the web interface
demo.launch(height=800, width=1200)

  chatbot=gr.Chatbot(),


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.3



=== Streaming Answer ===
sample_question: HI
SAMPLE_CONTEXT
input_variables=['context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='\nYou are a highly knowledgeable CyberArk API documentation assistant. Your job is to answer developer questions accurately and clearly using only the provided API documentation context.\n\nFor general greetings or small talk (like "hello", "hi"), respond politely as a friendly assistant.\n\nFor Cyberark API documentation related questions, Your answers must follow these rules:\n\n1. Use only the given context. If the answer is not in the context, say "I don\'t know based on the provided documentation."\n2. If the user asks about an endpoint, provide its details from the context including:\n   - Path and method\n   - Required parameters (query, path, body)\n   - Security requirements\n   - Request body schema (in JSON if avai

# DEBUG UI
This section provides an alternative interface using Jupyter widgets for debugging and development purposes. It displays both the retrieved context and the generated answer side by side, which is helpful for analyzing the system's performance and identifying areas for improvement.

In [11]:
import ipywidgets as widgets
from IPython.display import display, HTML, Markdown, clear_output
import markdown2

def start_ui():
    """
    Simple Jupyter notebook UI for the CyberArk API documentation assistant.
    Uses the existing get_answer logic with enhanced display.
    """
    # Create widgets
    header = widgets.HTML("<h1 style='color:#0066cc'>CyberArk API Documentation Assistant</h1>")
    question = widgets.Text(
        description='Question:',
        placeholder='Enter your question here...',
        layout=widgets.Layout(width='80%')
    )

    submit = widgets.Button(
        description='Submit',
        button_style='primary',
        layout=widgets.Layout(width='100px')
    )

    clear_btn = widgets.Button(
        description='Clear',
        button_style='warning',
        layout=widgets.Layout(width='100px')
    )

    context_area = widgets.Output()
    answer_area = widgets.Output()

    # Function to handle question submission
    def on_submit_clicked(b):
        # Clear previous outputs
        with context_area:
            clear_output()
        with answer_area:
            clear_output()

        query = question.value
        if not query.strip():
            with answer_area:
                print("Please enter a valid question.")
            return

        # Get context
        sample_context = ensemble_retriever.invoke(query)

        # Display context
        with context_area:
            print("=== Retrieved Context ===")
            for i, doc in enumerate(sample_context):
                print(f"\nDocument {i+1}:")
                print("-" * 40)
                print(doc.page_content)
                print("-" * 40)

        # Create prompt and get answer
        CONVERSATIONAL_PROMPT = ChatPromptTemplate.from_messages([
            system_message,
            human_message
        ])

        formatted_prompt = CONVERSATIONAL_PROMPT.format(
            context="\n\n".join([doc.page_content for doc in sample_context]),
            question=query
        )

        from langchain.chains import RetrievalQA
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=ensemble_retriever,
            return_source_documents=True
        )

        result = qa_chain.invoke({"query": formatted_prompt})

        # Display answer with markdown formatting
        with answer_area:
            display(HTML("<h3>Answer:</h3>"))
            # Convert markdown to HTML for better display
            html_content = markdown2.markdown(result["result"], extras=["fenced-code-blocks", "tables"])
            display(HTML(html_content))
            print(f"\nNumber of source documents: {len(result['source_documents'])}")

    # Function to clear the interface
    def on_clear_clicked(b):
        question.value = ""
        with context_area:
            clear_output()
        with answer_area:
            clear_output()

    # Connect buttons to handlers
    submit.on_click(on_submit_clicked)
    clear_btn.on_click(on_clear_clicked)

    # Layout the UI components
    input_row = widgets.HBox([question, submit, clear_btn])

    context_box = widgets.VBox([
        widgets.HTML("<h3>Retrieved Context:</h3>"),
        context_area
    ])

    answer_box = widgets.VBox([
        widgets.HTML("<h3>Answer:</h3>"),
        answer_area
    ])

    # Assemble the final UI
    ui = widgets.VBox([
        header,
        widgets.HTML("<p>Ask any question about the CyberArk API:</p>"),
        input_row,
        widgets.HBox([answer_box, widgets.HTML("</br>"),context_box])
    ])

    display(ui)

# Install markdown2 for better HTML rendering
%pip install -q markdown2

# Launch the UI
start_ui()

VBox(children=(HTML(value="<h1 style='color:#0066cc'>CyberArk API Documentation Assistant</h1>"), HTML(value='…