In [None]:
# Mount Google Drive at /content/drive; later cells read files from below
# this mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install peft
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes



In [None]:
#Hugging Face Embedder
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [None]:
# Configure the llama-index global Settings object used by the cells below.
# The embedding model is pulled from the HF hub
# (leaderboard: https://huggingface.co/spaces/mteb/leaderboard).
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Alternative embedding model, kept for reference:
#Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
# No LLM is attached to llama-index here; the recorded cell output reports
# that it falls back to MockLLM. Text generation is done later in the notebook
# with a separately loaded model.
Settings.llm = None
# Chunking parameters applied when documents are split
# (units are presumably tokens -- TODO confirm against the llama-index docs).
Settings.chunk_size = 256
Settings.chunk_overlap = 25

LLM is explicitly disabled. Using MockLLM.


In [None]:
# Load Data Source
# Reads the contents of the RAG folder on Google Drive. The path is absolute
# and only resolves once Drive has been mounted at /content/drive.
documents = SimpleDirectoryReader("/content/drive/MyDrive/RAG").load_data()

In [None]:
# Store docs into vector DB.
# Builds a VectorStoreIndex from the loaded documents; presumably the
# documents are chunked and embedded using the global Settings configured
# earlier -- TODO confirm against the llama-index docs.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Number of chunks to retrieve per query.
top_k = 3

# Retriever bound to the vector index, asked for the `top_k` most similar chunks.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [None]:
# Assemble the query engine: the retriever supplies candidate chunks and the
# similarity post-processor (cutoff 0.5) filters out those that are not
# similar enough to the query.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
    retriever=retriever,
)

In [None]:
# Query documents.
# Alternative query, kept for reference:
#query = "what is the core feature of RAG models?"
# NOTE(review): "highes" looks like a typo for "highest"; it is left untouched
# because the outputs recorded in this notebook were produced with this exact
# string.
query = "what is the highes mountain in the world"
# `response` carries the retrieved chunks in `response.source_nodes`, which the
# context-building cell reads.
response = query_engine.query(query)

In [None]:
# Reformat the retrieval result into a single "Context:" block for the prompt.
#
# The query engine filters retrieved chunks through a similarity cutoff, so
# `response.source_nodes` can hold fewer than `top_k` entries. Indexing it with
# range(top_k) raised IndexError in that case; iterate over the nodes that are
# actually present instead (still capped at `top_k`).
context = "Context:\n" + "".join(
    node.text + "\n\n" for node in response.source_nodes[:top_k]
)

print(context)

Context:
It ranks the document
with the highest number of query terms, normalized by document
length, at the top. We selected BM25 because previous research
[23] has demonstrated its effectiveness for code-to-code retrieval.

ConfusedPilot: Confused Deputy Risks in
RAG-based LLMs
Ayush RoyChowdhury†, Mulong Luo †1, Prateek Sahu †2, Sarbartha Banerjee †2, and Mohit Tiwari †‡1
† The University of Texas at Austin
‡ Symmetry Systems
https://confusedpilot.info
Abstract—Retrieval augmented generation (RAG) is a process
where a large language model (LLM) retrieves useful information
from a database and then generates the responses. It is becoming
popular in enterprise settings for daily business operations. For
example, Copilot for Microsoft 365has accumulated millions of
businesses. However, the security implications of adopting such
RAG-based systems are unclear.
In this paper, we introduce ConfusedPilot, a class of security
vulnerabilities of RAG systems that confuse Copilot and cause
inte

In [None]:
# Load LLM
# Loads the checkpoint named by `model_name` from the Hugging Face hub.
# NOTE(review): the original comment said "load fine-tuned model", but
# PeftModel / PeftConfig are imported and never used in the cells shown, so no
# adapter is applied here -- the checkpoint is used as published.
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
# device_map="auto" leaves device placement to the library;
# trust_remote_code=False means no custom code from the repo is executed;
# revision="main" selects the repo's main branch.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

Some weights of the model checkpoint at TheBloke/Mistral-7B-Instruct-v0.2-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'model.layers.11

In [None]:
# Reuse the retrieval query as the user comment inserted into the LLM prompts.
comment = query

In [None]:
# prompt (no context)

def prompt_template(comment: str) -> str:
    """Build the prompt that asks the LLM to answer without retrieved context.

    Defined with ``def`` rather than a lambda bound to a name, so the function
    has a real ``__name__`` (useful in tracebacks) and can carry a docstring.

    Args:
        comment: The user comment/question to embed in the prompt.

    Returns:
        The prompt text: two leading newlines, the instruction line, a blank
        line, the comment, and two trailing newlines.
    """
    return f"""

Please respond to the following comment in a user friendly, conversational manner

{comment}

"""



In [None]:
# Prompt with no context: fill the no-context template with the user comment
# and print it for inspection. The generation cell that follows reads `prompt`.
prompt = prompt_template(comment)
print(prompt)



Please respond to the following comment in a user friendly, conversational manner

what is the highes mountain in the world




In [None]:
# Generate an answer from the current `prompt` (no retrieved context).
inputs = tokenizer(prompt, return_tensors="pt")

# Move the tensors to the device the model was actually placed on instead of
# hard-coding "cuda".
outputs = model.generate(
    input_ids=inputs["input_ids"].to(model.device),
    attention_mask=inputs["attention_mask"].to(model.device),
    max_new_tokens=280,
    pad_token_id=tokenizer.eos_token_id
)

# For this causal LM, generate() returns the prompt tokens followed by the
# newly generated ones. Drop the prompt by token count before decoding:
# removing it afterwards with str.replace() silently fails when decoding does
# not reproduce the prompt text exactly, and it would also delete any later
# repetition of the prompt inside the answer.
prompt_length = inputs["input_ids"].shape[-1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
response = generated_text.strip()
print("Vanilla LLM Output:\n" , response)


Vanilla LLM Output:
 Hi there! The highest mountain in the world is Mount Everest. It's located in the Himalayas on the border between Nepal and Tibet. Everest stands an impressive 29,029 feet (8,848 meters) tall, making it the ultimate mountain challenge for climbers from all around the world. Have a great day!


In [None]:
# prompt (with context)

def prompt_template_w_context(context: str, comment: str) -> str:
    """Build the prompt that asks the LLM to answer using retrieved context.

    Defined with ``def`` rather than a lambda bound to a name, so the function
    has a real ``__name__`` (useful in tracebacks) and can carry a docstring.

    Args:
        context: The retrieved context text, placed ahead of the instruction.
        comment: The user comment/question to embed in the prompt.

    Returns:
        The prompt text: a leading newline, the context, the instruction line,
        a blank line, the comment, and two trailing newlines.
    """
    return f"""
{context}
Please respond to the following comment in a user friendly, conversational manner. Use the context above if it is helpful.

{comment}

"""

In [None]:
# Prompt with context: fill the context-aware template with the retrieved
# context and the user comment, then print it for inspection.
# Note that this rebinds `prompt`, replacing the no-context prompt.
prompt = prompt_template_w_context(context, comment)
print(prompt)


Context:
It ranks the document
with the highest number of query terms, normalized by document
length, at the top. We selected BM25 because previous research
[23] has demonstrated its effectiveness for code-to-code retrieval.

ConfusedPilot: Confused Deputy Risks in
RAG-based LLMs
Ayush RoyChowdhury†, Mulong Luo †1, Prateek Sahu †2, Sarbartha Banerjee †2, and Mohit Tiwari †‡1
† The University of Texas at Austin
‡ Symmetry Systems
https://confusedpilot.info
Abstract—Retrieval augmented generation (RAG) is a process
where a large language model (LLM) retrieves useful information
from a database and then generates the responses. It is becoming
popular in enterprise settings for daily business operations. For
example, Copilot for Microsoft 365has accumulated millions of
businesses. However, the security implications of adopting such
RAG-based systems are unclear.
In this paper, we introduce ConfusedPilot, a class of security
vulnerabilities of RAG systems that confuse Copilot and cause
int

In [None]:
# Generate an answer from the current `prompt` (which includes the retrieved
# context).
inputs = tokenizer(prompt, return_tensors="pt")

# Move the tensors to the device the model was actually placed on instead of
# hard-coding "cuda".
outputs = model.generate(
    input_ids=inputs["input_ids"].to(model.device),
    attention_mask=inputs["attention_mask"].to(model.device),
    max_new_tokens=280,
    pad_token_id=tokenizer.eos_token_id
)

# For this causal LM, generate() returns the prompt tokens followed by the
# newly generated ones. Drop the prompt by token count before decoding:
# removing it afterwards with str.replace() silently fails when decoding does
# not reproduce the prompt text exactly, and it would also delete any later
# repetition of the prompt inside the answer.
prompt_length = inputs["input_ids"].shape[-1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
# `response` is read by the evaluation cell further down.
response = generated_text.strip()
print("RAG-Powered Output:\n" , response)

RAG-Powered Output:
 Assuming you're asking about the highest mountain in the world based on its height above sea level, that would be Mount Everest. However, if we're talking about a mountain in the context of code retrieval, the answer might depend on the specific database and query terms used. In the context of the paper you provided, the authors discuss the security implications of large language models like Copilot retrieving information from databases and generating responses. They introduce the concept of ConfusedPilot, which are vulnerabilities in RAG systems that can cause integrity and confidentiality violations. If you're interested in learning more about these vulnerabilities, I would recommend reading the paper. But to answer your original question, the highest mountain in the world is Mount Everest, with a height of approximately 29,029 feet or 8,848 meters above sea level.


In [None]:
# Evaluation Prompt
# Template for the LLM-as-judge check. It has two str.format() placeholders,
# {expected_response} and {actual_response}, and asks for a one-word
# 'true' / 'false' verdict.
# Gotcha: the instruction line itself contains the words 'true' and 'false',
# so any verdict parsing must not look at an echoed copy of this prompt.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""

def evaluate_with_llm(prompt: str) -> str:
    """Run `prompt` through the loaded LLM and return only its completion.

    The previous version decoded the whole output sequence, i.e. the prompt
    followed by the completion. Because the evaluation prompt itself contains
    the words 'true' and 'false', any substring check on that result matched
    the prompt rather than the model's verdict. Only the newly generated
    tokens are decoded now.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        The generated completion (prompt excluded), stripped and lower-cased.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Send the tensors to the device the model lives on rather than assuming
    # a device named "cuda" exists.
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id
    )

    # The output sequence starts with the prompt tokens; skip them by length.
    prompt_length = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip().lower()

def query_and_validate(expected_response: str, actual_response: str) -> bool:
    """Ask the LLM whether `actual_response` matches `expected_response`.

    The evaluation prompt is built from EVAL_PROMPT, sent through
    evaluate_with_llm(), and the verdict is parsed from the reply. Both the
    prompt and the raw reply are printed for inspection.

    The previous parsing used `"true" in evaluation_result`. That is always
    satisfied when the reply echoes the prompt (whose instruction line contains
    both 'true' and 'false'), and it also matches inside other words such as
    "untrue". The parsing now discards an echoed prompt and looks for the
    first whole-word verdict.

    Args:
        expected_response: The reference answer.
        actual_response: The answer produced by the pipeline under test.

    Returns:
        True if the model's verdict is 'true', False if it is 'false'.

    Raises:
        ValueError: If no 'true'/'false' verdict is found in the reply.
    """
    import re  # local import so this definition does not rely on another cell

    prompt = EVAL_PROMPT.format(
        expected_response=expected_response,
        actual_response=actual_response
    )

    print("\n=== Evaluation Prompt ===")
    print(prompt)

    evaluation_result = evaluate_with_llm(prompt)

    print("\n=== Evaluation Result ===")
    print(evaluation_result)

    verdict_text = evaluation_result.strip().lower()

    # If the reply echoes the prompt, keep only what follows the prompt's final
    # line so the instruction's own 'true'/'false' words are never parsed.
    prompt_lines = prompt.strip().lower().splitlines()
    if prompt_lines and prompt_lines[-1]:
        _, echoed, tail = verdict_text.rpartition(prompt_lines[-1])
        if echoed:
            verdict_text = tail

    # The first whole-word verdict wins; substrings such as "untrue" do not count.
    verdict = re.search(r"\b(true|false)\b", verdict_text)
    if verdict is None:
        raise ValueError("Invalid evaluation result. Expected 'true' or 'false'.")
    return verdict.group(1) == "true"

# Evaluate the actual RAG response.
# `response` is the text produced by the RAG generation cell; the call's
# boolean result is the cell's displayed value.
query_and_validate(
    # Alternative expectation, kept for reference:
    #expected_response="retrieval mechanism",
    expected_response="Everest",
    actual_response=response
)



=== Evaluation Prompt ===

Expected Response: Everest
Actual Response: Assuming you're asking about the highest mountain in the world based on its height above sea level, that would be Mount Everest. However, if we're talking about a mountain in the context of code retrieval, the answer might depend on the specific database and query terms used. In the context of the paper you provided, the authors discuss the security implications of large language models like Copilot retrieving information from databases and generating responses. They introduce the concept of ConfusedPilot, which are vulnerabilities in RAG systems that can cause integrity and confidentiality violations. If you're interested in learning more about these vulnerabilities, I would recommend reading the paper. But to answer your original question, the highest mountain in the world is Mount Everest, with a height of approximately 29,029 feet or 8,848 meters above sea level.
---
(Answer with 'true' or 'false') Does the act

True