# Improving a Fine-tuned Model Using RAG

Code authored by: Shaw Talebi <br>
Article link: https://towardsdatascience.com/how-to-improve-llms-with-rag-abdc132f76ac <br>
Video link: https://youtu.be/Ylz779Op9Pw?si=iOvBETQDrgoK_sO6 <br>
<br>
Colab: https://colab.research.google.com/drive/1peJukr-9E1zCo1iAalbgDPJmNMydvQms?usp=sharing

### Imports

In [1]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install peft
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes

Collecting llama-index
  Downloading llama_index-0.11.11-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.4 (from llama-index)
  Downloading llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama-index)
  Downloading llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.10 (from llama-index)
  Downloading llama_index_core-0.11.11-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.4 (from llama-index)
  Downloading llama_index_embeddings_openai-0.2.5-py3-none-any.whl.metadata (686 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.3.1-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post3-py3-none-any.whl.metadata (8.5 kB)
Collecti

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [3]:
# Mount Google Drive so files stored there are readable under /content/drive/ (Colab only).
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


### Define Settings

In [8]:
# Use a local Hugging Face embedding model so VectorStoreIndex does not fall back to OpenAI embeddings (which would require an API key we don't have).
# Any embedding model on the HF hub can be used (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

Settings.llm = None # No LLM inside LlamaIndex (it reports using MockLLM): the query engine is only used for retrieval, and the prompt passed to the model is assembled by hand later on.
Settings.chunk_size = 256 # Maximum size of each text chunk, measured in tokens (not characters).
Settings.chunk_overlap = 25 # Amount of text shared between neighbouring chunks (also in tokens), so an idea isn't abruptly chopped off at a chunk boundary.

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

LLM is explicitly disabled. Using MockLLM.


### Read and Store Docs into Vector DB

In [5]:
# Read every file in the thesis folder on the mounted Drive into a list of documents.
# The path is absolute because Drive is mounted at /content/drive/; the previous relative
# path ("drive/MyDrive/...") only resolved while the working directory happened to be /content.
# TODO: add a link to the repository where the source PDFs are published.
documents = SimpleDirectoryReader("/content/drive/MyDrive/Thesis Paper (PDF)/").load_data()

In [6]:
# Ad hoc document refinement: drop front-matter pages (committee approval page,
# dotted table-of-contents lines, list-of-tables page) that carry no useful context.
boilerplate_markers = ("The Undersigned Committee", "..............", "ix LIST OF TABLES")

print(len(documents))

# Build a new list instead of calling documents.remove() inside the loop: removing
# from a list while iterating over it shifts the remaining items, so the element
# right after each deleted one was silently skipped and never checked.
kept_documents = []
for doc in documents:
    if any(marker in doc.text for marker in boilerplate_markers):
        print("Deleted: " + doc.text)
    else:
        kept_documents.append(doc)
documents = kept_documents

print(len(documents))

154
Deleted: WENTWORTH INSTITUTE OF TECHNOLOGY
The Undersigned Committee Approves the
Thesis of Tommy Nguyen:
A Novel Reinforcement Learning Approach for Utilizing Neural Radiance Fields In the
Construction of Adversarial Objects Within the Scope of Adversarial Architecture
Dr. Mehmet Ergezer, Chair
School of Computing and Data Science
Dr. Micah Schuster
School of Computing and Data Science
Dr. Antonio Furgiuele
School of Architecture and Design
Deleted: vii
TABLE OF CONTENTS
PAGE
ABSTRACT ............................................................................... vi
LIST OF TABLES ........................................................................ ix
LIST OF FIGURES ....................................................................... x
ACKNOWLEDGMENTS ................................................................. xv
CHAPTER
1 INTRODUCTION ................................................................. 1
1.1 Problem Statement............................................

In [9]:
# store docs into vector DB
# Builds the index from the filtered `documents` list; presumably this chunks and embeds the text
# using the values configured on `Settings` — confirm against the LlamaIndex docs.
index = VectorStoreIndex.from_documents(documents)

### Set Up Search Function

In [None]:
# how many text chunks to retrieve per query
top_k = 3

# retriever that pulls the top_k most similar chunks out of our vector index
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [None]:
# post-processing step: discard retrieved chunks whose similarity score is below 0.5
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.5)

# query engine = retriever (fetches the chunks) + post-processors (filter them)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)

### Retrieve Relevant Docs

In [None]:
# query documents
# Runs the question through the query engine; the retrieved chunks are read back
# from `response.source_nodes` when the context string is assembled.
query = "What is adversarial architecture?"
response = query_engine.query(query)

In [None]:
# reformat response: join the text of every retrieved chunk under a "Context:" header.
# Iterate over the nodes that were actually returned rather than range(top_k): the
# similarity cutoff can leave fewer than top_k nodes, and indexing source_nodes by
# position would then raise an IndexError.
context = "Context:\n" + "".join(node.text + "\n\n" for node in response.source_nodes)

print(context)

### Import LLM

In [None]:
# load fine-tuned model from hub
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
# PEFT adapter repository applied on top of the base model.
# NOTE(review): the prompts in this notebook address "TommyGPT" while the adapter is
# "shawgpt-ft" — confirm this is the intended adapter.
adapter_name = "shawhin/shawgpt-ft"

# base model checkpoint; device placement is left to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)

# fetch the adapter config, then wrap the base model with the adapter
config = PeftConfig.from_pretrained(adapter_name)
model = PeftModel.from_pretrained(model, adapter_name)

# tokenizer comes from the base model repository
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

### Use LLM

In [None]:
# prompt (no context)
# Persona / system instructions prepended to every comment.
# NOTE: the misspelled name `intstructions_string` is kept so any cell referring to it keeps working.
intstructions_string = (
    "TommyGPT, functioning as a virtual masters student, communicates in clear, accessible language, escalating to technical depth upon request. "
    "It reacts to feedback aptly and ends responses with its signature 'TommyGPT'. "
    "TommyGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, "
    "thus keeping the interaction natural and engaging.\n"
    "\n"
    "Please respond to the following comment.\n"
)


def prompt_template(comment):
    """Wrap a comment in the [INST] ... [/INST] instruction block.

    Args:
        comment: The viewer comment / question the model should answer.

    Returns:
        The full prompt string: the persona instructions followed by the comment,
        enclosed in [INST] and [/INST] tags.
    """
    return f"[INST] {intstructions_string} \n{comment} \n[/INST]"

In [None]:
# Example viewer comment, wrapped in the instruction template and printed for inspection.
comment = "What is adversarial architecture?"

prompt = prompt_template(comment)
print(prompt)

In [None]:
# Generate a reply to the prompt built without retrieved context.
model.eval()  # switch the model to evaluation mode before generating

# Move the whole encoding to the device the model lives on instead of hard-coding "cuda"
# (the model is loaded with device_map="auto"), and pass the attention mask along with
# the input ids rather than dropping it.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=280)

# decode the single generated sequence (prompt + completion) back to text
print(tokenizer.batch_decode(outputs)[0])

In [None]:
# prompt (with context)
def prompt_template_w_context(context, comment):
    """Build the [INST] ... [/INST] prompt with retrieved context placed before the comment.

    Args:
        context: Text of the retrieved chunks to show the model.
        comment: The viewer comment / question the model should answer.

    Returns:
        The full prompt string: persona instructions, the context, the request to
        respond (using the context if helpful), and the comment, closed by [/INST]
        and a trailing newline.
    """
    return (
        "[INST]TommyGPT, functioning as a virtual masters student, communicates in clear, accessible language, escalating to technical depth upon request. "
        "It reacts to feedback aptly and ends responses with its signature 'TommyGPT'. "
        "TommyGPT will tailor the length of its responses to match the viewer's comment, providing concise acknowledgments to brief expressions of gratitude or feedback, "
        "thus keeping the interaction natural and engaging.\n"
        "\n"
        f"{context}\n"
        "Please respond to the following comment. Use the context above if it is helpful.\n"
        "\n"
        f"{comment}\n"
        "[/INST]\n"
    )

In [None]:
# Generate a reply to the same comment, this time with the retrieved context in the prompt.
prompt = prompt_template_w_context(context, comment)

# Move the whole encoding to the device the model lives on instead of hard-coding "cuda"
# (the model is loaded with device_map="auto"), and pass the attention mask along with
# the input ids rather than dropping it.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=280)

# decode the single generated sequence (prompt + completion) back to text
print(tokenizer.batch_decode(outputs)[0])