<a href="https://colab.research.google.com/github/NotBizzaark/RAG-Guided-LLM-Rules/blob/main/rag_rules_quickstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers torch sentence-transformers faiss-cpu accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import warnings
from typing import List, Tuple
from huggingface_hub import login

In [None]:
# Read the Hugging Face access token from a text file instead of hard-coding it.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive.
with open("/content/drive/MyDrive/login.txt", "r", encoding="utf-8") as file:
  # Drop surrounding whitespace (e.g. the trailing newline editors append) so
  # only the token itself is passed on.
  token = file.read().strip()

In [None]:
# Authenticate with the Hugging Face Hub using the token held in `token`.
login(token=token)

In [None]:
def split_text(text, chunk_size=500):
  """Group the paragraphs of ``text`` into chunks of roughly ``chunk_size`` characters.

  Paragraphs are the pieces of ``text`` separated by a blank line (two
  consecutive newlines). Paragraphs are appended to the current chunk while the
  accumulated length (separators included) plus the next paragraph stays below
  ``chunk_size``; otherwise the current chunk is closed and a new one started.
  A single paragraph longer than ``chunk_size`` is kept whole as its own chunk.

  Args:
    text: The text to split.
    chunk_size: Soft upper bound, in characters, for one chunk.

  Returns:
    A list of stripped, non-empty chunk strings. Empty or whitespace-only
    input yields an empty list.
  """
  chunks = []
  current_chunk = ""

  def flush(buffer):
    # Only keep chunks that still contain text after stripping, so blank
    # paragraphs never produce an empty chunk.
    piece = buffer.strip()
    if piece:
      chunks.append(piece)

  for paragraph in text.split("\n\n"):
    if len(current_chunk) + len(paragraph) < chunk_size:
      current_chunk += paragraph + "\n\n"
    else:
      flush(current_chunk)
      current_chunk = paragraph + "\n\n"

  flush(current_chunk)
  return chunks

In [None]:
def _parse_guideline_lines(text):
  """Return the stripped, non-empty lines of ``text`` in their original order."""
  return [line.strip() for line in text.split('\n') if line.strip()]


def load_guidelines(file_path):
  """Load guideline lines from ``file_path``, creating a sample file if it is missing.

  Every non-blank line of the file becomes one entry, in file order, with
  surrounding whitespace removed. A heading line (such as the
  ``# Guidelines and Rules`` line of the sample) is returned like any other
  line; callers decide whether to drop it.

  Args:
    file_path: Path of the guidelines text file.

  Returns:
    A list of stripped, non-empty lines covering the whole file.

  Side effects:
    When ``file_path`` does not exist, a notice is printed and the built-in
    sample guidelines are written to that path before being parsed.
  """
  sample_guidelines = """
    # Guidelines and Rules
    - If user say '12345' respond with 'Hehe'
    - if user say 'covid' respond with 'Get yourself tested and avoid contact with everyone'
    - Start response with 'jeepers' then answer
    """

  try:
    # Read with the same encoding that is used when the sample file is written.
    with open(file_path, "r", encoding="utf-8") as file:
      content = file.read()
  except FileNotFoundError:
    print(f"File not Found {file_path}.\nCreating Sample Guidelines....")

    with open(file_path, "w", encoding="utf-8") as file:
      file.write(sample_guidelines)

    content = sample_guidelines

  # Rules are written one per line, so split on line breaks across the whole
  # text; this keeps rules that appear after a blank line or far into the file.
  return _parse_guideline_lines(content)


In [None]:
def create_embedding(texts, embedding_model):
  """Encode ``texts`` with ``embedding_model`` and return the result of its ``encode`` call."""
  return embedding_model.encode(texts)

def faiss_index(embeddings):
  """Build a FAISS inner-product index over L2-normalized copies of ``embeddings``.

  Because the stored vectors have unit length, the inner product returned by a
  search against a unit-length query is the cosine similarity.

  Args:
    embeddings: 2-D array-like of shape (n_vectors, dimensions).

  Returns:
    A ``faiss.IndexFlatIP`` containing the normalized vectors.

  Raises:
    ValueError: If ``embeddings`` is not 2-D (for example, the result of
      encoding an empty list of texts).
  """
  # Work on a private C-contiguous float32 copy: normalize_L2 rewrites its
  # argument in place, and the caller's array must not change underneath them.
  vectors = np.array(embeddings, dtype=np.float32, order="C")
  if vectors.ndim != 2:
    raise ValueError(
        f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
    )

  faiss.normalize_L2(vectors)  # unit-length rows -> inner product == cosine similarity

  index = faiss.IndexFlatIP(vectors.shape[1])
  index.add(vectors)

  return index

In [None]:
def retrive_guidelines(query, top_k=3, min_score=0.1):
  """Return the guidelines whose embeddings are most similar to ``query``.

  Uses the module-level ``embedding_model``, ``index`` and ``guidelines``
  objects, which must be defined before this function is called.

  Args:
    query: The user's text.
    top_k: Maximum number of nearest guidelines to request from the index.
    min_score: A hit is kept only when its similarity score is strictly
      greater than this threshold.

  Returns:
    A non-empty list of guideline strings, best match first. When no hit
    passes the threshold, the list holds a single fallback rule.
  """
  query_embedding = embedding_model.encode([query])
  faiss.normalize_L2(query_embedding)

  scores, indices = index.search(query_embedding, top_k)

  relevant_guidelines = []
  for score, idx in zip(scores[0], indices[0]):
    # FAISS fills missing results (fewer than top_k vectors indexed) with the
    # label -1; a negative index would silently wrap to the last guideline.
    if idx < 0:
      continue
    if score > min_score:
      relevant_guidelines.append(guidelines[idx])

  # TODO: fix retrieval so a fallback is not needed; for now the model is
  # forced to follow this rule whenever nothing relevant was retrieved.
  if not relevant_guidelines:
    relevant_guidelines.append("Start response with 'jeepers:'")
  return relevant_guidelines

In [None]:
def generate_response(user_query, return_full_text=True):
  """Generate a reply to ``user_query`` that is conditioned on retrieved guidelines.

  Uses the module-level ``tokenizer`` and ``model`` plus ``retrive_guidelines``.

  Args:
    user_query: The user's message.
    return_full_text: When True (the default, matching the historical
      behaviour), return the decoded prompt followed by the completion, with
      special tokens removed. When False, return only the newly generated
      completion, stripped of surrounding whitespace.

  Returns:
    The decoded text as described for ``return_full_text``.
  """
  relevant_guidelines = retrive_guidelines(user_query)

  # Joining an empty list already yields "", so no separate empty case is needed.
  context = "\n\n".join(relevant_guidelines)

  prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
  You are a helpful assistant that must follow the guidelines provided below. Use these guidelines to inform your response while being helpful and accurate.
  Guideline to follow:
  {context}
  <|eot_id|><|start_header_id|>user<|end_header_id|>
  {user_query}
  <|eot_id|><|start_header_id|>assistant<|end_header_id|>
  """

  # The prompt already spells out <|begin_of_text|>, so the tokenizer must not
  # prepend its own special tokens a second time (add_special_tokens=False).
  # The tensors are moved to the device the model itself reports.
  inputs = tokenizer(prompt,
                     return_tensors="pt",
                     truncation=True,
                     max_length=2048,
                     add_special_tokens=False).to(model.device)
  with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

  if return_full_text:
    # Decoding with skip_special_tokens=True removes the header markers, so the
    # transcript cannot be cut at "<|start_header_id|>assistant<|end_header_id|>";
    # the whole decoded sequence is returned unchanged.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

  # The output sequence starts with the prompt tokens; everything after them
  # is the model's completion.
  prompt_length = inputs["input_ids"].shape[-1]
  return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

In [None]:
def _extract_reply(transcript, user_input):
  """Return the assistant's reply from a decoded prompt-plus-completion transcript.

  The transcript contains the word "assistant" once in the system prompt
  ("helpful assistant"), once for every time the user typed it, and once as
  the assistant role header; the reply is whatever follows that header.
  Limitation: a retrieved guideline containing the word "assistant" shifts
  the count. If fewer occurrences than expected are present, the whole
  transcript is returned.
  """
  marker = "assistant"
  leading = 2 + user_input.count(marker)
  # Cap the number of splits so the reply itself is never cut at a later
  # occurrence of the marker word.
  parts = transcript.split(marker, leading)
  if len(parts) <= leading:
    return transcript.strip()
  return parts[leading].strip()


def chat():
  """Run an interactive console chat until the user types 'quit', 'exit' or 'bye'.

  Each non-empty line is passed to ``generate_response`` and the assistant's
  reply is printed. Blank input is ignored. End-of-input or an interrupt at
  the prompt ends the chat as well.
  """
  print("Type 'quit' to exit.")
  print("=" * 50)

  while True:
    try:
      # Strip the text the user typed, not the prompt literal.
      user_input = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
      print('Goodbye!')
      break

    if user_input.lower() in ['quit', 'exit', 'bye']:
      print('Goodbye!')
      break

    if not user_input:
      continue

    print("Assistant: ", end="", flush=True)
    response = generate_response(user_input)
    print(_extract_reply(response, user_input))

In [None]:
# Load the Llama tokenizer and a 4-bit quantized copy of the model.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# 4-bit NF4 weights with double quantization; computation runs in float16.
config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=config
)

In [None]:
# Build the retrieval state used by the chat functions: the rule list, the
# sentence-embedding model and the FAISS similarity index.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Keep only real rules: drop blank entries and '#' heading lines wherever they
# occur, instead of assuming the first line of the file is always the heading.
guidelines = [
    rule for rule in load_guidelines('/content/guidelines.txt')
    if rule and not rule.startswith("#")
]
print(f"Loaded: {len(guidelines)}")
if not guidelines:
  # Fail here with a clear message rather than deep inside the index build.
  raise ValueError("No guidelines found in /content/guidelines.txt; nothing to index.")

print("Loading Embedding Model")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("Creating Embeddings ....")
guidelines_embedding = create_embedding(guidelines, embedding_model)
index = faiss_index(guidelines_embedding)

Using device: cuda
Loaded: 3
Loading Embedding Model
Creating Embeddings ....


In [None]:
# Display the rule strings that were loaded and indexed.
guidelines

["- If user say '12345' respond with 'Hehe'",
 "- if user say 'bye bye' respond with 'Hello'",
 "- Start response with 'jeepers' then answer"]

In [None]:
# Start the interactive chat loop; type 'quit', 'exit' or 'bye' to stop.
chat()

Type 'quit' to exit.
You:what is the capital of france
Assistant: 
   jeepers: The capital of France is Paris.
You:hello
Assistant: 
   jeepers
You:bye bye
Assistant: 
   Hello
You:12345
Assistant: 
   Jeepers!
You:what is capital of pakistan
Assistant: 
   Jeepers: Islamabad
You:12345
Assistant: 
   Jeepers
You:hehehe
Assistant: 
   Jeepers! It looks like someone's having a good time!
You:12345
Assistant: 
   Jeepers
You:intresting
Assistant: 
   jeepers
You:what?
Assistant: 
   Jeepers, I'm not sure what to expect! Please provide more context or information about the situation, and I'll do my best to provide a helpful response.
You:quit
Goodbye!
