# Retrieval Augmented Generation (RAG) from scratch: 
## A comparison between a RAG-powered chatbot and a normal chatbot

#### Start of Naive RAG process:

- Chunking
- Dense Embedding of text.
- Makeshift storing of embeddings in a dictionary (use this as a vector store python class)
- With same query, embed it.
- Show retrieval from makeshift vector store using cosine similarity between query vector and vectors in vector store
- Add context to the chatbot (tell it to refer to the context provided)
- Show the comparison between raw LLM response and RAG pipeline.

#### Additional/ Advanced steps to enhance your RAG pipeline:

- Query rewriting
- Re-ranking using a cross-encoder
- Dynamic embedding model fine-tuning
- Hybrid search using BM25/TF-IDF
- LLM guardrails/query intent

#### Some practical applications:

- RAG for your school notes (e.g. modules like HSI, where Ctrl+F (in this case, vector search) could help greatly)

In [1]:
!pip install sentence-transformers
!pip install groq

# numpy needed for vector operations
import numpy as np
# regex library needed for delimiter parsing during text splitting
import re
# sentence_transformers library needed for loading our embedding model
from sentence_transformers import SentenceTransformer
# groq library for our "chatgpt"
from groq import Groq
import time



In [2]:
from google.colab import userdata

# API key is read from the Colab user-data (secrets) store under 'GROQ_API_KEY'.
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Model identifiers: the chat model served through Groq and the embedding model
# loaded through sentence-transformers.
CHAT_MODEL = "llama3-70b-8192"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# System prompt for the RAG chatbot: answer only from the supplied context.
# (The constant's name keeps its original spelling because other cells refer to it.)
RAG_SYSTEM_PROPMT = (
    "You are a helpful, very cheerful and very bubbbly assistant who only "
    "answers based on the contextual information provided, and nothing else. "
    "If you are unsure, say that you are unsure due to a lack of information."
)

# System prompt for the plain chatbot, which gets no retrieved context.
CHAT_SYSTEM_PROMPT = (
    "You are a helpful assistant who answers question factually. "
    "If you are unsure, say that you are unsure due to a lack of information."
)

In [99]:
# Read in the contextual knowledge: the raw text that the RAG pipeline will
# later split into chunks and embed.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('story.txt', 'r', encoding='utf-8') as file:
    contextual_knowledge = file.read()

In [100]:
class VectorStore:
    '''In-memory vector store pairing text chunks with their embedding vectors.

    vectors[i] is the embedding of texts[i]; both lists grow together
    through upsert().
    '''
    def __init__(self):
        self.vectors = []
        self.texts = []

    def upsert(self, embedding, text):
        '''Append a batch of embeddings and their matching texts to the store.

        embedding: iterable of embedding vectors (e.g. a 2D array, one row per text).
        text: iterable of strings, in the same order as the embeddings.

        Note: entries are only ever appended; nothing already stored is
        updated or de-duplicated.
        '''
        self.vectors.extend(embedding)
        self.texts.extend(text)

    def pretty_print_contexts(self, top_k_results):
        '''Print the retrieved texts and their similarity scores as a table.'''
        print(f"{'Text':<50} {'Cosine Similarity Score':<25}")
        print("="*95)

        # loop over each result and print it formatted
        for result in top_k_results:
            text = result["Text"]
            similarity_score = f"{result['Cosine Similarity Score']:.4f}"  # Format score to 4 decimal places

            # Print each result with proper formatting
            print(f"{text:<50} {similarity_score:<25}")

    def retrieve(self, query_embedding, print_contexts, top_k=3):
        '''Return the top_k stored entries most similar to query_embedding.

        query_embedding: 1D vector with the same dimension as the stored vectors.
        print_contexts: when truthy, also print the results as a table.
        top_k: maximum number of results to return.

        Returns a list of dicts with the keys "Vector", "Text" and
        "Cosine Similarity Score", ordered from most to least similar.
        The list is empty when the store is empty or top_k is not positive.
        '''
        # Nothing to rank. top_k <= 0 is handled here as well, because a
        # slice of [-0:] would otherwise return every entry.
        if len(self.vectors) == 0 or top_k <= 0:
            return []

        vectors = np.asarray(self.vectors)
        query = np.asarray(query_embedding)

        # Dot product between every stored vector and the query
        dot_products = np.dot(vectors, query)

        # Product of the Euclidean norms (lengths) of each stored vector and the query
        denominators = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query)

        # Cosine similarity. A zero-length vector has a dot product of 0, so
        # dividing by 1 there gives a similarity of 0 instead of NaN (which
        # argsort would rank as the best match).
        cosine_similarities = dot_products / np.where(denominators > 0, denominators, 1.0)

        # argsort is ascending, so the last top_k indices are the most similar
        top_k_similar_indices = np.argsort(cosine_similarities)[-top_k:]

        # Use the retrieved index for the vector, the text AND the score so
        # that all three describe the same stored entry.
        top_k_results = [
            {"Vector": self.vectors[index],
             "Text": self.texts[index],
             "Cosine Similarity Score": cosine_similarities[index]}
            for index in top_k_similar_indices
        ]
        top_k_results = sorted(top_k_results, reverse=True, key=lambda x: x['Cosine Similarity Score'])

        if print_contexts:
            self.pretty_print_contexts(top_k_results)

        return top_k_results


    def filter_similar_texts(self, retrieval_results):
        '''Join the "Text" of every retrieval result, each followed by a blank line.'''
        return "".join(result["Text"] + "\n\n" for result in retrieval_results)


class TextSplitter():
    '''Splits raw text into clean, non-empty chunks on a delimiter string.'''
    def __init__(self, delimiters=None):
        # Use a blank line ("\n\n", i.e. paragraph breaks) when the user does
        # not specify a delimiter
        self.delimiters = "\n\n" if delimiters is None else delimiters

    def split_text(self, text):
        '''Split text on the configured delimiter and return the cleaned chunks.

        Each chunk has its leading and trailing whitespace removed; chunks that
        are empty after stripping are dropped. Returns a list of strings.
        '''
        # Strip first, then filter, so that the whitespace-free version is
        # what actually gets returned (and later embedded).
        stripped_chunks = (chunk.strip() for chunk in text.split(self.delimiters))
        return [chunk for chunk in stripped_chunks if chunk]


class EmbeddingModel():
    '''Wrapper around the SentenceTransformer model that embeds text for the RAG pipeline.'''
    def __init__(self, model_name):
        # Not written to anywhere in this class; kept as a public attribute.
        self.embeddings = []
        # Load the sentence-transformers model identified by model_name
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts: list):
        '''Encode every text in a single call to the model and return its output.'''
        # The whole list is handed to encode() at once, so no explicit loop is needed
        return self.model.encode(texts)



class chatbot():
    '''Console chatbot backed by the Groq chat-completions API.

    Relies on names defined at module level in this notebook: GROQ_API_KEY,
    CHAT_MODEL and the two system prompts, plus (for rag_pipeline only)
    text_splitter, embedding_model and vector_store.
    '''
    def __init__(self):
        self.client = Groq(api_key = GROQ_API_KEY)

    def build_prompt(self, user_query, context):
        '''Return the user prompt that embeds the retrieved context ahead of the question.'''
        prompt = f"Using the context provided, answer the question.\n\nContext:{context}\n\nQuestion:{user_query}"
        return prompt

    def fetch_response(self, prompt, system_prompt):
        '''Send one system + user message pair to the model and stream the reply to stdout.

        prompt: content of the user message.
        system_prompt: content of the system message.

        Returns nothing; the reply is printed as it arrives.
        '''
        stream = self.client.chat.completions.create(
            model=CHAT_MODEL,
            messages=[
                {
                    # initialise system prompt
                    "role": "system",
                    "content": system_prompt
                },
                {
                    # initialise users prompt
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0,  # Control the randomness of the output (lower means less random)
            max_tokens=1024,  # Limit the response length
            top_p=1,  # Nucleus sampling parameter (1 keeps the full token distribution, i.e. no truncation)
            stream=True,  # Enable streaming of the response chunks
            stop=None
        )

        print("ChatGPT: ", end="")
        for chunk in stream:
            # A streamed chunk may carry no text (delta.content is None, e.g.
            # on the closing chunk); skip it rather than printing "None".
            content = chunk.choices[0].delta.content
            if content:
                print(content, end="")
        # Finish the line so the next "User: " prompt starts on a fresh one
        print()


    def chat(self):
        '''Run a plain chat loop (no retrieval) until the user types 'exit' or 'quit'.'''
        print("Welcome to ChatGPT! Feel free to ask me anything!")
        time.sleep(1)

        while True:
            query = input('User: ')

            if query.lower() in ['exit', 'quit']:
                print("Thank you for chatting!\n")
                break
            # Use this instance rather than a module-level variable, so the
            # method works whatever name the chatbot object is bound to
            self.fetch_response(query, CHAT_SYSTEM_PROMPT)


    def rag_pipeline(self, contextual_knowledge):
        '''Index contextual_knowledge, then run a retrieval-augmented chat loop.

        contextual_knowledge: the raw text to split, embed and store.

        The loop ends when the user types 'exit' or 'quit'.
        NOTE: every call upserts the chunks into the module-level vector_store
        again, so calling this twice on the same store stores the chunks twice.
        '''
        # split the text according to delimiters
        text_chunks = text_splitter.split_text(contextual_knowledge)
        # generate embeddings of desired text
        text_embeddings = embedding_model.generate_embeddings(text_chunks)
        # upsert the vector embeddings and text chunks to the vector store
        vector_store.upsert(text_embeddings, text_chunks)

        print("Welcome to the SDS Story chatbot! Feel free to ask me anything about the SDS workshop members!")
        time.sleep(1)
        while True:
            # get users input
            query = input('User: ')

            if query.lower() in ['exit', 'quit']:
                print("Thank you for chatting!\n")
                break

            # generate the vector embedding of the users query
            query_embedding = embedding_model.generate_embeddings([query])[0]
            # conduct the similarity search within the vector store
            retrieved_results = vector_store.retrieve(query_embedding, print_contexts=False)
            # filter out the similar texts from the retrieved results
            contexts_to_llm = vector_store.filter_similar_texts(retrieved_results)
            # build the prompt to include the context retrieved
            prompt = self.build_prompt(query, contexts_to_llm)
            self.fetch_response(prompt, RAG_SYSTEM_PROPMT)

In [101]:
# Shared components, created once per session. rag_pipeline looks up
# vector_store, text_splitter and embedding_model as module-level names.
vector_store = VectorStore()
text_splitter = TextSplitter()
embedding_model = EmbeddingModel(model_name=EMBEDDING_MODEL)
chatgpt = chatbot()

### Whether the RAG pipeline or the normal chat function runs depends on the user's needs and query!

In [None]:
# Index the loaded text and start the interactive RAG chat loop
# (type 'exit' or 'quit' to stop).
chatgpt.rag_pipeline(contextual_knowledge)

In [None]:
# Start the plain chat loop without retrieval, for comparison with the RAG
# pipeline (type 'exit' or 'quit' to stop).
chatgpt.chat()