# Making a RAG System from scratch

### Doing each step squentially for better understanding

In [1]:
user_query="i am an indian and i live in india"
document = "India is a country for the indians and for eeveryone else"

In [2]:
from collections import Counter
import math

In [3]:
query_token = user_query.lower().split(" ")
document_token = document.lower().split(" ")

In [4]:
query_token

['i', 'am', 'an', 'indian', 'and', 'i', 'live', 'in', 'india']

In [5]:
document_token

['india',
 'is',
 'a',
 'country',
 'for',
 'the',
 'indians',
 'and',
 'for',
 'eeveryone',
 'else']

In [6]:
query_counter = Counter(query_token)
document_counter = Counter(document_token)

In [7]:
query_counter

Counter({'i': 2,
         'am': 1,
         'an': 1,
         'indian': 1,
         'and': 1,
         'live': 1,
         'in': 1,
         'india': 1})

In [8]:
document_counter

Counter({'for': 2,
         'india': 1,
         'is': 1,
         'a': 1,
         'country': 1,
         'the': 1,
         'indians': 1,
         'and': 1,
         'eeveryone': 1,
         'else': 1})

In [9]:
lst = []
for token in query_counter.keys():
    lst.append(query_counter[token])

In [10]:
# Sentence Vector
lst

[2, 1, 1, 1, 1, 1, 1, 1]

In [11]:
# Common tokens
for tokens in query_counter.keys() & document_counter.keys():
    print(tokens)

and
india


In [12]:
mylist=[]
for tokens in query_counter.keys() & document_counter.keys():
    mylist.append(query_counter[tokens]*document_counter[tokens])

In [13]:
mylist

[1, 1]

In [14]:
dot_prod = sum(mylist)

In [15]:
query_magnitude = math.sqrt(sum(query_counter[token]**2 for token in query_counter))

In [16]:
query_magnitude

3.3166247903554

In [18]:
document_magnitude = math.sqrt(sum(document_counter[token] ** 2 for token in document_counter))

In [19]:
document_magnitude

3.605551275463989

In [20]:
similarity=(dot_prod)/(query_magnitude*document_magnitude)

In [21]:
similarity

0.16724840200141816

### RAG Implementation function

Cosine Similarity

In [22]:
def cosine_similarity(query, document):
    # Tokenize and convert to lowercase
    query_tokens = query.lower().split(" ")
    document_tokens = document.lower().split(" ")

    # Create Counters for query and document
    query_counter = Counter(query_tokens)
    document_counter = Counter(document_tokens)

    # Calculate dot product
    dot_product = sum(query_counter[token] * document_counter[token] for token in query_counter.keys() & document_counter.keys())

    # Calculate magnitudes
    query_magnitude = math.sqrt(sum(query_counter[token] ** 2 for token in query_counter))
    document_magnitude = math.sqrt(sum(document_counter[token] ** 2 for token in document_counter))

    # Calculate cosine similarity
    similarity = dot_product / (query_magnitude * document_magnitude) if query_magnitude * document_magnitude != 0 else 0

    return similarity

In [23]:
user_query="is yoga good for health"
document="yoga is very good for living healthy lifesytle."

In [24]:
cosine_similarity(user_query,document)

0.6324555320336759

### Using corpus of documents to implement RAG and retrieving the most relevant document for the given query

In [25]:
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]

In [26]:
def return_response(query, corpus):
    similarities = []
    for doc in corpus:
        similarity = cosine_similarity(query, doc)
        similarities.append(similarity)
    return corpus[similarities.index(max(similarities))]

In [None]:
# Example 1
user_input="i like fresh air."

In [29]:
relevant_document=return_response(user_input,corpus_of_documents)

In [30]:
relevant_document

'Take a leisurely walk in the park and enjoy the fresh air.'

In [31]:
# Example 2
user_input="i like to do yoga"

In [32]:
relevant_document=return_response(user_input,corpus_of_documents)

In [33]:
relevant_document

'Take a yoga class and stretch your body and mind.'

### Using local llm with the help of Ollama-llama2

In [37]:
import requests
import json
full_response = []

In [41]:
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

url = 'http://localhost:11434/api/generate'

data = {
    "model": "llama2",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}

headers = {
    "Content-Type": "application/json"
}

response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)

try:
    for line in response.iter_lines():
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])
            full_response.append(decoded_line['response'])

finally:
    response.close()

In [42]:
print(''.join(full_response))

 Great! Here's my recommendation based on your input:

"Perfect! You'll enjoy our restorative yoga class on Saturday at 10am. Sign up now!" Great! Based on your input, I recommend taking a yoga class to help you relax and stretch your body and mind. It's a great way to improve flexibility, balance, and overall well-being. Try it out and see how you like it! Great! Based on your interest in yoga, I recommend trying out a new yoga studio in your area. Not only will you get to stretch your body, but also improve your mental well-being.
