<a href="https://colab.research.google.com/github/Shanvithegreat0/Localbuddy/blob/main/RAG_with_Gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### <a id='toc1_1_1_'></a>[Importing libraries](#toc0_)

In [None]:
!pip install google-generativeai==0.3.2
!pip install chromadb
!pip install pandas
!pip install PyPDF2
!pip install python-dotenv



In [None]:
import os
from dotenv import load_dotenv
from pprint import pprint

import pandas as pd

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

import google.generativeai as genai

from IPython.display import Markdown

In [None]:
genai.__version__

'0.3.2'

In [None]:
# Pull settings from a local .env file (presumably a no-op when none exists — verify).
load_dotenv()

# Read the Gemini key from the environment instead of hard-coding it in the notebook.
# NOTE(review): os.getenv returns None when GEMINI_API_KEY is unset, so a missing
# key is passed straight to genai.configure — confirm the variable is defined.
api_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=api_key)

In [None]:
# Print the name of every listed model whose supported generation methods
# include 'embedContent', i.e. the models usable for embeddings.
for m in genai.list_models():
    if 'embedContent' in m.supported_generation_methods:
        print(m.name)

models/embedding-001
models/text-embedding-004


In [None]:
import json

# Load the question/answer records. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default text encoding.
with open('/content/output.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
pprint(data[0])

{'input': 'Can you List all the Restraunts in Jankipuram?',
 'output': 'New Lucknow Kitchen, Kalika Dhaba, Jamghat Restaurant, Bajpai '
           'Caters and Bhojnalaya, The Tarrace Restaurant, The Kitchen, Radhey '
           'Shyam Restaurant, Momo Magic Cafe Lucknow, Zayka Veg Corner'}


We take each record and convert it into a single string that concatenates the values of its two keys (`input` and `output`).

In [None]:
# Turn every record into one text document of the form
# "Input : <input>\nOutput : <output>", leaving out whichever field is empty.
documents = []

for item in data:
    question, reply = item['input'], item['output']

    pieces = []
    if question != '':
        pieces.append(f"Input : {question}\n")
    if reply != '':
        pieces.append(f"Output : {reply}")

    documents.append("".join(pieces))

len(documents)

157

In [None]:
pprint(documents[20])

('Input : What facilities are available at Shubham Hospital?\n'
 'Output : All test facilities.')


### <a id='toc1_1_4_'></a>[The embedding database](#toc0_)

In [None]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the ``"embedding"`` entry of the response.

        Every call uses the same model, task type and title.

        NOTE(review): the title refers to an environmental management system,
        while the corpus loaded in this notebook is local-business Q&A —
        confirm whether the title should be changed.
        NOTE(review): ``task_type`` is always "retrieval_document"; presumably
        this function is also invoked for ``db.query(query_texts=...)`` —
        verify whether queries should use a different task type.
        """
        # for better results, try to provide a title for each input if the corpus is covering a lot of domains
        request = dict(
            model='models/embedding-001',
            content=input,
            task_type="retrieval_document",
            title="Systeme de management de l'environnement",
        )
        response = genai.embed_content(**request)
        return response["embedding"]

In [None]:
import time
from tqdm import tqdm

In [None]:
def create_chroma_db(documents, name, path="../database/", delay=0.5):
    """Embed ``documents`` into a persistent Chroma collection and return it.

    Args:
        documents: Sequence of text documents; each one is added individually.
        name: Name of the collection to fetch or create.
        path: Directory used by the persistent Chroma client.
        delay: Seconds to pause after each insertion (throttles the embedding
            calls made by the collection's embedding function).

    Returns:
        The collection, after all documents have been added.

    Note:
        The collection is obtained with ``get_or_create_collection`` and new
        ids start at the current ``count()``, so calling this again with the
        same documents appends them a second time under new ids.
    """
    chroma_client = chromadb.PersistentClient(path=path)

    db = chroma_client.get_or_create_collection(
        name=name, embedding_function=GeminiEmbeddingFunction())

    # Offset ids by the existing size so they continue after stored entries.
    initial_size = db.count()
    for i, d in tqdm(enumerate(documents), total=len(documents), desc="Creating Chroma DB"):
        db.add(
            documents=d,
            ids=str(i + initial_size)
        )
        time.sleep(delay)
    return db


def get_chroma_db(name, path="../database/"):
    """Open an existing collection from the persistent Chroma store.

    Args:
        name: Name of the collection to open.
        path: Directory used by the persistent Chroma client.

    Returns:
        The collection, bound to a ``GeminiEmbeddingFunction`` instance.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    return chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

In [None]:
db = create_chroma_db(documents, "sme_db")
db.count()

Creating Chroma DB: 100%|██████████| 157/157 [05:57<00:00,  2.28s/it]


157

Let's see if the database contains anything

In [None]:
pd.DataFrame(db.peek(5))

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0,"[0.03514762967824936, -0.013021623715758324, -...",,Input : Can you List all the Restraunts in Jan...,,
1,1,"[0.07599864155054092, -0.04383125528693199, -0...",,Input : What is the specialty of Global Hospit...,,
2,10,"[0.04352160170674324, -0.03732515126466751, -0...",,Input : Who is personally known by the reviewe...,,
3,100,"[0.067727715, -0.03299514, -0.05731334, -0.012...",,Input : Where can one find the best lassi and ...,,
4,101,"[0.08850904, -0.05757119, -0.059816696, 0.0017...",,Input : Which restaurant is known for its affo...,,


The document is embedded into a vector with 768 dimensions

In [None]:
len(pd.DataFrame(db.peek(5)).iloc[0]["embeddings"])

768

In [None]:
def get_relevant_passages(query, db, n_results=5):
    """Return the documents ``db`` retrieves for a single ``query`` string.

    Args:
        query: The query text.
        db: Object exposing ``query(query_texts=..., n_results=...)``.
        n_results: Number of results requested from ``db``.

    Returns:
        The first element of the ``'documents'`` entry of the query result,
        i.e. the hits for the one query text that was sent.
    """
    result = db.query(query_texts=[query], n_results=n_results)
    hits_per_query = result['documents']
    # Exactly one query text was submitted, so take its hit list.
    return hits_per_query[0]

In [None]:
question = "Give a hospital in jankipuram"
passages = get_relevant_passages(question, db, n_results=2)

Markdown(passages[0])

Input : Which hospital is praised for its excellent facilities?
Output : Hope Hospital Jankipuram is praised for its excellent facilities.

### <a id='toc1_1_6_'></a>[Prompting the Gemini model](#toc0_)

In [None]:
def make_prompt(query, relevant_passage):
    """Build the prompt sent to the generative model.

    Single and double quotes are removed from ``relevant_passage`` before it is
    placed under the "Additional Information" heading. The prompt also tells
    the model to reply 'OUT OF CONTEXT' when the question is unrelated to that
    information.

    Args:
        query: The user's question.
        relevant_passage: Retrieved context as one string.

    Returns:
        The assembled prompt string.
    """
    quote_free = relevant_passage.translate(str.maketrans("", "", "'\""))

    # The pieces reproduce the layout of the prompt exactly, including the
    # blank lines and the four-space indentation of each section.
    segments = (
        f"question : {query}.\n",
        "\n",
        f"    Additional Information:\n {quote_free}\n",
        "\n",
        "    If you find the question unrelated to the additional information, you can ignore it and respond with 'OUT OF CONTEXT'.\n",
        "\n",
        "    Your answer :\n",
        "    ",
    )
    return "".join(segments)


In [None]:
def convert_pasages_to_list(passages):
    """Concatenate ``passages`` into one string, each followed by a newline.

    Despite the name, the result is a single string, not a list. An empty
    input yields an empty string.
    """
    return "".join(passage + "\n" for passage in passages)

In [None]:
prompt = make_prompt(question, convert_pasages_to_list(passages))
Markdown(prompt)

question : Give a hospital in jankipuram.

    Additional Information:
 Input : Which hospital is praised for its excellent facilities?
Output : Hope Hospital Jankipuram is praised for its excellent facilities.
Input : What is praised about Hope Hospital Jankipuram?
Output : The hospital is praised for its nice hospitality and experienced doctors.


    If you find the question unrelated to the additional information, you can ignore it and respond with 'OUT OF CONTEXT'.

    Your answer :
    

In [None]:
model = genai.GenerativeModel('gemini-pro')

#### <a id='toc1_1_7_2_'></a>[Prompting the model](#toc0_)

In [None]:
answer = model.generate_content(prompt)
Markdown(answer.text)

OUT OF CONTEXT

### <a id='toc1_1_8_'></a>[The pipeline](#toc0_)

In [None]:
# Step 1: choose the user question.
# Alternative question kept for testing (French for "Give me the number of
# planets in the solar system"):
# question = "Donne-moi le nombre de planetes dans le systeme solaire"
question = "top-rated pediatrician"

# Step 2: reopen the persisted collection and retrieve the two closest passages.
db = get_chroma_db("sme_db")
passages = get_relevant_passages(question, db, n_results=2)

# Step 3: merge the retrieved passages into one context string.
context = convert_pasages_to_list(passages)

# Step 4: build the prompt from the question and the context.
prompt = make_prompt(question, context)

# Step 5: ask the generative model for an answer.
model = genai.GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)

# Step 6: render the answer text as Markdown.
Markdown(answer.text)

Though not named specifically, Medini Hospital is known for its experienced pediatricians.