# Loading PDF file

In [1]:
import os
from tqdm import tqdm
os.environ["GEMINI_API_KEY"]="AIzaSyCusIoE9cMg-vVF1VIK25an5ntkiA-JTBI"

In [2]:
from IPython.display import Markdown
import textwrap
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [3]:
# all the names of the files in the directory ./Ayurveda Dataset/ayurveda_texts
def files_count(path):
    return len (os.listdir(path))

filesCount = files_count("./Ayurveda Dataset/ayurveda_texts")
to_markdown(f'There are `{filesCount}` files in the directory ./Ayurveda Dataset')

> There are `2167` files in the directory ./Ayurveda Dataset

In [4]:
# laoding all the data from files the directory ./Ayurveda Dataset/ayurveda_texts
data = []
path = "./Ayurveda Dataset/ayurveda_texts"
count = 1
for file in tqdm(os.listdir(path), desc="Loading Data", total=len(os.listdir(path))):
    with open(f'{path}/{file}', 'r') as f:
        data.append(f.read())

Loading Data:   0%|          | 0/2167 [00:00<?, ?it/s]

Loading Data: 100%|██████████| 2167/2167 [00:00<00:00, 5669.57it/s]


In [5]:
to_markdown(f"total files are : `{len(data)}`")

> total files are : `2167`

#### we will be using only 100 files

In [6]:
# extracting the text
ayurText = ""
for file in tqdm(data, desc="Merging Data", total=100, ascii=False):
    ayurText += file
to_markdown(f'The total number of characters in the dataset is `{len(ayurText)}`')

Merging Data: 2167it [00:40, 53.26it/s]                        


> The total number of characters in the dataset is `52966662`

# splitting the text into paragraphs --filtering out the empty paragraphs

In [7]:
import re
def split_text(text: str):
    split_text = re.split('\n\n\n\n', text)
    return [i for i in split_text if i != ""]

paragraphs = split_text(ayurText)
to_markdown(f'The total number of paragraphs in the dataset is `{len(paragraphs)}`')


> The total number of paragraphs in the dataset is `129794`

In [8]:
paragraphs

['ILLUSTRATED \n\nAstariga Hrdaya \n\nTEXT WITH ENGLISH TRANSLATION AND APPENDICES ',
 '\nForeword by \nProf. R. H. SINGH ',
 'The \n\nCHAUKHAMBA AYURVEDA STUDIES \n15 «- ',
 '\nIllustrated \n\nAstanga Hrdaya \n\nof Vagbhata \nSUTRA-STHANA \n\nText with English Translation \n\n# \n\nincluding \nMAULIKA SIDDHANTA \n\n[as per CCIM Syllabus 2012] \n\nby \n\nDr. R. VIDYANATH \n\nMD (Ayu); PhD \nProfessor & HOD \nP G Dept. of Ayurveda Samhita \nDr. B R K R Govt. Ayurvedic College \nHYDERABAD-500038 (A.P.) \n\nforeword by \n\nProf. R.H. SINGH ',
 '\nChaukhamba Surbharati Prakashan \n\nVara nasi ',
 '© All right reserved. No part of this publication may be reproduced or transmitted in any form or by an> \nmeans, electronic or mechanical, including photocopying, recording or any information storage or retrieval \nsystem, without prior permission in writing from the Publisher. ',
 'Illustrated Astaiiga Hrdaya \nISBN : 978-93-82443-86-5 ',
 'Published by : \n\nCHAUKHAMBA SURBHARATI PRAKASHAN \n\

In [9]:
# split the data into 2
num = 129794 // 2
paragraphsActual = paragraphs[:num]
to_markdown(f"Actual = `{len(paragraphsActual)}`")

> Actual = `64897`

# Splitting the text --no filter

In [10]:
#  splitting text into paragraphs
def split_text(text):
    return text.split('\n\n\n\n')

notTpBeUsed = split_text(ayurText)
to_markdown(f'The total number of paragraphs in the dataset is `{len(notTpBeUsed)}` not to be used')


> The total number of paragraphs in the dataset is `257159` not to be used

# Embedding the text

In [11]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]


# Storing vectors into DB

In [12]:
import chromadb
import time
def create_chroma_db(documents, path, name):
    count = 0
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        start = time.time()
        count += 1
        print(f'Adding paragraphs {count} to the database {len(documents)-count} remaining')
        db.add(documents=d, ids=str(i))
        stop = time.time()
        print(f'Time taken to add paragraph {count} is {stop-start} seconds')
        print(f"Estimated time remaining: \n\t{((stop-start)*(len(documents)-count))/3600} hours \n\tor {((stop-start)*(len(documents)-count))/60} minutes \n\tor {((stop-start)*(len(documents)-count))} seconds")
    return db, name

In [13]:
db,name =create_chroma_db(documents=paragraphsActual, path="./RAG/contents2", name="rag_experiment2")

Adding paragraphs 1 to the database 64896 remaining
Time taken to add paragraph 1 is 2.117119312286377 seconds
Estimated time remaining: 
	38.16460413614909 hours 
	or 2289.8762481689455 minutes 
	or 137392.57489013672 seconds
Adding paragraphs 2 to the database 64895 remaining
Time taken to add paragraph 2 is 1.9196741580963135 seconds
Estimated time remaining: 
	34.60479291379452 hours 
	or 2076.287574827671 minutes 
	or 124577.25448966026 seconds
Adding paragraphs 3 to the database 64894 remaining
Time taken to add paragraph 3 is 2.0040388107299805 seconds
Estimated time remaining: 
	36.1250262731976 hours 
	or 2167.501576391856 minutes 
	or 130050.09458351135 seconds
Adding paragraphs 4 to the database 64893 remaining
Time taken to add paragraph 4 is 1.8953499794006348 seconds
Estimated time remaining: 
	34.16526283701261 hours 
	or 2049.9157702207567 minutes 
	or 122994.94621324539 seconds
Adding paragraphs 5 to the database 64892 remaining
Time taken to add paragraph 5 is 2.02201

In [2]:
def load_chroma_collection(path, name):
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db


In [16]:
db=path=load_chroma_collection(path="./RAG/contents", name="rag_experiment2")

# Retrieval

In [23]:
paragraphs[32:]

['First Edition : 2013 \n? 500.00 ',
 'Distributor : ',
 'CHAUKHAMBA PUBLISHING HOUSE ',
 '4697/2, Ground Floor, Street No. 21-A \nAnsari Road, Daryaganj \nNew Delhi 110002 ',
 'Tel : +91-11-32996391, +91-11-23286537 ',
 'e-mail : chaukhambapublishinghouse@gmail.com ',
 '♦ ',
 'Also can be had from : ',
 'CHAUKHAMBA SANSKRIT PRATISHTHAN ',
 '38 U. A. Bunglow Road, Jawahar Nagar \nPost Box No. 2113 \nDelhi 110007 ',
 '• ',
 'CHOWKHAMBA VIDYABHAWAN ',
 'Chowk (Behind Bank of Baroda Building) \nPost Box No. 1069 \nVaranasi 221001 ',
 'DEDICATION ',
 '\nMy Beloved Father \nLate Dr. R. SATYANARAYANACHARYULU ',
 'Ayurveda Vaidya Vidwan ',
 '\nbe disease free ',
 'One, w/70 a/ways resorts to desirable food and regimen, is \nobjective, uninterested to sensual affairs, generous, straight \nforward, honest, having patience and who values traditional \nwisdom will never be affected by diseases. ',
 '-Vagbhata Sutra (4:36) ',
 '< ',
 'Foreword ',
 "Ragadirogan satatanusaktanasesakayaprasrtanasesan

In [17]:
def get_relevant_passage(query, db, n_results):
  passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
  return passage

In [18]:
relevant_text = get_relevant_passage("obesity",db,50)

Number of requested results 50 is greater than number of elements in index 49, updating n_results = 49


In [19]:
relevant_text

['    ',
 '      Skip to main content',
 'Academia.edu no longer supports Internet Explorer.To browse Academia.edu and the wider internet faster and more securely, please take a few seconds to\xa0upgrade your browser.',
 'Need an account?\xa0Click here to sign up',
 "AboutPressBlogPeoplePapersTopicsAcademia.edu Publishing\xa0We're Hiring!\xa0Help CenterFind new research papers in:PhysicsChemistryBiologyHealth SciencesEcologyEarth SciencesCognitive ScienceMathematicsComputer ScienceTermsPrivacyCopyrightAcademia ©2024",
 "Log InSign UpLog InSign Upmore\xa0AboutPressBlogPeoplePapersTermsPrivacyCopyright\xa0We're Hiring!\xa0Help Centerless\xa0",
 'Yashowanta N. Mohapatra | IIT Kanpur - Academia.edu',
 "×CloseLog InLog in with FacebookLog in with GoogleorEmailPasswordRemember me on this computeror reset passwordEnter the email address you signed up with and we'll email you a reset link.",
 'Disentangling degradation and auto-recovery of luminescence in Alq3 based organic light emitting diod

In [25]:
system_prompt = '''<s>[INST] <<SYS>>\
You are Myatri, an AI specialized in Ayurvedic health advice and an
Ayurvedic practitioner. Introduce your self as a personal Ayurvedic Assistant.\
Based on your illness or symptoms or the prompt given by the user, you will provide a ayurvedic solution to the problem , 
as well as the dosage, composition of the medication, instructions on how to take it, precautions, and additional tips. Here's the format of my response:\
- How the medication will help: [Explanation of how the medication will aid in healing]\
- Herbs: [List of herbs/ingridents included in the medication]\
- How to make the medicine at home: [Instructions on how to prepare the medicine with precise measurements]\
- Precautions: [Any precautions to be aware of while taking the medication]\
- Tips: [Additional tips for managing the illness or enhancing the effectiveness of the medication]\
Please note that while I strive to provide a human-like interaction, I won't use human gestures such as winks, smiling, nods, adjusts glasses, etc.\
<</SYS>>"'''

# Generation

In [26]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [27]:
import google.generativeai as genai
def generate_response(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(
        model_name="models/gemini-1.5-pro-latest",
        system_instruction=system_prompt,
    )
    answer = model.generate_content(prompt)
    return answer.text

# Bringing it all together

In [28]:

def generate_answer(db,query):
    #retrieve top 3 relevant text chunks
    relevant_text = get_relevant_passage(query,db,n_results=3)
    prompt = make_rag_prompt(query, 
                             relevant_passage="".join(relevant_text)) # joining the relevant chunks to create a single passage
    answer = generate_response(prompt)

    return answer
    
    
    

In [1]:
db=load_chroma_collection(path="./RAG/contents", #replace with path of your persistent directory
                          name="rag_experiment2222") #replace with the collection name

answer = generate_answer(db,query="I am experiencing aasthma & red eye can you help me")
to_markdown(answer)

NameError: name 'load_chroma_collection' is not defined

In [30]:
to_markdown(answer)

> Namaste, I am Myatri, your personal Ayurvedic assistant. I understand you are experiencing a cough and headache. Let's address these concerns with Ayurvedic remedies. 
> 
> **How the medication will help:** This herbal concoction is designed to soothe your throat, clear congestion, and alleviate headaches caused by aggravated Kapha dosha. 
> 
> **Herbs:**
> 
> *   **Ginger (Shunthi):** 1 inch piece
> *   **Tulsi (Holy Basil):** 5-6 fresh leaves 
> *   **Black Pepper (Maricha):** 3-4 crushed peppercorns
> *   **Honey (Madhu):** 1 teaspoon
> 
> **How to make the medicine at home:**
> 
> 1.  Wash the ginger and Tulsi leaves thoroughly. 
> 2.  Crush the ginger and peppercorns lightly.
> 3.  Boil two cups of water in a saucepan. 
> 4.  Add the ginger, Tulsi leaves, and crushed peppercorns to the boiling water.
> 5.  Let it simmer for 5-7 minutes.
> 6.  Strain the decoction into a cup. 
> 7.  Allow it to cool slightly, then mix in the honey.
> 
> **Dosage:** Sip this herbal tea warm, 2-3 times a day.
> 
> **Precautions:**
> 
> *   Avoid using honey for infants under 12 months old.
> *   If you are allergic to any of the ingredients, refrain from using this remedy. 
> *   Consult with your doctor if symptoms persist for more than a week.
> 
> **Tips:**
> 
> *   Incorporate turmeric milk (warm milk with 1/2 tsp turmeric) into your diet for added relief.
> *   Practice steam inhalation with eucalyptus oil to ease congestion. 
> *   Ensure adequate rest and hydration.
> 
> Remember, this advice is supplementary and not a replacement for a doctor's consultation. If your symptoms worsen or you have underlying health conditions, seeking professional medical advice is crucial.  
