In [1]:
# Prints a fixed string; presumably a quick check that the kernel executes cells.
print("ok")

ok


In [2]:
import os
from pathlib import PureWindowsPath

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [3]:
def load_pdf(data):
    """Load the PDF files in directory ``data`` as LangChain documents.

    Args:
        data: Path of the directory to scan; only entries matching the
            glob ``*.pdf`` are handed to ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:
# Directory holding the source PDFs. The default is the path this notebook
# was originally run with; set the AYURVEDA_DATA_DIR environment variable to
# run it on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "AYURVEDA_DATA_DIR",
    "C://Users//Shreyash Verma//AYURVEDA_CHATBOT//data",
)
extracted_data = load_pdf(DATA_DIR)

In [7]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook used.
        chunk_overlap: Value passed as ``chunk_overlap``. Defaults to 20,
            the value this notebook used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 1608


In [9]:
# Display the first chunk to inspect its page_content and metadata.
text_chunks[0]

Document(page_content='ALSO BY THE AUTHOR\nAyurveda Cooking for Self-Healing\n (with Usha Lad)\nThe Yoga of Herbs\n (with David Frawley)\nAyurveda: The Science of Self-Healing\nSecrets of the Pulse', metadata={'source': 'C:\\Users\\Shreyash Verma\\AYURVEDA_CHATBOT\\data\\The Complete Book of Ayurvedic Home Remedies.pdf', 'page': 1})

In [10]:
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used for chunks and queries.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook was run with, which
            produced 384-dimensional vectors in the recorded output. If a
            different model is used, the Pinecone index dimension must
            match its output size.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [11]:
# Instantiate the embedding model once; later cells reuse `embeddings`.
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Display the embedding model's repr to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [13]:
# Embed a sample string and print the vector length; this length is the
# dimension the Pinecone index must be created with.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [14]:
# NOTE: this import rebinds the name `Pinecone`, which the imports cell bound
# to LangChain's vector-store class; from here on `Pinecone` is the client.
from pinecone import Pinecone

# Read the API key from the environment instead of embedding it in the
# notebook, so the secret is never saved or committed with the file.
# Raises KeyError if PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# The handle to the "ayurveda" index is intentionally not opened here: the
# index may not exist yet on a first run. `index` is assigned in the
# index-creation cell, after the existence check.

In [15]:
from pinecone import ServerlessSpec

# Serverless deployment target passed to `pc.create_index` when the index
# has to be created.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [16]:
import time

# Create the Pinecone index if it is missing, then connect to it and display
# its statistics.
index_name = 'ayurveda'
# Names of the indexes that already exist for this client.
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# Only create the index when it does not exist yet (expected on a first run).
if index_name not in existing_indexes:
    # Create the index with the serverless spec defined in an earlier cell.
    pc.create_index(
        index_name,
        dimension=384,  # output size of all-MiniLM-L6-v2 (see the "Length 384" check above)
        metric='cosine',
        spec=spec
    )
    # Poll once per second until the index reports ready.
    # NOTE(review): there is no timeout, so this loops forever if the index
    # never becomes ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the index; later cells use `index` for upserts and queries.
index = pc.Index(index_name)
time.sleep(1)
# Display index stats (last expression of the cell).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1608}},
 'total_vector_count': 1608}

In [17]:
from tqdm.auto import tqdm

In [18]:
batch_size = 100

# Embed the chunks in batches and upsert them into Pinecone.
#
# Vector ids must be identical on every run so that re-running this cell
# overwrites existing vectors instead of adding new ones. The ids are built
# from the PDF file name (not the full path, which changes with the directory
# the data was loaded from), the page number and the chunk's position in
# `text_chunks`. Ids built from the full path doubled the index in the
# recorded run (1608 -> 3216 vectors).
# NOTE: vectors already stored under the old path-based ids are not removed
# by this cell; delete them from the index separately.
for batch_start in tqdm(range(0, len(text_chunks), batch_size)):
    # Slicing past the end is safe, so the last batch is simply shorter.
    batch = text_chunks[batch_start:batch_start + batch_size]

    # PureWindowsPath splits on both "\\" and "/", so the file name is
    # extracted correctly whichever separator the loader recorded.
    ids = [
        f"{PureWindowsPath(chunk.metadata['source']).name}"
        f"-{chunk.metadata['page']}-{batch_start + offset}"
        for offset, chunk in enumerate(batch)
    ]

    # Text to embed, in the same order as `ids`.
    texts = [chunk.page_content for chunk in batch]
    embeds = embeddings.embed_documents(texts)

    # Metadata stored next to each vector; the 'text' key is what the
    # retrieval cells read back.
    metadata = [
        {
            'text': chunk.page_content,
            'source': chunk.metadata['source'],
            'page': chunk.metadata['page'],
        }
        for chunk in batch
    ]

    # Materialise the (id, values, metadata) tuples as a list before upserting.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

print("Data uploaded to Pinecone successfully.")

100%|██████████| 17/17 [01:04<00:00,  3.81s/it]

Data uploaded to Pinecone successfully.





In [19]:
# Re-check the index statistics after the upload to confirm the vector count.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3216}},
 'total_vector_count': 3216}

Retrieval Augmented Generation

In [20]:
from langchain.vectorstores import Pinecone

# Metadata key that holds the chunk text. It must match the key written by
# the upload cell ('text'); with any other key the vector store cannot find
# the passage text in the stored metadata.
text_field = "text"

# LangChain vector-store wrapper around the existing Pinecone index; queries
# are embedded with the same model that embedded the chunks.
vectorstore = Pinecone(
    index=index,
    embedding_function=embeddings.embed_query,
    text_key=text_field
)


In [21]:
# Sample retrieval: embed a fixed question and ask Pinecone for its three
# nearest chunks, returning both the vector values and the stored metadata.
query_text = "What is ayurveda ?"
query_vector = embeddings.embed_query(query_text)
response = index.query(vector=query_vector, top_k=3, include_values=True, include_metadata=True)

In [22]:
# Display the raw matches returned by the query above.
response.matches


[{'id': 'data\\The Complete Book of Ayurvedic Home Remedies.pdf-9-16',
  'metadata': {'page': 9.0,
               'source': 'data\\The Complete Book of Ayurvedic Home '
                         'Remedies.pdf',
               'text': 'purpose of this book is to acquaint you with these '
                       'natural methods, so you can make the\n'
                       'lifestyle choices and learn the self-healing modalities '
                       'that are right for you in order to\n'
                       'create, maintain, or restore health and balance.\n'
                       'Ayurveda\n'
                       ' is a Sanskrit word that means “the science of life and '
                       'longevity.” According\n'
                       'to this science, every individual is both a creation of '
                       'cosmic energies and a unique\n'
                       'phenomenon, a unique personality. Ayurveda teaches that '
                       'we all have a cons

In [23]:
# Print every match (numbered from 1) with its text, source file and page;
# the page falls back to 'N/A' when the metadata has no 'page' entry.
for rank, match in enumerate(response['matches'], start=1):
    meta = match['metadata']
    print(f"Result {rank}:")
    print(f"Text: {meta['text']}")
    print(f"Source: {meta['source']}")
    print(f"Page: {meta.get('page', 'N/A')}")
    print()

Result 1:
Text: purpose of this book is to acquaint you with these natural methods, so you can make the
lifestyle choices and learn the self-healing modalities that are right for you in order to
create, maintain, or restore health and balance.
Ayurveda
 is a Sanskrit word that means “the science of life and longevity.” According
to this science, every individual is both a creation of cosmic energies and a unique
phenomenon, a unique personality. Ayurveda teaches that we all have a constitution,
Source: data\The Complete Book of Ayurvedic Home Remedies.pdf
Page: 9.0

Result 2:
Text: purpose of this book is to acquaint you with these natural methods, so you can make the
lifestyle choices and learn the self-healing modalities that are right for you in order to
create, maintain, or restore health and balance.
Ayurveda
 is a Sanskrit word that means “the science of life and longevity.” According
to this science, every individual is both a creation of cosmic energies and a unique
phenomenon,

In [24]:
# Load the local Llama-2 chat model file through CTransformers, generating
# up to 512 new tokens per call at temperature 0.8.
llm = CTransformers(
    model="C://Users//Shreyash Verma//AYURVEDA_CHATBOT//model//llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={'max_new_tokens': 512, 'temperature': 0.8},
)

In [39]:
# Step 1: read the user's question.
query_text = input("Enter your query: ")

# Step 2: embed the question with the model that embedded the chunks.
query_vector = embeddings.embed_query(query_text)

# Step 3: retrieve the two nearest chunks. Only the metadata (which carries
# the chunk text) is needed, so the vector values are not requested.
response = index.query(
    vector=query_vector,
    top_k=2,
    include_values=False,
    include_metadata=True
)

# Step 4: build the context from the retrieved passages. Joining the list
# copes with fewer than two matches (indexing [0] and [1] would raise
# IndexError), and dict.fromkeys drops repeated passages while keeping
# their order.
similar_texts = [match['metadata']['text'] for match in response['matches']]
context = " ".join(dict.fromkeys(similar_texts))

# Everything inside the triple-quoted f-string is sent to the model, so no
# explanatory comments may appear inside it.
augmented_prompt = f"""
Context:
{context}

Question:
{query_text}

Please answer based on the context above. If you do not know the answer, respond with "I don't know."
"""

# Step 5: generate a response with the language model.
output = llm(augmented_prompt)

# Step 6: display the generated response.
print("Generated response:")
print(output)

Generated response:

Answer 1: Try using some Ayurvedic herbal soap, such as neem. Let some of the oil remain on your skin. The ancient Ayurvedic textbooks recommend rubbing some chickpea our over the skin to absorb and help remove the oil. This works very well to remove the oil, but it is more suited to a culture in which individuals bathe outdoors. Today, if you use chickpea our, be aware that oil, our, and hot water combine into a formidable mass that can cause skin irritation."
Answer 2: For skin irritation, try using some aloe vera gel on the affected area. It has anti-inflammatory properties that can help soothe and calm the skin. You can also use coconut oil or olive oil to moisturize the skin and reduce inflammend irrigobtain its natural ingreduce inflate excessively help it will help reduce any other creams for relief irrituate dry out some tea tree tea tree turmerchelpreease the aloe.
