In [1]:
import chromadb
import sentence_transformers

In [2]:
# creating the client
client=chromadb.Client()
collection=client.create_collection("demo") # collection is storing our embeddings

# By default chroma used the sentence transformer all-MIniLM-L6-v2 model to creat embeddings

In [3]:
collection.add(
    documents=["this is a document about cat","this is a document about car"],
    metadatas=[{"category":"animal"},{"category":"vehicle"}],
    ids=["id1","id2"]
)# here adding the collection without mentioning any embedding mdoel , if we can't then it use its default embedding and fist convert our text into embeddings and then it stores it into the collection in chroma db

In [4]:
# lets query
results=collection.query(
    query_texts=['vehicle'],
    n_results=1
)
results

{'ids': [['id2']],
 'distances': [[0.8069301843643188]],
 'metadatas': [[{'category': 'vehicle'}]],
 'embeddings': None,
 'documents': [['this is a document about car']],
 'uris': None,
 'data': None}

In [3]:
import pypdf
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
#Extract data from the pdf
def load_pdf(data):
    loader=DirectoryLoader(data,
        glob="*.pdf",
        loader_cls=PyPDFLoader)
    doc=loader.load()
    return doc

In [5]:
#loading the data
extracted_data=load_pdf("data/")

In [6]:
len(extracted_data)

637

In [7]:
# coverting the corpus into chunka
#create a text chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks  

In [8]:
text_chunks=text_split(extracted_data)

In [9]:
len(text_chunks)

7020

In [10]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
batch_embedding_vectors = []

# Creating embeddings for each of the text chunks using the model
for i, t in enumerate(text_chunks):
    embedding = model.encode(t.page_content)
    embedding_vector = {'id': str(i), 'values': embedding.tolist()}

    # Include text content as metadata
    embedding_vector['metadata'] = {'text': t.page_content}

    batch_embedding_vectors.append(embedding_vector)


In [12]:
len(batch_embedding_vectors)

7020

In [13]:
from langchain_community.vectorstores import Chroma

# Assuming you have already created batch_embedding_vectors as shown before
texts = [vector['metadata']['text'] for vector in batch_embedding_vectors]
metadatas = [vector['metadata'] for vector in batch_embedding_vectors]
embeddings = [vector['values'] for vector in batch_embedding_vectors]
ids = [vector['id'] for vector in batch_embedding_vectors]

# Create Chroma vector store instance
db = Chroma()


In [14]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [15]:
from langchain_community.vectorstores import Chroma

In [16]:
# Create a custom embedding class
class CustomEmbeddingFunction:
    def __init__(self, text_to_embedding, model):
        self.text_to_embedding = text_to_embedding
        self.model = model  # Store the model for query embedding

    def embed_documents(self, texts):
        return [self.text_to_embedding[text] for text in texts]

    def embed_query(self, query):
        # Use the model to create an embedding for the query
        return self.model.encode(query).tolist()

# Assuming you have already created batch_embedding_vectors as shown before
texts = [vector['metadata']['text'] for vector in batch_embedding_vectors]
metadatas = [vector['metadata'] for vector in batch_embedding_vectors]

# Create a mapping from texts to their corresponding embeddings
text_to_embedding = {vector['metadata']['text']: vector['values'] for vector in batch_embedding_vectors}

# Assuming you used a SentenceTransformer model to create embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create an instance of the custom embedding class
embedding_function = CustomEmbeddingFunction(text_to_embedding, model)

# Create Chroma vector store from texts
db = Chroma.from_texts(
    texts=texts,
    embedding=embedding_function,
    metadatas=metadatas
)

In [17]:
query="What are Allergies"
result=db.similarity_search(query)
result[0]

Document(page_content="GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE", metadata={'text': "GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histami

In [18]:
query="What is Selective abortion"
result=db.similarity_search(query)
result[0]

Document(page_content='mother in carrying and giving birth to more than one ortwo babies, and also to decrease the risk of complicationsto the remaining fetus(es). The term selective abortionalso refers to choosing to abort a fetus for reasons such asthe woman is carrying a fetus which likely will be bornwith some birth defect or impairment, or because the sexof the fetus is not preferred by the individual.\nPurpose\nA woman may decide to abort for health reasons, for', metadata={'text': 'mother in carrying and giving birth to more than one ortwo babies, and also to decrease the risk of complicationsto the remaining fetus(es). The term selective abortionalso refers to choosing to abort a fetus for reasons such asthe woman is carrying a fetus which likely will be bornwith some birth defect or impairment, or because the sexof the fetus is not preferred by the individual.\nPurpose\nA woman may decide to abort for health reasons, for'})

In [19]:
query="what are the symptoms of cancer"
result=db.similarity_search(query)
result[0]

Document(page_content='• nausea and vomiting or sudden attacks of vomiting not\naccompanied by nausea\n• seizures• dizziness, loss of coordination or balance• personality changes• sudden loss of vision• memory loss• speech problems• sensory changes• mental impairment• weakness or paralysis on one side of the body\nA doctor should be notified whenever a patient expe-\nriences one or more of the symptoms.\nDiagnosis\nAlthough brain tumor symptoms resemble those of', metadata={'text': '• nausea and vomiting or sudden attacks of vomiting not\naccompanied by nausea\n• seizures• dizziness, loss of coordination or balance• personality changes• sudden loss of vision• memory loss• speech problems• sensory changes• mental impairment• weakness or paralysis on one side of the body\nA doctor should be notified whenever a patient expe-\nriences one or more of the symptoms.\nDiagnosis\nAlthough brain tumor symptoms resemble those of'})

In [20]:
# youtub tutorial by Pradip Nichite
"""from chromadb.config import Settings
client=chromadb.Client(Settings(
    chroma_db_impl="medical+chatbot",
    persist_directory="chatbot_db"))"""
#By this code it will create a directory with name of chatbot where it will store all embeddings

'from chromadb.config import Settings\nclient=chromadb.Client(Settings(\n    chroma_db_impl="medical+chatbot",\n    persist_directory="chatbot_db"))'

In [21]:
#for saving for chroma db file embedidng in zip file
#!zip -r "db name" "new name

In [22]:
# defining our prompt template
prompt_template="""
use the following pieces of information to answer the user's question
If you don't know the answer , just say that you don't know, don't try to make up an answer.

Context:{context}
Question:{question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""


In [23]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pypdf
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import sentence_transformers
from langchain.chains.question_answering import load_qa_chain

In [24]:
PROMPT=PromptTemplate(template=prompt_template,input_variables=['context','question'])
chain_type_kwargs={'prompt':PROMPT}

In [25]:
#Loading our Llama model
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                 model_type="llama",
                 config={'max_new_tokens':512,
                         'temperature':0.8})

In [26]:
chain=load_qa_chain(llm,chain_type="stuff")

In [28]:
query = "What are the symptoms of cancer"
matching_doc = db.similarity_search(query)

# Adjust the invocation to pass a single dictionary argument
input_data = {
    "input_documents": matching_doc,
    "question": query
}

# Use invoke method with the correct input
answer = chain.invoke(input_data)
print(answer)


{'input_documents': [Document(page_content='• nausea and vomiting or sudden attacks of vomiting not\naccompanied by nausea\n• seizures• dizziness, loss of coordination or balance• personality changes• sudden loss of vision• memory loss• speech problems• sensory changes• mental impairment• weakness or paralysis on one side of the body\nA doctor should be notified whenever a patient expe-\nriences one or more of the symptoms.\nDiagnosis\nAlthough brain tumor symptoms resemble those of', metadata={'text': '• nausea and vomiting or sudden attacks of vomiting not\naccompanied by nausea\n• seizures• dizziness, loss of coordination or balance• personality changes• sudden loss of vision• memory loss• speech problems• sensory changes• mental impairment• weakness or paralysis on one side of the body\nA doctor should be notified whenever a patient expe-\nriences one or more of the symptoms.\nDiagnosis\nAlthough brain tumor symptoms resemble those of'}), Document(page_content='Most individuals wit

In [None]:
#doc=docsearch.similarity_search(query,k=3)

In [29]:
# now we need to create our question answering object
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

In [30]:
'''qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)'''

'qa=RetrievalQA.from_chain_type(\n    llm=llm, \n    chain_type="stuff", \n    retriever=docsearch.as_retriever(search_kwargs={\'k\': 2}),\n    return_source_documents=True, \n    chain_type_kwargs=chain_type_kwargs)'

In [31]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

Input Prompt: What is Selective abortion


  warn_deprecated(


Response :  Selective abortion, also known as selective selection reduction, selective reducselective reducselelective, selective pregnosis, selective reduction, selective reduc selectsreductionaltereotective reductions to decrease the termination of fetal reductusually referred to selective reducselective reduction, selective as elective for choose who aredualso reduction abortion, selective reductions.


KeyboardInterrupt: Interrupted by user