In [1]:
import os
os.chdir('../')

In [76]:
import chromadb
from langchain.vectorstores import Chroma

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def load_pdf_file(data):
    loader= DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [5]:
extracted_data=load_pdf_file(data='data/')

In [6]:
def split_text_data(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks=split_text_data(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 27045


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

In [9]:
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [77]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
vector_store = Chroma.from_documents(text_chunks, embeddings, persist_directory="./chroma_db")
vector_store.persist()
print("ChromaDB successfully stored the embeddings locally.")

ChromaDB successfully stored the embeddings locally.


  vector_store.persist()


In [89]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
retriever = Chroma(persist_directory="./chroma_db", embedding_function=embeddings).as_retriever()

In [90]:
from langchain_ollama.llms import OllamaLLM
llm = OllamaLLM(model="llama3")

In [91]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [92]:
system_prompt = (
    """
    You are a professional medical chatbot designed to assist users by answering queries based on the provided medical PDF document. 
    Your responses must be strictly derived from the contents of the document, ensuring accuracy, clarity, and compliance with medical 
    guidelines. If the document does not contain the requested information, politely inform the user. Avoid making assumptions or 
    providing medical advice beyond the document's scope.
    {context}
    """
)

In [93]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [94]:
qa_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, qa_chain)

In [95]:
response = rag_chain.invoke({"input": "what is immunoglobulin?"})
print(response)

{'input': 'what is immunoglobulin?', 'context': [Document(metadata={'page': 138, 'page_label': '109', 'source': 'data\\medical_book.pdf', 'text': 'the possible depletion of one or more types of blood\nKEY TERMS\nAlkylating agent— Achemicalthataltersthecompo-\nsition of the genetic material of rapidly dividing cells,\nsuch as cancer cells, causing selective cell death;\nused as a chemotherapeutic agent to treat B-CLL.\nAntibody— A protective protein made by the\nimmune system in response to an antigen, also\ncalled an immunoglobulin.\nAutoimmune— An immune reaction of a patient\nagainst their own cells.\nHumanization— Fusing the constant and variable\nframework region of one or more human immuno-\nglobulins with the binding region of an animal\nimmunoglobulin, done to reduce human reaction\nagainst the fusion antibody.\nMonoclonal— Genetically engineered antibodies\nspecific for one antigen.\nTumor lysis syndrome— A side effect of some immu-'}, page_content='the possible depletion of on

In [96]:
response.keys()

dict_keys(['input', 'context', 'answer'])

In [98]:
print(response["answer"])

According to the provided medical PDF document, Immunoglobulin (also known as Antibody) is a protein molecule formed by mature B cells in response to foreign proteins in the body. There are five types of immunoglobulins, but the major one is gamma globulin or immunoglobin G. It's also mentioned that an antibody is a simple protein produced by the body to destroy bacteria, viruses, or other foreign bodies, and its production is triggered by a specific antigen.


In [99]:
response = rag_chain.invoke({"input": "what is data structures?"})
print(response["answer"])

I'm happy to help! However, I must inform you that the provided PDF document does not contain information about "data structures." The text primarily discusses genes, chromosomes, and DNA, which are related to genetics and biology. If you're looking for information on data structures in a computer science context, I'd be happy to assist you with that as well, but please note that it would require a different document or source of information.
