## End-to-End Medical Chatbot using RAG and LLMs

In [1]:
# Check the current working directory; the %pwd magic below prints it,
# which shows where the notebook kernel was started from.
import os
%pwd

'f:\\GenAi_Project\\End-to-End_Medical_chatbot\\research'

In [2]:
##Change the current working directory to the project root
os.chdir('F:/GenAi_Project/End-to-End_Medical_chatbot')
#Checking the current working directory again
%pwd

'F:\\GenAi_Project\\End-to-End_Medical_chatbot'

In [21]:
#importing the necessary libraries
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

In [4]:
#Load every PDF file found in a directory

def load_pdf(file_path):
    """
    Load every PDF found under a directory.

    Despite the parameter name, ``file_path`` is handed to ``DirectoryLoader``
    as the directory to search, not as the path of a single PDF file.

    Args:
        file_path (str): Directory that is searched recursively
            (glob ``**/*.pdf``) for PDF files.

    Returns:
        list: The documents produced by ``DirectoryLoader.load()``, with each
        matching file parsed by ``PyPDFLoader``. This is a list of document
        objects, not a single string.
    """
    loader = DirectoryLoader(
        file_path,
        glob="**/*.pdf",  # "**" makes the search recurse into sub-folders
        loader_cls=PyPDFLoader,
    )
    return loader.load()

In [5]:
# Load every PDF under Data/; the path is relative, so it is resolved
# against the current working directory of the kernel.
extracted_data=load_pdf(file_path='Data/')

In [6]:
# Number of documents loaded (the recorded run shows 637, the same value as
# the 'total_pages' metadata of Medical_book.pdf seen further down).
len(extracted_data)  

637

In [7]:
#Split the text into smaller chunks
def text_split(extracted_data, chunk_size=1000, chunk_overlap=200):
    """
    Split the loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data (list): Documents to split, as returned by ``load_pdf``.
        chunk_size (int): Size limit handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            this notebook used originally.
        chunk_overlap (int): Overlap between consecutive chunks handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 200, the value
            this notebook used originally.

    Returns:
        list: The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [8]:
# Split the loaded documents into chunks with text_split's settings.
text_chunks = text_split(extracted_data)
# Number of text chunks created
len(text_chunks)

3426

In [9]:
#Download the embeddings model from HuggingFace

def download_hugging_face_embeddings(model_name='all-MiniLM-L6-v2'):
    """
    Create the HuggingFace embedding model used to vectorize text chunks.

    Args:
        model_name (str): Name of the model passed to
            ``HuggingFaceEmbeddings``. Defaults to ``'all-MiniLM-L6-v2'``,
            the model this notebook used originally. The Pinecone index in
            this notebook is created with ``dimension = 384``, so any other
            model must produce vectors of that size (or the index dimension
            must be changed to match).

    Returns:
        HuggingFaceEmbeddings: The embeddings object for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [11]:
embeddings = download_hugging_face_embeddings()

In [12]:
# Load environment variables from .env file
from dotenv import load_dotenv
import os

load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# os.environ.get returns None for a missing variable, which would only surface
# later as a confusing error inside the Pinecone or Groq client. Stop here with
# a message that names the missing keys instead.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

In [13]:
# Initialize Pinecone
# Create a Pinecone index for storing the embeddings (only if it is missing)
from pinecone import Pinecone, ServerlessSpec
# NOTE: the grpc alias is not referenced by any other cell shown in this
# notebook; the import is kept unchanged.
from pinecone.grpc import PineconeGRPC as grpc

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot-index"

# create_index raises when an index with this name already exists, so the call
# is guarded to keep the cell re-runnable (Restart Kernel -> Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the size of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)


{
    "name": "medical-chatbot-index",
    "metric": "cosine",
    "host": "medical-chatbot-index-zqyq6px.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [14]:
# Embed each text chunk and store it in the Pinecone index
from langchain_pinecone import PineconeVectorStore

# from_documents uploads every chunk each time it is called, so re-running this
# cell against an already populated index would store the same chunks again.
# Only upload when the index is still empty; otherwise just connect to it.
# NOTE: a partially filled index (e.g. after an interrupted upload) also counts
# as populated here and must be cleared manually before re-uploading.
_index_stats = pc.Index(index_name).describe_index_stats()
if _index_stats.total_vector_count == 0:
    docsearach = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearach = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [15]:
# Load the existing index: connect to the Pinecone index named by index_name
# and pair it with the same embeddings object used for the chunks.
# The import repeats the one in the upload cell so this cell can be run
# without executing that cell first.
from langchain_pinecone import PineconeVectorStore
docsearach = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [16]:
# Display the vector store object to confirm it was created
# (the variable is spelled 'docsearach' consistently across the notebook).
docsearach

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2a03048a740>

In [17]:
# Expose the Pinecone-backed vector store as a retriever so that similarity
# searches over the indexed chunks can be run through it.
# "k" is the number of similar documents returned per query.
retriever = docsearach.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [18]:
# Sanity-check retrieval with a sample question.
# This queries the vector store directly (not the retriever built above), and
# no k is passed, so the retriever's "k": 5 setting does not apply here.
retrived_docs = docsearach.similarity_search("What is the treatment for diabetes?")
retrived_docs

[Document(id='db7ec59b-e4b9-4800-888f-8dae15b7a40e', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 275.0, 'page_label': '276', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='ble for glucose utilization. It is effective in both types of\ndiabetes, since, even in insulin resistance, some sensitivi-\nty remains and the condition can be treated with larger\ndoses of insulin. Most insulins are now produced by\nrecombinant DNA techniques, and are chemically identi-\ncal to natural human insulin. Isophane insulin suspen-\nsion, insulin zinc suspension, and other formulations are\nintended to extend the duration of action of insulin, and\npermit glucose control over longer periods of time.\nSulfonylureas (chlorpropamide [Diabinese], tolaza-\nmide [Tolinase], glipizide [Glucotrol] and others) act by\nincreasing insulin release from the beta cells of the 

In [22]:
# Chat model used to generate answers; the API key is the GROQ_API_KEY value
# read from the environment earlier in the notebook.
# NOTE(review): confirm the 'gemma2-9b-it' model id is still offered by Groq
# before re-running.
llm = ChatGroq(groq_api_key=GROQ_API_KEY,model_name='gemma2-9b-it')

In [23]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the model. The adjacent string literals are
# concatenated into one string; "{context}" is a template placeholder
# (presumably filled with the retrieved documents by the documents chain
# built from this prompt - confirm against the LangChain docs).
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, followed by the
# human message whose "{input}" placeholder receives the user's question
# (the chain is later invoked with an "input" key).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [24]:
# Combine the LLM and the prompt into a question-answering chain, then put the
# retriever in front of it to form the retrieval chain used for queries.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [25]:
# Ask a sample question through the retrieval chain; the question is passed
# under the "input" key and the generated text is read from the "answer" key
# of the response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of growth hormone from the pituitary gland.  This leads to excessive growth of bones and soft tissues, as well as other body disturbances.  Gigantism is a variant of acromegaly that occurs in children before their growth plates close. 



