In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
import os
os.chdir("../")

In [4]:
#extract data from the PDF file

def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader
    )
    documents = loader.load()

    return documents

In [5]:
extracted_data = load_pdf_file(data='Data/')

In [6]:
#split the data into text chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("length of the chunks",len(text_chunks))

length of the chunks 6973


In [8]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [9]:
#download the embedding from hugging face

def download_hugging_face_embeddings():
    embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


#### the model has 384 dimensions

In [11]:
query_result = embeddings.embed_query("hello world")
print(len(query_result))

384


In [19]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"

pc.create_index(
    name = index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)




{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-8x7n57s.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [13]:
import os
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [14]:
import os
os.environ["PINE_CONE_API_KEY"] = PINECONE_API_KEY

In [17]:
index_name = "medical-chatbot"

In [20]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name = index_name,
    embedding=embeddings
)

In [21]:
#load existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [22]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1bf186f9b70>

In [23]:
retriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})

In [24]:
retrieved_docs = retriever.invoke("what is Acne")

In [25]:
retrieved_docs

[Document(id='3ffbb2fb-da4a-4d83-865a-9990803ae43a', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 423.0, 'page_label': '424', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\medical_book.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='thing that irritates the skin and is manifested by one or\nmore lines of red, swollen, blistered skin that may itch or\nGALE ENCYCLOPEDIA OF MEDICINE 21036\nDermatitis'),
 Document(id='fcc3b6f0-435e-475c-945a-c4115bd00788', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 297.0, 'page_label': '298', 'producer': 'GPL Ghostscript 9.10', 'source': 'Data\\medical_book.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='repeated exposure to an allergen (an allergy-causing sub-\nstance) triggers an immune response th

## LLM integration

In [26]:
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [31]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [42]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)

In [43]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = """
You are an expert medical assistant. Your primary task is to provide accurate and helpful information based on the medical text provided as context.

**Instructions:**
1.  Analyze the user's question to determine if they are describing symptoms or asking for information about a specific medical condition.
2.  **If the user describes symptoms:** Use the provided context to identify the most likely disease or condition. Clearly state the condition and provide a summary based *only* on the retrieved context. Include any relevant information about general treatment or management mentioned in the text.
3.  **If the user asks about a specific condition:** Use the context to provide a detailed explanation of that condition.
4.  **If the context is insufficient:** If the provided text does not contain relevant information to answer the question, you must state that you cannot find the answer in your available data. Do not use outside knowledge or make up information.
5.  **Mandatory Disclaimer:** Always conclude every response with the following disclaimer on a new line:
    "Disclaimer: This is for informational purposes only. Consult a qualified medical professional for any health concerns."

**Context:**
{context}
"""

# --- 4. Create the RAG Chain with the Desired Structure ---
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [47]:
response = rag_chain.invoke({"input":"what is acne"})
print(response["answer"])

I cannot find the answer to your question in the provided text.  

Disclaimer: This is for informational purposes only. Consult a qualified medical professional for any health concerns.


In [53]:
response = rag_chain.invoke({"input":"what diseases are diagnosed with cystoscopy"})
print(response["answer"])

Based on the provided text, cystoscopy can detect inflammation of the bladder lining, prostatic enlargement, and tumors.  Further evaluation or biopsies may be needed depending on the findings, and some tumors may be removed during the procedure.

Disclaimer: This is for informational purposes only. Consult a qualified medical professional for any health concerns.
