In [1]:
%pwd

'c:\\Users\\DHRUV\\OneDrive\\Desktop\\AI\\Medical-Chatbot-GenAI\\research'

In [2]:
import os

# Step up to the project root only while the kernel is still inside
# research/, so re-running this cell does not keep climbing the tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\DHRUV\\OneDrive\\Desktop\\AI\\Medical-Chatbot-GenAI'

In [32]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [33]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the files matching
        the ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    pdf_directory_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()


In [34]:
# Read the PDFs stored under Data/ (resolved against the current working directory).
extracted_data = load_pdf_file(data="Data/")

In [35]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def smart_text_split(sections):
    """Break each section's text into chunks while keeping its labels.

    Args:
        sections: Iterable of mappings with ``'disease'``, ``'section'`` and
            ``'content'`` keys.

    Returns:
        A flat list of dicts, one per chunk. Each dict carries the parent
        section's ``disease`` and ``section`` values, the chunk text under
        ``content`` and the chunk's position within its section (starting
        at 0) under ``chunk_number``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    labelled_chunks = []

    for section in sections:
        # Cut the section body first, then tag every piece with its origin.
        pieces = splitter.split_text(section['content'])
        labelled_chunks.extend(
            {
                "disease": section['disease'],
                "section": section['section'],
                "content": piece,
                "chunk_number": position,
            }
            for position, piece in enumerate(pieces)
        )

    return labelled_chunks


In [36]:
# Chunk the loaded PDF documents and report how many chunks came out.
text_chunks = text_split(extracted_data)
chunk_total = len(text_chunks)
print("Length of Text Chunks", chunk_total)

Length of Text Chunks 36325


In [37]:
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings


In [38]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used by the notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that used to be hard-coded. The Pinecone
            index in this notebook is created with ``dimension=384``, the
            vector length this notebook prints for the default model, so a
            different model may need a different index.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [39]:

# Build the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

In [40]:
# Sanity check: embed a short string and show the length of the resulting vector.
query_result = embeddings.embed_query("Hello world")
embedding_length = len(query_result)
print("Length", embedding_length)

Length 384


In [13]:
pip install pinecone

Note: you may need to restart the kernel to use updated packages.


In [41]:
from dotenv import load_dotenv
load_dotenv()

True

In [42]:
# Pull both service credentials from the process environment (None when unset).
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

In [15]:

from pinecone import ServerlessSpec,Pinecone

# SECURITY: a literal Pinecone API key used to be committed on this line.
# Treat that key as leaked (revoke/rotate it) and supply the replacement
# through the PINECONE_API_KEY environment variable or the .env file.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to .env before running this cell."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Create the index only on the first run; later runs reuse it.
if index_name not in pc.list_indexes().names():
    print(f"Creating index: {index_name}")
    pc.create_index(
        name=index_name,
        # 384 is the vector length this notebook prints for the embedding model.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print("Index created successfully ✅")
else:
    print(f"Index '{index_name}' already exists ✅")


Index 'medicalbot' already exists ✅


In [43]:
import os

# Write both keys back into the process environment (presumably so client
# libraries can pick them up). os.environ only accepts strings, so a missing
# key (None) is reported by name instead of surfacing as an opaque TypeError.
for _env_name, _env_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GEMINI_API_KEY", GEMINI_API_KEY),
):
    if _env_value is None:
        raise RuntimeError(
            f"{_env_name} is not set; add it to .env or export it before running this cell."
        )
    os.environ[_env_name] = _env_value

In [44]:
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore


In [56]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only while the index is still empty. Calling
# from_documents on every run inserts the whole corpus again, and similarity
# search then returns the same passage several times under different ids.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name=index_name,
    )
else:
    # Index already holds vectors: attach to it without uploading anything.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )


In [58]:
# Attach to the existing Pinecone index; no documents are passed to this call.
existing_index_options = {"index_name": index_name, "embedding": embeddings}
docsearch = PineconeVectorStore.from_existing_index(**existing_index_options)


In [59]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1dfe7eef9a0>

In [60]:
# Similarity-search retriever configured with k=3.
top_k_search = {"k": 3}
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=top_k_search,
)

In [61]:
# Smoke-test the retriever on its own, before any LLM is involved.
sample_question = "What is Acne?"
retrieved_docs = retriever.invoke(sample_question)

In [62]:
retrieved_docs

[Document(id='6e8a2cbc-afc6-4ce8-89c9-91809a484aa4', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='aa97639d-5f8c-40e0-b27a-57d247a3a401', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='0fa7fbb4-710d-4b5f-bce6-80409e796bd6', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 

In [63]:
from langchain_google_genai import ChatGoogleGenerativeAI

# SECURITY: a literal Google API key used to be committed here. Treat that key
# as leaked (revoke/rotate it); the key is now read from GEMINI_API_KEY, which
# must be present in the environment (e.g. via the .env file).
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ["GEMINI_API_KEY"],
    temperature=0.4,
    max_output_tokens=500,
)



In [64]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# Fixed instructions for the model. {context} and {input} are template
# placeholders that get filled in when the chain runs.
_instruction_parts = [
    "You are an assistant for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use three sentences maximum and keep the ",
    "answer concise.",
]
system_prompt = "".join(_instruction_parts) + "\n\n" + "{context}"

# Two-message chat template: the system instructions, then the user's question.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [65]:
# Compose the pipeline: a documents chain built from the LLM and the prompt,
# wrapped in a retrieval chain that uses the Pinecone retriever.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [66]:
from langchain_google_genai import ChatGoogleGenerativeAI

# SECURITY: a literal Google API key used to be committed here. Treat that key
# as leaked (revoke/rotate it); the key is now read from GEMINI_API_KEY.
# NOTE: rag_chain was already built with the earlier `llm` object, so
# rebinding the name here does not change what the chain uses.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ["GEMINI_API_KEY"],
    convert_system_message_to_human=True,
)


In [67]:
import google.generativeai as genai

# Configure API key.
# SECURITY: a literal Google API key used to be committed here. Treat that key
# as leaked (revoke/rotate it); the key is now read from GEMINI_API_KEY.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create the model instance
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

# Generate content straight from the model: no retriever is involved in this
# call, so the answer does not use the indexed PDF content.
response = model.generate_content("What is Acromegaly and gigantism?")

# Print response
print(response.text)



Acromegaly and gigantism are both conditions caused by **excessive growth hormone (GH) production**. The key difference lies in **when** this excess GH occurs:

*   **Gigantism** occurs in **childhood** before the growth plates in the bones have closed.
*   **Acromegaly** occurs in **adulthood** after the growth plates have closed.

Here's a more detailed breakdown:

**Gigantism:**

*   **Cause:** Excessive GH production during childhood, almost always due to a non-cancerous tumor (adenoma) on the pituitary gland.
*   **Effect:**  Leads to excessive linear growth, resulting in **abnormally tall stature**. Because the growth plates are still open, the bones can continue to grow longer.
*   **Symptoms:**
    *   Excessive height (often significantly above average for their age and family)
    *   Large hands and feet
    *   Thickening of facial features
    *   Joint pain
    *   Excessive sweating
    *   Delayed puberty
    *   Headaches
    *   Vision problems (if the tumor presses o

In [51]:
# Ask the RAG chain about acromegaly and gigantism and show only the answer text.
acromegaly_query = {"input": "what is Acromegaly and gigantism?"}
response = rag_chain.invoke(acromegaly_query)
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland in the brain. This leads to increased growth in bone and soft tissue, along with other disturbances throughout the body.


In [52]:
# Ask the RAG chain "What is stats?" and show only the answer text.
stats_query = {"input": "What is stats?"}
response = rag_chain.invoke(stats_query)
print(response["answer"])


I'm sorry, but the provided text does not contain information about "stats". The text describes a complete blood count (CBC), which is a series of tests to evaluate the cells in the blood.


In [53]:
# Ask the RAG chain for a definition of acne and show only the answer text.
acne_query = {"input": "what is Acne?"}
response = rag_chain.invoke(acne_query)
print(response["answer"])

Acne is a skin disorder where the sebaceous glands become inflamed. It is also known as Acne vulgaris.


In [54]:
# Ask the RAG chain for a definition of diabetes and show only the answer text.
diabetes_query = {"input": "what is Diabetes?"}
response = rag_chain.invoke(diabetes_query)
print(response["answer"])

Diabetes mellitus is a disorder of carbohydrate metabolism brought on by a combination of hereditary and environmental factors. A person with diabetes either does not make enough insulin, or makes insulin that does not work properly. The result is blood sugar that remains high, a condition called hyperglycemia.


In [75]:
# Reverse lookup: give the chain a list of symptoms and ask which disease they point to.
symptom_query = {
    "input": "Lump or area of thickening that can be felt under the skin, Weight changes, including unintended loss or gain, Skin changes, such as yellowing, darkening or redness of the skin, sores that won't heal, or changes to existing moles is a symptom of which disease?"
}
response = rag_chain.invoke(symptom_query)
print(response["answer"])

The provided text discusses moles and skin changes, but it does not mention any specific disease associated with the symptoms you listed. It only mentions that moles with irregular borders, color changes, pain, bleeding, ulceration, or itching should be biopsied.
