In [52]:
# Document Loaders
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
# Text Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from typing import List
from langchain.schema import Document
# Vector stores
from langchain_community.vectorstores import Pinecone
from pinecone import ServerlessSpec,Pinecone
from langchain_pinecone import PineconeVectorStore
# Retrievers
from langchain.retrievers.multi_query import MultiQueryRetriever
# Prompts
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.prompts import PromptTemplate
# Output parser
from langchain_core.output_parsers import StrOutputParser
# Open Source Models
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
# Loading data from .env files
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# helper
# Chat model used both by the LLMChainExtractor compressor and by the final answer chain.
llm = ChatGroq(model="openai/gpt-oss-120b")

In [None]:
# helper
# Sentence-transformers embedding model; used by the semantic chunker and when
# building the Pinecone vector store. Embeddings are requested normalized.
embedding = HuggingFaceEmbeddings(
    encode_kwargs={'normalize_embeddings': True},
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [6]:
%pwd

'/home/sayandip-saha/Desktop/CODING/GEN_AI/Projects/Medical-Chatbot/research'

In [9]:
import os

# Move from the `research/` notebook folder up to the project root so that
# relative paths such as "data" resolve. The guard keeps the cell idempotent:
# re-running it must not climb one directory further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [10]:
%pwd

'/home/sayandip-saha/Desktop/CODING/GEN_AI/Projects/Medical-Chatbot'

In [None]:
# helper
# Load PDF Files
def load_pdf(folder):
    """Load the PDF files found in ``folder``.

    Args:
        folder: Directory handed to ``DirectoryLoader``; files are matched
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(folder, loader_cls=PyPDFLoader, glob="*.pdf")
    return pdf_loader.load()

In [12]:
docs = load_pdf("data")

In [14]:
len(docs)

637

In [16]:
docs[0]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content='')

In [None]:
# helper
def filter_documents(docs):
    """Return slimmed-down copies of ``docs``.

    Each returned ``Document`` keeps the original ``page_content`` but its
    metadata is reduced to the single ``source`` entry (``None`` when the
    input document has no such key). The input documents are not modified.

    Args:
        docs: Iterable of documents exposing ``page_content`` and ``metadata``.

    Returns:
        A new list with one ``Document`` per input document, in order.
    """
    return [
        Document(
            page_content=page.page_content,
            metadata={'source': page.metadata.get("source")},
        )
        for page in docs
    ]

In [18]:
filtered_docs = filter_documents(docs)

In [None]:
# helper
def chunk_text_semantically(docs):
    """Split ``docs`` into chunks with a ``SemanticChunker``.

    The chunker is built on the notebook-level ``embedding`` model and uses
    the "standard_deviation" breakpoint type with a threshold amount of 1.

    Args:
        docs: Documents to split.

    Returns:
        The chunks returned by ``SemanticChunker.split_documents``.
    """
    semantic_splitter = SemanticChunker(
        embedding,
        breakpoint_threshold_amount=1,
        breakpoint_threshold_type="standard_deviation",
    )
    return semantic_splitter.split_documents(docs)

In [35]:
result = chunk_text_semantically(filtered_docs)

In [36]:
# Print every chunk with a 1-based counter to inspect where the splitter cut the text.
for cnt, split in enumerate(result, start=1):
    print(f'split: {cnt}\n {split.page_content}\n')

split: 1
 

split: 2
 The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION

split: 3
 The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
A-B
1

split: 4
 STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow,Manager, Imaging and Multimedia
Content
Robyn V . Young,Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imaging Specialis

In [57]:
result

[Document(metadata={'source': 'data/Medical_book.pdf', 'text': ''}, page_content=''),
 Document(metadata={'source': 'data/Medical_book.pdf', 'text': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data/Medical_book.pdf', 'text': 'The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data/Medical_book.pdf', 'text': 'STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\n

In [58]:
import pickle

# Persist the semantic chunks to disk so they can be reloaded later without
# running the chunker again.
with open("semantic_chunks.pkl", mode="wb") as chunk_file:
    pickle.dump(result, chunk_file)

In [None]:
# Load back
# with open("semantic_chunks.pkl", "rb") as f:
#     chunks = pickle.load(f)

In [45]:
# Read the Pinecone key (expected to come from the environment / .env file
# loaded by load_dotenv() in the imports cell).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # os.environ only accepts strings; assigning None below would fail with an
    # opaque TypeError, so fail early with an actionable message instead.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )
# Re-export explicitly so code that reads the key from the environment sees it.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
# helper
def create_database(index_name, chunks, embedding_model, dimension=384,
                    metric="cosine", cloud="aws", region="us-east-1"):
    """Ensure a Pinecone index exists and load ``chunks`` into it.

    The index is created only when ``pc.has_index(index_name)`` is false; an
    existing index is reused as-is. ``Pinecone`` here is the client class from
    the ``pinecone`` package (its import comes after, and therefore rebinds,
    the ``langchain_community`` name of the same spelling).

    Args:
        index_name: Name of the Pinecone index to create or reuse.
        chunks: Documents passed to ``PineconeVectorStore.from_documents``.
        embedding_model: Embedding object passed as ``embedding``.
        dimension: Vector dimension for a newly created index. Must equal the
            embedding model's output size (384 presumably matches
            all-MiniLM-L6-v2 - confirm if the model changes).
        metric: Distance metric for a newly created index.
        cloud: Cloud provider for the serverless spec.
        region: Cloud region for the serverless spec.

    Returns:
        The ``PineconeVectorStore`` returned by ``from_documents``.

    Note:
        Reads the module-level ``PINECONE_API_KEY``.
        NOTE(review): every call goes through ``from_documents`` again, so
        re-running the cell likely re-inserts the same chunks - confirm
        whether that is acceptable for an existing index.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    return PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embedding_model,
        index_name=index_name,
    )

In [None]:
vector_store = create_database(index_name="medical-chatbot",chunks=result,embedding_model=embedding)

In [None]:
# Base retriever over the vector store: similarity search with k=3.
similarity_retriever = vector_store.as_retriever(
    search_kwargs={"k":3},
    search_type="similarity",
)

In [63]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [64]:
compressor = LLMChainExtractor.from_llm(llm)

In [65]:
# Wrap the similarity retriever so its hits pass through the LLM-based
# extractor (`compressor`) before being returned.
retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=similarity_retriever,
)

In [66]:
retrieved_docs = retriever.invoke("What is Acne?")

In [67]:
# Show each retrieved (compressed) document with a 1-based counter.
for cnt, doc in enumerate(retrieved_docs, start=1):
    print(f'Result: {cnt}\nContent:\n{doc.page_content}\n\n')

Result: 1
Content:
Acne
Definition
Acne is a common skin disease characterized by
pimples on the face, chest, and back. It occurs when the
pores of the skin become clogged with oil, dead skin
cells, and bacteria. Description
Acne vulgaris, the medical term for common acne, is
the most common skin disease. It affects nearly 17 million
people in the United States. While acne can arise at any
age, it usually begins at puberty and worsens during ado-
lescence. Nearly 85% of people develop acne at some time
between the ages of 12-25 years. Up to 20% of women
develop mild acne. It is also found in some newborns. The sebaceous glands lie just beneath the skin’s sur-
face. They produce an oil called sebum, the skin’s natural
moisturizer. These glands and the hair follicles within
which they are found are called sebaceous follicles. These follicles open onto the skin through pores. At
puberty, increased levels of androgens (male hormones)
cause the glands to produce too much sebum. When
excess 

In [68]:
from langchain.schema.runnable import RunnableParallel,RunnablePassthrough,RunnableLambda

In [69]:
parser = StrOutputParser()

In [70]:
# Prompt for the answer step. It expects two variables:
#   - context:  the text block built from the retrieved documents
#   - question: the user's query
# The adjacent string literals below are concatenated into one template string;
# it sets the assistant persona, the safety rules, and the fallback behaviour
# when the context does not contain the answer.
template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are **MedGuide**, a professional, empathetic medical support chatbot designed "
        "to help users understand health conditions, symptoms, and possible treatments.\n\n"
        
        "You have access to a trusted medical knowledge base and retrieved context from documents.\n"
        "Use this context to provide accurate, well-structured, and concise answers.\n\n"
        
        "⚠️ **Important Rules:**\n"
        "- Do NOT provide a confirmed diagnosis.\n"
        "- Do NOT recommend specific medications or dosages.\n"
        "- Always encourage users to consult a certified healthcare provider for medical advice.\n"
        "- Maintain a supportive and respectful tone.\n\n"
        
        "📚 **Context from medical knowledge base:**\n"
        "{context}\n\n"
        
        "💬 **User Question:** {question}\n\n"
        
        "🧠 **Your Response:**\n"
        "Provide a clear, informative, and compassionate explanation based on the context. "
        "If the information is not found in the context, state that you are unsure and suggest "
        "consulting a medical professional."
    )
)


In [71]:
def format_context(context):
    """Merge retrieved documents into a single context string.

    Args:
        context: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined with a blank line between them
        (an empty string when ``context`` is empty).
    """
    page_texts = [item.page_content for item in context]
    return "\n\n".join(page_texts)

In [72]:
# Branch that turns the incoming query into the prompt's context text:
# retrieve documents, then join their contents with format_context.
context_branch = retriever | RunnableLambda(format_context)

# Fan the query out into the two prompt variables: the query itself is passed
# through unchanged as "question", and the retrieved text becomes "context".
parallel_chain = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": context_branch,
    }
)

In [92]:
chain = parallel_chain | template | llm | parser 

In [93]:
query = "What is austioporosis?"

In [94]:
ans = chain.invoke(query)

In [100]:
import re

def markdown_to_text(md_text: str) -> str:
    """Convert lightly formatted Markdown into plain text.

    Steps, in order:
      1. Literal backslash-n sequences become real newlines (done first so
         list items that follow them are recognised as line starts).
      2. List markers (``-``, ``*``, ``+`` followed by a space) become ``• ``,
         but only where a list item can start: at the beginning of a line,
         right after a ``<br>`` tag, or at the start of a table cell. Hyphens
         inside running text (e.g. ``ado- lescence``) are left untouched.
      3. ``**bold**`` markers are removed.
      4. ``*italic*`` markers are removed; the asterisks must hug the text, so
         free-standing asterisks such as ``2 * 3 * 4`` are preserved.

    Args:
        md_text: Markdown-formatted text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Expand escaped newlines before any line-oriented matching.
    text = md_text.replace("\\n", "\n")
    # Bullets only at list-item positions (line start, after <br>, after "|").
    text = re.sub(
        r'(^[ \t]*|<br\s*/?>[ \t]*|\|[ \t]*)[-*+] ',
        r'\1• ',
        text,
        flags=re.MULTILINE,
    )
    # Remove bold markers.
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    # Remove italic markers; require non-space right inside both asterisks so
    # unrelated asterisks in the text are not paired up and deleted.
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)
    return text.strip()

formatted_text = markdown_to_text(ans)
print(formatted_text)


Osteoporosis (you may have meant “osteoporosis”) is a condition in which the bones become porous, weak, and less dense. Because the bone tissue breaks down faster than it is rebuilt, the skeleton loses strength and is more prone to fractures. Healing after a fracture can also take longer.

Key points from the medical literature

| Aspect | Details |
|--------|---------|
| Who is most affected? | • Women after menopause (typically around age 50) <br>• Older men (often in later life) |
| What happens to the bones? | The bone mass declines, making the bones “very porous and weak.” |
| How is it measured? | A bone‑density test (DXA) gives a T‑score. <br>• A T‑score below –2.5 (shown in the source as “/H110022.5”) indicates that osteoporosis is already present. |
| Why is it a concern? | Weaker bones are more likely to fracture, especially in the hip, spine, and wrist, and fractures can lead to reduced mobility and a longer recovery period. |
| What can increase risk? | • Low calcium or vit