In [3]:
%pwd

'c:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\research'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Extract text from PDF
def load_pdf_files(path):
    loader = DirectoryLoader(
        path,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [None]:
# extracted_data = load_pdf_files("C:/Utsav Jain/SelfProjects/MediChatProject/MediChat/data")

import os
import joblib

# CACHE_PATH = "extracted_data.joblib"
CACHE_PATH = "C:/Utsav Jain/SelfProjects/MediChatProject/MediChat/research/extracted_data.joblib"

if os.path.exists(CACHE_PATH):
    extracted_data = joblib.load(CACHE_PATH)
    print("Loaded from cache")
else:
    extracted_data = load_pdf_files("C:/Utsav Jain/SelfProjects/MediChatProject/MediChat/data")
    joblib.dump(extracted_data, CACHE_PATH)
    print("Extracted and cached")

In [7]:
len(extracted_data)

4505

In [8]:
extracted_data[400]

Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf', 'total_pages': 4505, 'page': 400, 'page_label': '371'}, page_content='OTHER\n‘‘Aortic Stenosis.’’Ochsner Heart and VascularInstitute.\n<http://www.ochsner.org/pedcard/as.htm>.\nRahimtoola, Aly. ‘‘Aortic Stenosis.’’Loyola University\nHealth System Page.<http://www.luhs.org>.\nJeanine Barone, Physiologist\nApgar testing\nDefinition\nApgar testing is the assessment of the newborn\nrating color, heart rate, stimulus response, muscle\ntone, and respirations on a scale of zero to two, for a\nmaximum possible score of 10. It is performed twice,\nfirst at one minute and then again at five minutes after\nbirth.\nPurpose\nApgar scoring was originally developed in the\n1950s by the anesthesiologist Virginia Apgar to assist\npractiti

In [16]:
# import joblib

# # Save
# joblib.dump(extracted_data, "extracted_data.joblib")

# # Load
# # extracted_data = joblib.load("extracted_data.joblib")

['extracted_data.joblib']

In [9]:
# filtering only the required data
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of document objects, return a new list of Docment objects
    containing only the 'source' in metadata and the original page_content
    """

    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        page_num = doc.metadata.get("page")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src, "page": page_num}
            )
        )
    
    return minimal_docs

In [10]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [11]:
minimal_docs[400]

Document(metadata={'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf', 'page': 400}, page_content='OTHER\n‘‘Aortic Stenosis.’’Ochsner Heart and VascularInstitute.\n<http://www.ochsner.org/pedcard/as.htm>.\nRahimtoola, Aly. ‘‘Aortic Stenosis.’’Loyola University\nHealth System Page.<http://www.luhs.org>.\nJeanine Barone, Physiologist\nApgar testing\nDefinition\nApgar testing is the assessment of the newborn\nrating color, heart rate, stimulus response, muscle\ntone, and respirations on a scale of zero to two, for a\nmaximum possible score of 10. It is performed twice,\nfirst at one minute and then again at five minutes after\nbirth.\nPurpose\nApgar scoring was originally developed in the\n1950s by the anesthesiologist Virginia Apgar to assist\npractitioners attending a birth in deciding whether or\nnot a newborn was in need of resuscitation. Using a\nscoring method fosters consistency and standardiza-\ntion among different practitioners. A Februar

In [12]:
# Split the documents into smaller chunks

def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

In [15]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 40000


In [17]:
# embeddings

from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [18]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [19]:
vector = embedding.embed_query("hello")
vector

[-0.06277173012495041,
 0.0549587644636631,
 0.052164845168590546,
 0.08579001575708389,
 -0.0827489048242569,
 -0.07457298785448074,
 0.06855474412441254,
 0.018396401777863503,
 -0.08201131224632263,
 -0.03738486021757126,
 0.01212488953024149,
 0.003518301760777831,
 -0.004134277813136578,
 -0.043784454464912415,
 0.021807288751006126,
 -0.005102697294205427,
 0.019546590745449066,
 -0.04234873875975609,
 -0.11035966873168945,
 0.005424531176686287,
 -0.05573474243283272,
 0.028052419424057007,
 -0.023158712312579155,
 0.028481345623731613,
 -0.05370963364839554,
 -0.052601564675569534,
 0.033939216285943985,
 0.04538865014910698,
 0.0237184539437294,
 -0.0731208324432373,
 0.054777760058641434,
 0.017047306522727013,
 0.08136036992073059,
 -0.00286271795630455,
 0.011958097107708454,
 0.07355853170156479,
 -0.09423743933439255,
 -0.0813620537519455,
 0.040015410631895065,
 0.0006922061438672245,
 -0.013393313623964787,
 -0.05453810095787048,
 0.005151402670890093,
 -0.0261398162692

In [20]:
len(vector)

384

In [22]:
from dotenv import load_dotenv
import os
load_dotenv()

os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")




In [23]:
# pinecone for vector db

from pinecone import Pinecone
pinecone_api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)
pc

<pinecone.pinecone.Pinecone at 0x1a097755840>

In [25]:
from pinecone import ServerlessSpec

index_name = 'medical-chatbot'

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384, 
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)



In [None]:
# # Storing the vector

# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_documents(
#     documents=texts_chunk,
#     embedding=embedding,
#     index_name=index_name
# )

In [None]:
# Uploading the vector
from langchain_pinecone import PineconeVectorStore

# Check if index has vectors
index = pc.Index(index_name)
stats = index.describe_index_stats()

if stats['total_vector_count'] == 0:
    # First time — upload vectors
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name
    )
    print("Vectors uploaded")
else:
    # Already populated — just connect
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding
    )
    print(f"Connected to existing index {index_name} with ({stats['total_vector_count']} vectors)")

In [27]:
stats = index.describe_index_stats()
stats['total_vector_count']

40000

## Add more data to the existing Pinecone index

In [28]:
new_data = Document(
    page_content="This is the demo data. It has no relation with medical science.",
    metadata={"source":"DemoData", "page": 999999}
)

In [30]:
docsearch.add_documents(documents=[new_data])


['c671a848-571b-4e38-9ce2-c8b9c5c64155']

# LLM Connection

In [33]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [34]:
retrieved_docs = retriever.invoke("what is Acne?")
retrieved_docs

[Document(id='cde2420d-13c0-4e6e-8584-ca86b7a962a5', metadata={'page': 55.0, 'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='27ed2640-3ef2-48e1-8460-e245d0f97339', metadata={'page': 55.0, 'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf'}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover (death and replacement) of skin cells.\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous glands\nbecome inflamed. (Photograph by Biophoto Associates, Photo'),
 Document(id='df8423ea-87d3-4b3f-9958-853f2efffe98', metadata=

In [38]:
# groq model with chat groq
from langchain_groq import ChatGroq

chatModel = ChatGroq(model="groq/compound")


In [37]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    """You are MediChat, a medical information assistant. Your role is to help users understand diseases, symptoms, treatments, and health precautions using the provided context.
    RESPONSE GUIDELINES:
    - Use ONLY the retrieved context to answer questions
    - Keep responses to 3 sentences maximum
    - Be clear, accurate, and concise
    - If the context doesn't contain the answer, say: "I don't have enough information to answer that question. Please consult a healthcare provider."

    MEDICAL GUIDELINES:
    - Never diagnose conditions — only provide general information
    - Never prescribe or recommend specific medications or dosages
    - Never provide emergency medical advice — direct users to call emergency services for urgent situations
    - Always encourage users to consult a qualified healthcare provider for personal medical concerns
    - Present information objectively without causing unnecessary alarm

    BOUNDARIES:
    - Do not answer questions unrelated to health or medicine
    - Do not provide mental health crisis intervention — refer to appropriate helplines
    - Do not make claims beyond what the context supports

    DISCLAIMER:
    You are an informational tool only. You do not replace professional medical advice, diagnosis, or treatment.
    """
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [39]:
# creating the chain
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [42]:
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])
print(response)

Acne is a common skin disorder that produces pimples, especially on the face, chest, and back. It develops when pores or hair follicles become blocked by sebum (oil), dead skin cells, and bacteria, leading to inflammation of the sebaceous glands. The condition is medically referred to as acne vulgaris.
{'input': 'what is Acne?', 'context': [Document(id='cde2420d-13c0-4e6e-8584-ca86b7a962a5', metadata={'page': 55.0, 'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'), Document(id='27ed2640-3ef2-48e1-8460-e245d0f97339', metadata={'page': 55.0, 'source': 'C:\\Utsav Jain\\SelfProjects\\MediChatProject\\MediChat\\data\\Medical_book.pdf'}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTr

In [43]:
response = rag_chain.invoke({"input": "Tell me about Appendectomy in detail"})
print(response["answer"])

Appendectomy is the surgical removal of the appendix—a worm‑shaped hollow pouch attached to the cecum—performed primarily to treat appendicitis. It can be done via a traditional open approach (a single incision of up to about 7.6 cm in the lower right abdomen) or laparoscopically (four small incisions, each about 2.5 cm, one near the umbilicus, with the appendix separated from surrounding tissue and the cecum closed). Discuss the appropriate technique, preparation, and any precautions with a qualified healthcare provider.
