In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Cross-Encoder")
rerank_model = AutoModelForSequenceClassification.from_pretrained("ncbi/MedCPT-Cross-Encoder").to(device)

def rerank_with_medcpt(query, passages):
    inputs = tokenizer(
        [f"[Q] {query} [D] {p}" for p in passages],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = rerank_model(**inputs).logits.squeeze()
        scores = logits.cpu().numpy().tolist()

    return scores

# Step 1: Rerank
scores = rerank_with_medcpt(user_query, retrieved_texts)

# Step 2: Sort by descending score
ranked_results = sorted(
    zip(retrieved_texts, retrieved_metas, scores),
    key=lambda x: x[2],
    reverse=True
)

# Step 3: Show top results
for i, (text, meta, score) in enumerate(ranked_results[:]):
    print(f"\n🔹 Reranked #{i+1} — Score: {score:.4f}")
    print("📄 Metadata:", meta)
    print("📜 Content Preview:", text[:300], "...")

# How many top reranked chunks to include?
top_n = 5

# System prompt
system_prompt = (
    "You are a helpful clinical assistant. Use only the provided context to answer.\n"
    "Always cite the source and page number from the metadata."
)

# Build formatted context
context_blocks = []
for i, (text, meta, score) in enumerate(ranked_results[:top_n]):
    context = f"---\nSource: {meta['source']}, Page: {meta['page']}, Section: {meta.get('section', '')}\n{text.strip()}"
    context_blocks.append(context)

# Combine everything
final_prompt = f"{system_prompt}\n\nUser Query:\n{user_query}\n\nContext:\n" + "\n\n".join(context_blocks)

print("🧾 Final Prompt for LLM:\n")
print(final_prompt)  # truncate preview if too long

In [3]:
!pip install -q sentence-transformers faiss-cpu

In [4]:
!nvidia-smi

Wed Aug  6 17:22:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             11W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
from huggingface_hub import login

login(token=" ")

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")

2025-08-06 17:22:16.934738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754500937.183247      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754500937.263850      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
import json

texts = []
metadatas = []

with open("/kaggle/input/chunks/page_chunks_Pocket_book_of_hospital_care.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        texts.append(obj["content"])
        metadatas.append(obj["metadata"])

In [8]:
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [9]:
import faiss

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("✅ FAISS index has", index.ntotal, "vectors.")

✅ FAISS index has 438 vectors.


In [10]:
import pickle
import faiss

# Save FAISS index
faiss.write_index(index, "faiss_index.index")

# Save texts and metadata (must align with index)
with open("texts.pkl", "wb") as f:
    pickle.dump(texts, f)

with open("metadatas.pkl", "wb") as f:
    pickle.dump(metadatas, f)

In [11]:
# Example user query
user_query = "What are the specific conditions under which a child with pneumonia should not be given intravenous ampicillin and gentamicin?"

# Step 1: Embed the query
query_vector = model.encode([user_query], convert_to_numpy=True)

# Step 2: Search top-k similar chunks
top_k = 10
D, I = index.search(query_vector, k=top_k)

# Step 3: Get top-k texts and metadata
retrieved_texts = [texts[i] for i in I[0]]
retrieved_metas = [metadatas[i] for i in I[0]]

# Step 4: Display
for rank, (text, meta) in enumerate(zip(retrieved_texts, retrieved_metas), 1):
    print(f"\n🔹 Result #{rank}")
    print("📄 Metadata:", meta)
    print("📜 Content Preview:", text[:300], "...")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🔹 Result #1
📄 Metadata: {'source': 'POCKET BOOK OF Hospital care', 'page': 107, 'section': 'SEVERE PNEUMONIA'}
📜 Content Preview: SEVERE PNEUMONIA

SEVERE PNEUMONIA

## Supportive care

* Remove by gentle suction any thick secretions at the entrance to the nasal passages or throat, which the child cannot clear.
* If the child has fever (≥ 39 °C or ≥ 102.2 °F) which appears to be causing distress, give paracetamol.
* If wheeze  ...

🔹 Result #2
📄 Metadata: {'source': 'POCKET BOOK OF Hospital care', 'page': 79, 'section': 'MENINGITIS\n\n* For newborns with any signs of serious bacterial infection or sepsis, give ampicillin (or penicillin) and gentamicin as first-line antibiotic treatment (for dosages see pp. 69–72)\n* If at greater risk of staphylococcus infection (extensive skin pustules, abscess or omphalitis in addition to signs of sepsis), give IV cloxacillin and gentamicin.\n* The most serious bacterial infections in newborns should be treated with antibiotics for at least 7–10 day

In [12]:
!pip install -q google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import os
import google.generativeai as genai

os.environ["GOOGLE_API_KEY"] = " "
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [12]:
# Format LLM prompt
context_blocks = []
for i in range(len(retrieved_texts)):
    context_blocks.append(
        f"---\nSource: {retrieved_metas[i]['source']}, Page: {retrieved_metas[i]['page']}\n{retrieved_texts[i].strip()}"
    )

system_prompt = (
    "You are a helpful clinical assistant. Use only the provided context to answer.\n"
    "Always properly cite the source and page number from the metadata adn the provided context. The response should be easly readable by a person."
)

final_prompt = f"{system_prompt}\n\nUser Query:\n{user_query}\n\nContext:\n" + "\n\n".join(context_blocks)

In [13]:
model_ai = genai.GenerativeModel("gemini-2.5-flash")

response = model_ai.generate_content(final_prompt)

print("🤖 Gemini Answer:\n")
print(response.text)

🤖 Gemini Answer:

A child with pneumonia should not be given intravenous ampicillin and gentamicin under the following specific conditions:

1.  **Suspected Staphylococcal Pneumonia with No Improvement after 48 Hours:** If a child with pneumonia does not show signs of improvement within 48 hours and staphylococcal pneumonia is suspected, treatment should switch to cloxacillin and gentamicin. Signs suggesting staphylococcal pneumonia include rapid clinical deterioration despite treatment, a pneumatocoele or pneumothorax with effusion on chest X-ray, numerous Gram-positive cocci in a sputum smear, heavy growth of *S. aureus* in cultured sputum or empyema fluid, or the presence of septic skin pustules.
    *   Source: POCKET BOOK OF Hospital care, Page: 106, 107

2.  **Failure of First-Line Treatment:** In cases where the initial first-line treatment (intravenous ampicillin and gentamicin) fails, ceftriaxone should be used as an alternative.
    *   Source: POCKET BOOK OF Hospital care, P

In [14]:
# rag_incremental_indexer.py

import json
import pickle
import faiss
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

class RAGIndexer:
    def __init__(self, 
                 faiss_index_path="faiss_index.index", 
                 texts_path="texts.pkl", 
                 metadatas_path="metadatas.pkl", 
                 embedding_model_name="NeuML/pubmedbert-base-embeddings",
                 gemini_model_name="gemini-2.5-flash"):

        self.faiss_index_path = faiss_index_path
        self.texts_path = texts_path
        self.metadatas_path = metadatas_path
        self.gemini_model_name = gemini_model_name

        # Load embedding model
        self.model = SentenceTransformer(embedding_model_name)

        # Try to load existing index and data
        try:
            self.index = faiss.read_index(self.faiss_index_path)
            with open(self.texts_path, "rb") as f:
                self.texts = pickle.load(f)
            with open(self.metadatas_path, "rb") as f:
                self.metadatas = pickle.load(f)
            length = len(self.metadatas)
            print(f"✅ Loaded existing index and data {len(self.metadatas)}")
        except:
            self.index = None
            self.texts = []
            self.metadatas = []
            print("⚠️ Starting with empty index.")

    def update_with_jsonl(self, jsonl_path):
        # Load new JSONL file
        new_texts = []
        new_metas = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                new_texts.append(obj["content"])
                new_metas.append(obj["metadata"])

        # Embed new content
        new_embeddings = self.model.encode(new_texts, convert_to_numpy=True)

        # Initialize index if needed
        if self.index is None:
            dim = new_embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dim)

        # Add to index and local lists
        self.index.add(new_embeddings)
        self.texts.extend(new_texts)
        self.metadatas.extend(new_metas)

        # Save everything back
        faiss.write_index(self.index, self.faiss_index_path)
        with open(self.texts_path, "wb") as f:
            pickle.dump(self.texts, f)
        with open(self.metadatas_path, "wb") as f:
            pickle.dump(self.metadatas, f)

        print(f"✅ Added {len(new_texts)} chunks from {jsonl_path}")

    def search(self, query, top_k=10):
        # Encode query
        query_vec = self.model.encode([query], convert_to_numpy=True)

        # Search
        D, I = self.index.search(query_vec, top_k)

        return [(self.texts[i], self.metadatas[i], D[0][rank]) for rank, i in enumerate(I[0])]

    def generate_answer(self, query, top_k=3, api_key=None):
        if api_key:
            genai.configure(api_key=api_key)

        model = genai.GenerativeModel(self.gemini_model_name)
        results = self.search(query, top_k=top_k)

        context_blocks = []
        for text, meta, score in results:
            context_blocks.append(
                f"Page: {meta['page']}, Section: {meta.get('section', '')}\n{text.strip()}"
            )

        system_prompt = (
            "You are a helpful clinical assistant. Use only the provided context to answer.\n"
            "Always cite the source and page number from the metadata."
        )

        final_prompt = f"{system_prompt}\n\nUser Query:\n{query}\n\nContext:\n" + "\n\n".join(context_blocks)

        response = model.generate_content(final_prompt)
        return response.text


# Example usage:
# indexer = RAGIndexer()
# indexer.update_with_jsonl("value_plus_md_chunks_NEW.jsonl")
# answer = indexer.generate_answer("What is the dosage of ampicillin for a neonate?", api_key="your-gemini-api-key")
# print(answer)

In [None]:

indexer = RAGIndexer()
answer = indexer.update_with_jsonl("/kaggle/input/chunks/page_chunks_Integrated_management.jsonl")
chat = indexer.generate_answer(
    query = "A 10-month-old infant is brought to the clinic with a cough that has lasted for 5 days. The breathing rate is 55 breaths per minute, and the child is observed to have chest indrawing, but no stridor. The child's mother states that the child is also lethargic. What is the correct classification for this child's condition, the immediate course of treatment, and what is the required follow-up visit schedule?",
    top_k = 5,
    api_key = " "  # Replace with your actual API key
)

print(chat)

✅ Loaded existing index and data 438


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Added 80 chunks from /kaggle/input/chunks/page_chunks_Integrated_management.jsonl


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Based on the provided context:

**1. Correct classification for this child's condition:**
The child is 10 months old.
*   Breathing rate is 55 breaths per minute, which is considered fast breathing for a child aged 2–11 months (≥ 50 breaths per minute). (Page 76, 6)
*   The child has chest indrawing. (Page 6)
*   The context states that "Chest indrawing or Fast breathing" classifies as **PNEUMONIA**. (Page 6)
Although the child is lethargic, the provided context does not explicitly define "lethargy" as a "general danger sign" to classify the condition as "SEVERE PNEUMONIA OR VERY SEVERE DISEASE". Therefore, based strictly on the explicit criteria in the provided tables, the classification is PNEUMONIA.

**2. Immediate course of treatment:**
For a child classified with PNEUMONIA:
*   Give oral Amoxicillin for 5 days. (Page 6)
*   Soothe the throat and relieve the cough with a safe remedy. (Page 6)
*   Advise the mother when to return immediately. (Page 6)

**3. Required follow-up visit 

In [49]:
from collections import defaultdict
from google.generativeai import GenerativeModel

chat_memory = defaultdict(list)

In [50]:
import numpy as np

In [51]:
def is_follow_up(query, last_query, api_key):
    prompt = f"""
Determine if the second question is a follow-up to the first.

Q1: "{last_query}"
Q2: "{query}"

Respond only with YES or NO.
"""
    checker = GenerativeModel("gemini-2.5-flash")
    response = checker.generate_content(prompt)
    return "yes" in response.text.strip().lower()

In [52]:
def reformulate_query(query, last_query, api_key):
    prompt = f"""
Rephrase the second question so that it becomes a standalone version using context from the first.

Q1: "{last_query}"
Q2: "{query}"

Standalone version:
"""
    model = GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)
    return response.text.strip().strip('"')

In [53]:
def retrieve_chunks(query, indexer, top_k=5):
    embedding = indexer.model.encode([query])[0]
    _, I = indexer.index.search(np.array([embedding]), top_k)
    chunks = [indexer.texts[i] for i in I[0]]
    metadatas = [indexer.metadatas[i] for i in I[0]]

    # print("\n🔍 Retrieved Chunks:")
    # for idx, (chunk, meta) in enumerate(zip(chunks, metadatas), 1):
    #     print(f"\n--- Chunk #{idx} ---")
    #     print(f"📄 Source: {meta.get('source')} | Page: {meta.get('page')} | Section: {meta.get('section', 'N/A')}")
    #     print(f"📚 Content:\n{chunk[:500]}...")  # trim long text
    return chunks, metadatas

In [54]:
def generate_response(query, chunks, history, api_key):
    history_text = ""
    for turn in history[-5:]:
        history_text += f"User: {turn['query']}\nAssistant: {turn['answer']}\n"

    context_text = "\n\n".join(chunks)

    prompt = f"""
You are a helpful medical assistant. Use only the provided context below and the chat history (if any) to answer the current question.
Also you have to provide the source and the accurate page from the context that you are using to answer.

{history_text}
Context:
{context_text}

User: {query}
Assistant:"""

    model = GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)
    return response.text

In [64]:
def log_turn(session_id, query, answer):
    chat_memory[session_id].append({"query": query, "answer": answer})


In [70]:
def conversational_rag_pipeline(query, session_id, indexer, api_key):
    history = chat_memory[session_id]
    
    # Step 1: Handle follow-up logic
    if history:
        last_query = history[-1]["query"]
        print(last_query)
        if is_follow_up(query, last_query, api_key):
            print("yes")
            chat_memory[session_id] = []
            rewritten_query = reformulate_query(query, last_query, api_key)
        else:
            rewritten_query = query
            history = []  # discard history for unrelated question
    else:
        rewritten_query = query

    # Step 2: Retrieve and generate
    chunks, metas = retrieve_chunks(rewritten_query, indexer)
    answer = generate_response(query, chunks, history, api_key)

    # Step 3: Store the turn
    log_turn(session_id, query, answer)
    
    return answer

In [71]:
chat_memory["demo"] = []

In [None]:
# First turn
response_1 = conversational_rag_pipeline(
    query="What is the dosage of ampicillin for neonates?",
    session_id="demo",
    indexer=indexer,
    api_key=" "
)
print("❓ Q1 Answer:\n", response_1)

# Follow-up turn
response_2 = conversational_rag_pipeline(
    query="And what if the child is also severely malnourished? then what wil be the dosage of ampicillin?",
    session_id="demo",
    indexer=indexer,
    api_key=" "  # Replace with your actual API key
)
print("\n❓ Q2 Answer:\n", response_2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

❓ Q1 Answer:
 The dosage of ampicillin for young infants (neonates) is 50 mg per kg.

Source: TREAT AND COUNSEL, TREAT THE YOUNG INFANT, GIVE FIRST DOSE OF INTRAMUSCULAR ANTIBIOTICS, page 69.
What is the dosage of ampicillin for neonates?
yes


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


❓ Q2 Answer:
 If the child is severely malnourished and has complications (such as hypoglycaemia, hypothermia, lethargy, or other medical complications), the dosage for ampicillin is 50 mg per kg, given intramuscularly (IM) or intravenously (IV) every 6 hours for 2 days.

Source: INFECTION, 7.4.5 Infection, Choice of broad-spectrum antibiotics, page 207.


In [None]:
response_3 = conversational_rag_pipeline(
    query="what are the steps and process of Neonatal resuscitation",
    session_id="demo",
    indexer=indexer,
    api_key=" "
)
print("\n❓ Q3 Answer:\n", response_3)

And what if the child is also severely malnourished? then what wil be the dosage of ampicillin?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


❓ Q3 Answer:
 The steps and process of Neonatal resuscitation are as follows:

**Initial Steps:**
*   Dry the infant immediately with a clean cloth.
*   Keep warm by skin-to-skin contact and covered.

**Assessment:**
*   Look for: Breathing or crying, good muscle tone or vigorous movements.
*   If not present:
    *   Stimulate by rubbing the back 2 to 3 times.
    *   Suction only if there was meconium-stained liquor and the infant is not crying and moving limbs, or if the mouth or nose is full of secretions with clear amniotic fluid. (Avoid routine suctioning or deep suctioning).

**If Not Breathing or Gasping:**
*   CALL FOR HELP.
*   Transfer to newborn resuscitation area.
*   Position the head/neck slightly extended to open the airway.
*   Start positive pressure ventilation with a mask and self-inflating bag within 1 minute of birth. (Use air for infants > 32 weeks gestation, or 30% oxygen for very preterm infants if possible).

**A. Airway:**
*   Keep the infant's head in a sli

In [None]:
# First turn
response_4 = conversational_rag_pipeline(
    query="when to consider discontinuing after effective resuscitation efforts",
    session_id="demo",
    indexer=indexer,
    api_key=" "
)
print("❓ Q4 Answer:\n", response_4)


what are the steps and process of Neonatal resuscitation
yes


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

❓ Q1 Answer:
 It is appropriate to consider discontinuing after effective resuscitation efforts if:
*   The infant is not breathing and heartbeat is not detectable beyond 10 minutes.
*   If there is no spontaneous breathing and the heart rate remains below 60/min after 20 minutes of effective resuscitation.

**Source:** 3.2.2 Cessation of resuscitation, page 50.


In [None]:
# First turn
response_4 = conversational_rag_pipeline(
    query=" the air escaping from the mask what to do",
    session_id="demo",
    indexer=indexer,
    api_key=" "
)
print("❓ Q4 Answer:\n", response_4)

In [27]:
chat_log = defaultdict(list)

In [28]:
def log_turn(session_id, query, answer):
    chat_log[session_id].append({
        "query": query,
        "answer": answer
    })

In [29]:
active_context = defaultdict(list)

In [30]:
def update_context(session_id, query, api_key):
    history = chat_log[session_id]
    
    if not history:
        return [], query

    last_query = history[-1]["query"]
    if is_follow_up(query, last_query, api_key):
        rewritten = reformulate_query(query, last_query, api_key)
        return active_context[session_id], rewritten
    else:
        # Not a follow-up → but keep full history safe
        active_context[session_id] = []
        return [], query

In [31]:
def conversational_rag_pipeline(query, session_id, indexer, api_key):
    context_turns, rewritten_query = update_context(session_id, query, api_key)

    # Step 2: Retrieve relevant knowledge
    chunks, metas = retrieve_chunks(rewritten_query, indexer)

    # Step 3: Generate response using only the context window
    answer = generate_response(query, chunks, context_turns, api_key)

    # Step 4: Store full history
    log_turn(session_id, query, answer)

    # Step 5: Update context window if it's a follow-up
    if context_turns:
        active_context[session_id].append({"query": query, "answer": answer})

    return answer