<a href="https://colab.research.google.com/github/Shruti022/Healthcare-Chatbot/blob/main/Copy_of_LLM_Project_pivot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project Phase 1: Stepwise API Exploration

Step 1: Import Libraries


In [125]:
!pip install -q requests pandas streamlit pyngrok faiss-cpu sentence-transformers numpy

import requests
import pandas as pd
import json
import hashlib
from datetime import datetime
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

In [126]:
# Mount Google Drive at /content/drive; the data paths used by the scripts below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [127]:
# Secure KEY INPUT
import os
import getpass

# Securely capture the key: getpass keeps the typed/pasted characters off the screen.
# Leading/trailing whitespace picked up by a copy-paste is stripped so it cannot
# end up inside the stored key.
key_input = getpass.getpass("üîë Enter Gemini API Key (Invisible Input): ").strip()

# Store the key in the environment BEFORE reporting success, so the confirmation
# message below is only printed once the variable really is set. run_bot.py reads
# this variable at import time.
os.environ["GEMINI_API_KEY"] = key_input

# Heuristic sanity check only: the key is stored either way.
if not key_input.startswith("AIza"):
    print("‚ö†Ô∏è Warning: Key might be invalid (usually starts with 'AIza').")
else:
    print("‚úÖ API Key captured securely in Environment Variable.")

üîë Enter Gemini API Key (Invisible Input): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
‚úÖ API Key captured securely in Environment Variable.


In [128]:
%%writefile build_embeddings.py
import pandas as pd
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer

# === REAL PATH (from readlink) ===
BASE = "/content/drive/.shortcut-targets-by-id/1-SiVJhXHTHtDYSrPmW_0VfuP7gSTePcj/data"

# ---------------------------------------------
# Load Data
# ---------------------------------------------
df = pd.read_csv(f"{BASE}/clinical_trials_diabetes_full.csv")

# Normalise the status text so the exclusion list below matches regardless of
# the casing or padding used in the CSV.
df["status"] = df["status"].astype(str).str.strip().str.title()
bad_status = ["Terminated", "Withdrawn", "Suspended", "No Longer Available", "Unknown"]
df_clean = df[~df["status"].isin(bad_status)].copy()


def _cell_text(value) -> str:
    """Return a CSV cell as stripped text, mapping missing values (NaN/None) to ''.

    Plain str() would turn a missing cell into the literal string "nan", which
    would then be embedded and shown as a trial title.
    """
    if pd.isna(value):
        return ""
    return str(value).strip()


# ---------------------------------------------
# Chunking
# ---------------------------------------------
# chunks[i] is the text that gets embedded; chunk_map[i] is the metadata for the
# same row, so position i in the FAISS index maps back to chunk_map[i].
chunks = []
chunk_map = []

for idx, row in df_clean.iterrows():
    title = _cell_text(row.get("brief_title", ""))
    summary = _cell_text(row.get("brief_summary", ""))

    # Skip rows whose summary is missing or too short to be useful.
    if len(summary) < 20:
        continue

    text = f"Title: {title}\nSummary: {summary}"
    chunks.append(text)

    chunk_map.append({
        "nct_id": row["nct_id"],
        "title": title,
        "text": text,
        "status": row["status"]
    })

print(f"Created {len(chunks)} chunks.")

# Without chunks, encode() returns nothing usable and embeddings.shape[1] below
# would fail with an opaque error; stop with a clear message instead.
if not chunks:
    raise SystemExit("No chunks were created; check the input CSV and the status filter.")

# ---------------------------------------------
# Embeddings
# ---------------------------------------------
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(chunks, batch_size=64, show_progress_bar=True)
# Cast once so the saved .npy file and the FAISS index contain the same float32 vectors.
embeddings = np.asarray(embeddings, dtype="float32")

np.save(f"{BASE}/clinical_trials_diabetes_full_embeddings.npy", embeddings)
print("Saved clinical_trials_diabetes_full_embeddings.npy")

# ---------------------------------------------
# Save chunk map
# ---------------------------------------------
with open(f"{BASE}/clinical_trials_diabetes_full_chunk_map.json", "w") as f:
    json.dump(chunk_map, f)

print("Saved clinical_trials_diabetes_full_chunk_map.json")

# ---------------------------------------------
# Build & Save FAISS
# ---------------------------------------------
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
faiss.write_index(index, f"{BASE}/clinical_trials_diabetes_full_faiss.index")

print("Saved clinical_trials_diabetes_full_faiss.index")
print("‚úÖ Embedding build COMPLETE.")


Overwriting build_embeddings.py


In [129]:
# Run the builder script written by the previous cell in a separate Python process.
# It saves the embeddings array, the chunk map JSON, and the FAISS index under its BASE directory.
!python build_embeddings.py

2025-11-27 04:03:00.945976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764216180.977338   58701 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764216180.989850   58701 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764216181.006639   58701 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764216181.006665   58701 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764216181.006669   58701 computation_placer.cc:177] computation placer alr

In [130]:
%%writefile utils.py
import json
import hashlib
from datetime import datetime

import faiss
from sentence_transformers import SentenceTransformer

# --- Confidence score from distance ---

def calculate_confidence_score(distance: float, normalization_factor: float = 1.0) -> float:
    """Map a distance to a confidence score in (0, 1]; closer = higher confidence.

    The score is ``normalization_factor / (normalization_factor + distance)``,
    so a distance of 0 yields 1.0 and larger distances decay towards 0.

    Args:
        distance: Distance between query and result. Negative values (which can
            only come from floating-point round-off) are treated as 0.
        normalization_factor: Distance at which the score equals 0.5.

    Returns:
        The confidence score as a Python float.
    """
    # Clamp at zero: a negative distance would push the score above 1, and a
    # value of exactly -normalization_factor would divide by zero.
    safe_distance = max(float(distance), 0.0)
    return normalization_factor / (normalization_factor + safe_distance)


# --- Load pre-built index + chunk map ---

def load_data_and_index(chunk_map_path: str, faiss_path: str):
    """Load the pre-built chunk map, FAISS index, and embedding model.

    Args:
        chunk_map_path: Path to the JSON file holding the list of chunk metadata.
        faiss_path: Path to the serialized FAISS index.

    Returns:
        A tuple ``(embed_model, index, chunk_map)``.

    Raises:
        ValueError: If the index and the chunk map hold a different number of
            entries. Retrieval looks chunks up by index position, so a mismatch
            would return the wrong trial or fail later with an IndexError.
    """
    print("‚è≥ Loading pre-built RAG index...")

    # Explicit encoding so the result does not depend on the platform default.
    with open(chunk_map_path, "r", encoding="utf-8") as f:
        chunk_map = json.load(f)

    index = faiss.read_index(faiss_path)

    if index.ntotal != len(chunk_map):
        raise ValueError(
            f"FAISS index has {index.ntotal} vectors but chunk map has "
            f"{len(chunk_map)} entries; rebuild both from the same run."
        )

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")

    print(f"‚úÖ RAG Index Ready: {index.ntotal} vectors loaded.")
    return embed_model, index, chunk_map


# --- Provenance logging ---

def log_provenance_step(agent_name: str, input_data, output_data, detail=None,
                        model_version: str = "gemini-2.0-flash"):
    """Create a log entry describing a single agent step.

    Args:
        agent_name: Name of the agent that produced the step.
        input_data: Whatever the agent received; stored as-is.
        output_data: Whatever the agent produced; stored as-is.
        detail: Optional extra information; an empty dict is stored when omitted.
        model_version: Model identifier recorded with the entry. Defaults to the
            previously hard-coded value, so existing callers are unaffected.

    Returns:
        A dict with the keys ``timestamp`` (ISO-8601 local time), ``agent``,
        ``input``, ``output``, ``detail`` and ``model_version``.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "agent": agent_name,
        "input": input_data,
        "output": output_data,
        "detail": detail or {},
        "model_version": model_version,
    }
    return log_entry


# --- Reproducibility hash ---

def generate_reproducibility_hash(conversation_history, corpus_version: str = "v1.0"):
    """Return a deterministic MD5 hex digest identifying a conversation.

    The digest covers the corpus version followed by every turn's ``query``
    value (an empty string when the key is absent), all joined with ``|``.
    """
    joined_queries = "|".join(map(lambda turn: turn.get("query", ""), conversation_history))
    digest = hashlib.md5()
    digest.update(f"{corpus_version}|{joined_queries}".encode("utf-8"))
    return digest.hexdigest()


Overwriting utils.py


In [140]:
%%writefile run_bot.py
import json
import re
import os
from typing import List, Dict, Any
import numpy as np
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

from utils import (
    load_data_and_index,
    calculate_confidence_score,
)

# =============================
# Gemini API Config
# =============================
# Strip so whitespace picked up while pasting the key cannot reach the API client.
API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()
if not API_KEY:
    raise ValueError("‚ùå Missing GEMINI_API_KEY environment variable.")

genai.configure(api_key=API_KEY)
gemini_model = genai.GenerativeModel("models/gemini-2.0-flash")


# =============================
# Paths to FAISS + chunk data
# =============================
# The default is the Drive folder that build_embeddings.py writes to. Set the
# TRIALS_DATA_DIR environment variable to load the artifacts from elsewhere.
DATA_DIR = os.environ.get(
    "TRIALS_DATA_DIR",
    "/content/drive/.shortcut-targets-by-id/1-SiVJhXHTHtDYSrPmW_0VfuP7gSTePcj/data",
)
CHUNK_PATH = f"{DATA_DIR}/clinical_trials_diabetes_full_chunk_map.json"
FAISS_PATH = f"{DATA_DIR}/clinical_trials_diabetes_full_faiss.index"

# Loaded once at import time and shared by the agents defined below.
embed_model, faiss_index, chunk_map = load_data_and_index(CHUNK_PATH, FAISS_PATH)


# =============================
# Tone Manager
# =============================
class ToneManager:
    """Keeps the audience tone that is interpolated into the summarizer prompt."""

    # Stored on the class itself, so every reader of ToneManager.tone sees the same value.
    tone = "professional"

    @classmethod
    def set_tone(cls, tone):
        """Replace the class-level tone with the given value."""
        setattr(cls, "tone", tone)


# =============================
# Query Parser
# =============================
class QueryParser:
    """Wraps raw user text in the dict shape the retriever consumes."""

    def parse(self, text):
        """Return a parse result for ``text``.

        No analysis is performed: the intent is always ``"trial_search"`` and the
        query is always flagged as diabetes-related.
        """
        parsed = dict(intent="trial_search", query=text)
        parsed["is_diabetes_related"] = True
        return parsed


# =============================
# Retriever (FAISS)
# =============================
class RetrievalAgent:
    """Nearest-neighbour lookup of trial chunks using a FAISS index."""

    def __init__(self, embed_model, index, chunk_map):
        """Store the embedding model, the FAISS index, and the parallel chunk metadata list."""
        self.embed_model = embed_model
        self.index = index
        self.chunk_map = chunk_map

    def retrieve(self, parsed, top_k=5):
        """Return up to ``top_k`` trials closest to ``parsed["query"]``.

        Args:
            parsed: Dict with a ``"query"`` string (as produced by QueryParser).
            top_k: Number of neighbours requested from the index.

        Returns:
            ``{"trials": [...]}`` where each trial carries ``nct_id``, ``title``,
            ``text``, ``status`` and a ``confidence`` derived from the distance.
            The list is shorter than ``top_k`` when the index has fewer matches.
        """
        q = parsed["query"]
        q_emb = self.embed_model.encode([q])
        distances, indices = self.index.search(q_emb.astype("float32"), top_k)

        trials = []
        for distance, raw_idx in zip(distances[0], indices[0]):
            idx = int(raw_idx)
            # FAISS pads missing results with label -1. Indexing the list with -1
            # would silently return the LAST chunk as if it were a match, so skip
            # any label that does not point into the chunk map.
            if idx < 0 or idx >= len(self.chunk_map):
                continue

            item = self.chunk_map[idx]
            conf = calculate_confidence_score(distance)

            trials.append({
                "nct_id": item["nct_id"],
                "title": item["title"],
                "text": item["text"],
                "status": item["status"],
                "confidence": conf
            })
        return {"trials": trials}


# =============================
# Trial Summarizer (Improved Structure)
# =============================
class TrialSummarizer:
    """Turns retrieved trials into structured summaries via a generative model."""

    def __init__(self, model):
        """Store the model; it must expose ``generate_content(prompt)``."""
        self.model = model

    def summarize_trial(self, t):
        """Summarize one trial dict and return the model's text, stripped.

        Args:
            t: Trial dict with ``nct_id``, ``title``, ``status``, ``text`` and
                ``confidence`` (a 0-1 float shown to the model as a percentage).

        Returns:
            The generated summary, or a short plain-text notice for this trial
            when the response carries no text.
        """
        confidence_pct = round(t["confidence"] * 100)

        prompt = f"""
You are summarizing a diabetes clinical trial for {ToneManager.tone} audience.

NCT ID: {t['nct_id']}
Title: {t['title']}
Status: {t['status']}
Relevance Score: {confidence_pct}%

ClinicalTrials.gov Summary:
{t['text']}

Write exactly this structure:

üìå {t['nct_id']} ‚Äî {t['title']}
Status: {t['status']} | Relevance: {confidence_pct}%

Abstract:
‚Ä¢ Rewrite purpose, population, and intervention (2‚Äì3 sentences)

Key Findings:
‚Ä¢ If results publicly posted ‚Üí summarize outcomes
‚Ä¢ If not posted ‚Üí infer likely findings based on study goal
(No invented numbers)

Takeaway:
‚Ä¢ Single sentence ‚Äî evidence insight but NO medical orders

Keep it short. No disclaimers.
"""

        res = self.model.generate_content(prompt)
        try:
            summary_text = res.text
        except ValueError:
            # The .text accessor raises ValueError when the response holds no text
            # part (for example a blocked candidate). Degrade to a notice for this
            # one trial so the other trials in the batch are still summarized.
            return (
                f"{t['nct_id']} - {t['title']}\n"
                f"Status: {t['status']} | Relevance: {confidence_pct}%\n\n"
                "Summary unavailable: the model returned no text for this trial."
            )
        return summary_text.strip()

    def summarize(self, trials):
        """Summarize every trial and join the results with a horizontal-rule separator."""
        return "\n\n---\n\n".join(self.summarize_trial(t) for t in trials)


# =============================
# Safety Filter
# =============================
class SafetyFilter:
    """Replaces output that contains directive medical phrasing with a notice."""

    # Whole-word, case-insensitive match. Plain substring matching also fired on
    # descriptive words such as "diagnosed" and "prescribed", which are routine in
    # trial descriptions and caused the entire answer to be replaced.
    _BANNED_PATTERN = re.compile(r"\b(?:stop\s+taking|diagnose|prescribe)\b", re.IGNORECASE)

    def verify(self, text):
        """Check ``text`` for banned phrases.

        Returns:
            ``(text, "Pass")`` when nothing is flagged, otherwise a fixed
            replacement message paired with ``"Revised"``.
        """
        if self._BANNED_PATTERN.search(text):
            return "‚ö†Ô∏è Safety revision applied. Information only.", "Revised"
        return text, "Pass"


# =============================
# Bot
# =============================
class HealthcareBot:
    """Pipeline that parses a query, retrieves trials, summarizes them, and safety-checks the text."""

    def __init__(self):
        """Wire the agents together using the module-level model, index, and chunk map."""
        self.parser = QueryParser()
        self.retriever = RetrievalAgent(embed_model, faiss_index, chunk_map)
        self.summarizer = TrialSummarizer(gemini_model)
        self.safety = SafetyFilter()

    def process_query(self, user_input):
        """Answer one user message.

        Args:
            user_input: Raw text typed by the user.

        Returns:
            A dict that always carries ``recommendation`` (text to display),
            ``cited_trials`` (list of NCT ids) and ``safety_status``.
        """
        normalized = (user_input or "").strip()

        # Blank or missing input: nothing to search for, so skip retrieval and
        # the model calls entirely.
        if not normalized:
            return {
                "recommendation": "Please enter a question about diabetes clinical trials.",
                "cited_trials": [],
                "safety_status": "Pass"
            }

        # Greetings
        if normalized.lower() in ["hi", "hello", "hey"]:
            return {
                "recommendation": (
                    "üëã Hi! I summarize real diabetes clinical trials.\n\n"
                    "Ask me things like:\n"
                    "‚Ä¢ GLP-1 trials for type 2 diabetes\n"
                    "‚Ä¢ Diet studies for weight loss\n"
                    "‚Ä¢ Insulin pump trials in adults\n"
                ),
                "cited_trials": [],
                "safety_status": "Pass"
            }

        parsed = self.parser.parse(user_input)
        retrieved = self.retriever.retrieve(parsed)

        if not retrieved["trials"]:
            return {
                "recommendation": "No relevant trials found.",
                "cited_trials": [],
                "safety_status": "Pass"
            }

        summaries = self.summarizer.summarize(retrieved["trials"])
        final, status = self.safety.verify(summaries)

        return {
            "recommendation": final,
            "cited_trials": [t["nct_id"] for t in retrieved["trials"]],
            "safety_status": status
        }


# Instance + entrypoint
# Created once when this module is imported; every run_bot() call reuses it.
_bot = HealthcareBot()

def run_bot(user_input: str):
    """Answer one user message with the module-level bot.

    Delegates to ``HealthcareBot.process_query`` and returns its result dict unchanged.
    """
    return _bot.process_query(user_input)


Overwriting run_bot.py


UI frontend: a simple Streamlit web chat interface

https://docs.streamlit.io/develop/tutorials/chat-and-llm-apps/build-conversational-apps

In [141]:
%%writefile app.py
import streamlit as st
from run_bot import run_bot, ToneManager

# Streamlit chat front end: shows the conversation so far and sends each new
# message to run_bot().
st.title("Diabetes Clinical Trial Assistant üî¨")

# Tone selector
mode = st.radio("Audience:", ["Professional", "Patient"], index=0)
# The lower-cased choice is written to ToneManager, whose tone is a class attribute in run_bot.
# NOTE(review): that attribute is probably shared by every session served by
# this process rather than kept per user; confirm if multiple users are expected.
ToneManager.set_tone(mode.lower())

# Create the message list on first use; later runs keep whatever is already stored.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Chat history
# Replay every stored message so the earlier conversation stays on screen.
for m in st.session_state.messages:
    with st.chat_message(m["role"]):
        st.markdown(m["content"])

user_input = st.chat_input("Ask about diabetes clinical trials...")
if user_input:
    # Store the user's turn before answering so it is part of the replayed history.
    st.session_state.messages.append({"role": "user", "content": user_input})

    with st.chat_message("user"):
        st.markdown(user_input)

    # Only the "recommendation" text of the result dict is displayed here.
    result = run_bot(user_input)
    reply = result["recommendation"]

    with st.chat_message("assistant"):
        st.markdown(reply)

    st.session_state.messages.append({"role": "assistant", "content": reply})


Overwriting app.py


In [133]:
# Download the cloudflared Linux binary from the project's latest GitHub release,
# rename it to "cloudflared", and make it executable.
# NOTE(review): "latest" is not pinned, so the downloaded binary can differ between runs.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!mv cloudflared-linux-amd64 cloudflared
!chmod +x cloudflared

In [None]:
# Start the Streamlit app in the background with its output discarded, then open a
# Cloudflare tunnel to http://localhost:8501 (cloudflared logs the public URL below).
!streamlit run app.py &>/dev/null&
!./cloudflared tunnel --url http://localhost:8501 --no-autoupdate

[90m2025-11-27T04:14:59Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-11-27T04:14:59Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-11-27T04:15:07Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-11-27T04:15:07Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025

In [139]:
# Smoke test: reload run_bot so edits written to run_bot.py are picked up, then
# print the answer to one sample question.
# NOTE(review): on a fresh kernel the import followed by reload presumably runs the
# module (and its index/model loading) twice; confirm whether that is acceptable.
import importlib
import run_bot
importlib.reload(run_bot)

print(run_bot.run_bot("trials with weight and diabetes?")["recommendation"])

‚è≥ Loading pre-built RAG index...
‚úÖ RAG Index Ready: 18063 vectors loaded.


2025-11-27 04:14:14.467 200 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1737.36ms
2025-11-27 04:14:16.183 200 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1712.13ms
2025-11-27 04:14:18.252 200 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 2065.85ms
2025-11-27 04:14:20.044 200 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1787.03ms
2025-11-27 04:14:21.811 200 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1762.35ms


üìå NCT01522157 ‚Äî A Randomized Cross-over Trial of the Postprandial Effects of Three Different Diets in Patients With Type 2 Diabetes
Status: Completed | Relevance: 65%

Abstract:
This randomized crossover trial investigated the acute metabolic effects of three diets‚Äîlow-fat, low-carbohydrate, and Mediterranean‚Äîin approximately 20 patients with type 2 diabetes. Participants received each diet on separate days with standardized energy content but varying macronutrient ratios for breakfast and lunch, and blood samples were collected six times daily. The dietary intervention order was randomized.

Key Findings:
Likely findings involve differences in postprandial glucose and lipid responses based on the macronutrient composition of each diet.

Takeaway:
Different dietary macronutrient ratios influence postprandial glucose and lipid metabolism in individuals with type 2 diabetes.

---

üìå NCT00729196 ‚Äî A Trial of Two Diets for Weight and Diabetes Management
Status: Completed | Re