<a href="https://colab.research.google.com/github/Shruti022/Healthcare-Chatbot/blob/main/Copy_of_LLM_Project_pivot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project Phase 1: Stepwise API Exploration

Step 1: Import Libraries


In [57]:
!pip install -q requests pandas streamlit pyngrok faiss-cpu sentence-transformers numpy

import requests
import pandas as pd
import json
import hashlib
from datetime import datetime
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

In [58]:
# Mount Google Drive at /content/drive; the data paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [98]:
# Secure KEY INPUT
import os
import getpass

# Securely capture the key: input is invisible. Paste the key and press Enter.
# Surrounding whitespace picked up while pasting is stripped so that the stored
# value is exactly the key and nothing else.
key_input = getpass.getpass("üîë Enter Gemini API Key (Invisible Input): ").strip()

# Store the key for this session first, so the confirmation below is only
# printed once the environment variable has really been set.
os.environ["GEMINI_API_KEY"] = key_input

if not key_input.startswith("AIza"):
    print("‚ö†Ô∏è Warning: Key might be invalid (usually starts with 'AIza').")
else:
    print("‚úÖ API Key captured securely in Environment Variable.")

üîë Enter Gemini API Key (Invisible Input): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
‚úÖ API Key captured securely in Environment Variable.


In [99]:
%%writefile build_embeddings.py
import pandas as pd
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer

# === REAL PATH (from readlink) ===
BASE = "/content/drive/.shortcut-targets-by-id/1-SiVJhXHTHtDYSrPmW_0VfuP7gSTePcj/data"


def _clean_text(value) -> str:
    """Return a CSV cell as stripped text, mapping missing cells (NaN/None) to ""."""
    # str(NaN) would otherwise put the literal word "nan" into the chunk text.
    if pd.isna(value):
        return ""
    return str(value).strip()


# ---------------------------------------------
# Load Data
# ---------------------------------------------
df = pd.read_csv(f"{BASE}/clinical_trials_diabetes_full.csv")

# Normalize the status labels, then drop trials whose status is in the exclusion list.
df["status"] = df["status"].astype(str).str.strip().str.title()
bad_status = ["Terminated", "Withdrawn", "Suspended", "No Longer Available", "Unknown"]
df_clean = df[~df["status"].isin(bad_status)].copy()

# ---------------------------------------------
# Chunking
# ---------------------------------------------
# chunks[i] is the text that gets embedded; chunk_map[i] is the metadata record
# for the same position, so both lists must always grow together.
chunks = []
chunk_map = []

for idx, row in df_clean.iterrows():
    title = _clean_text(row.get("brief_title", ""))
    summary = _clean_text(row.get("brief_summary", ""))

    # Skip rows whose summary is missing or too short to be worth indexing.
    if len(summary) < 20:
        continue

    text = f"Title: {title}\nSummary: {summary}"
    chunks.append(text)

    chunk_map.append({
        "nct_id": row["nct_id"],
        "title": title,
        "text": text,
        "status": row["status"]
    })

print(f"Created {len(chunks)} chunks.")

# Without chunks there is nothing to embed and embeddings.shape[1] below would fail.
if not chunks:
    raise SystemExit("No chunks were created; nothing to embed.")

# ---------------------------------------------
# Embeddings
# ---------------------------------------------
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(chunks, batch_size=64, show_progress_bar=True)

np.save(f"{BASE}/clinical_trials_diabetes_full_embeddings.npy", embeddings)
print("Saved clinical_trials_diabetes_full_embeddings.npy")

# ---------------------------------------------
# Save chunk map
# ---------------------------------------------
with open(f"{BASE}/clinical_trials_diabetes_full_chunk_map.json", "w") as f:
    json.dump(chunk_map, f)

print("Saved clinical_trials_diabetes_full_chunk_map.json")

# ---------------------------------------------
# Build & Save FAISS
# ---------------------------------------------
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype("float32"))
faiss.write_index(index, f"{BASE}/clinical_trials_diabetes_full_faiss.index")

print("Saved clinical_trials_diabetes_full_faiss.index")
print("‚úÖ Embedding build COMPLETE.")


Overwriting build_embeddings.py


In [61]:
# Run the build_embeddings.py script written by the %%writefile cell above.
!python build_embeddings.py

2025-11-27 03:16:44.477192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764213404.498662   46953 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764213404.505146   46953 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764213404.520119   46953 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764213404.520152   46953 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764213404.520156   46953 computation_placer.cc:177] computation placer alr

In [101]:
%%writefile utils.py
import json
import hashlib
from datetime import datetime

import faiss
from sentence_transformers import SentenceTransformer

# --- Confidence score from distance ---

def calculate_confidence_score(distance: float, normalization_factor: float = 1.0) -> float:
    """Inverse L2 distance score in (0,1]; closer = higher confidence.

    Args:
        distance: Distance between the query and a stored vector.
        normalization_factor: Scale constant; the score is exactly 0.5 when
            ``distance`` equals this value.

    Returns:
        ``normalization_factor / (normalization_factor + distance)``, with
        negative distances treated as 0 so the score never exceeds 1.
    """
    dist = float(distance)
    # A negative distance (e.g. a tiny negative value from floating-point
    # round-off) would push the score above 1, or divide by zero when it
    # equals -normalization_factor. Clamp it; NaN is deliberately left as-is.
    if dist < 0.0:
        dist = 0.0
    return normalization_factor / (normalization_factor + dist)


# --- Load pre-built index + chunk map ---

def load_data_and_index(chunk_map_path: str, faiss_path: str):
    """Loads pre-built chunks and FAISS index for quick startup.

    Args:
        chunk_map_path: Path to the JSON file holding the list of chunk records.
        faiss_path: Path to the serialized FAISS index.

    Returns:
        Tuple ``(embed_model, index, chunk_map)``.

    Raises:
        ValueError: If the chunk map and the index hold a different number of
            entries. Search results are looked up in the chunk map by position,
            so a mismatch would return the wrong record or fail at query time.
    """
    print("‚è≥ Loading pre-built RAG index...")

    with open(chunk_map_path, "r", encoding="utf-8") as f:
        chunk_map = json.load(f)

    index = faiss.read_index(faiss_path)

    # Fail fast on stale or mismatched artifacts instead of at the first query.
    if len(chunk_map) != index.ntotal:
        raise ValueError(
            f"Chunk map has {len(chunk_map)} entries but FAISS index has "
            f"{index.ntotal} vectors; rebuild both from the same run."
        )

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")

    print(f"‚úÖ RAG Index Ready: {index.ntotal} vectors loaded.")
    return embed_model, index, chunk_map


# --- Provenance logging ---

def log_provenance_step(agent_name: str, input_data, output_data, detail=None):
    """Build the provenance record for one agent step.

    Args:
        agent_name: Name stored under the "agent" key.
        input_data: Value stored under "input".
        output_data: Value stored under "output".
        detail: Optional extra information; any falsy value is stored as ``{}``.

    Returns:
        A dict with the keys timestamp, agent, input, output, detail and
        model_version. The timestamp is ``datetime.now()`` in ISO format.
    """
    extra = detail if detail else {}
    return dict(
        timestamp=datetime.now().isoformat(),
        agent=agent_name,
        input=input_data,
        output=output_data,
        detail=extra,
        model_version="gemini-2.0-flash",
    )


# --- Reproducibility hash ---

def generate_reproducibility_hash(conversation_history, corpus_version: str = "v1.0"):
    """Return a deterministic MD5 hex digest for a conversation.

    The digest covers the corpus version followed by the "query" value of every
    turn (an absent key counts as an empty string), all joined with "|".

    Args:
        conversation_history: Iterable of dict-like turns.
        corpus_version: Version label mixed into the hash.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    joined_queries = "|".join(turn.get("query", "") for turn in conversation_history)
    hasher = hashlib.md5()
    hasher.update(f"{corpus_version}|{joined_queries}".encode("utf-8"))
    return hasher.hexdigest()


Overwriting utils.py


In [102]:
%%writefile run_bot.py
import json
import re
import os
import sys
from typing import List, Dict, Any
import numpy as np
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

from utils import (
    load_data_and_index,
    log_provenance_step,
    generate_reproducibility_hash,
    calculate_confidence_score,
)

# ==============================
# CONFIG
# ==============================

# Directory holding the pre-built chunk map and FAISS index.
_DATA_DIR = "/content/drive/.shortcut-targets-by-id/1-SiVJhXHTHtDYSrPmW_0VfuP7gSTePcj/data"
CHUNK_PATH = _DATA_DIR + "/clinical_trials_diabetes_full_chunk_map.json"
FAISS_PATH = _DATA_DIR + "/clinical_trials_diabetes_full_faiss.index"

# The key must be present in the environment before this module is imported.
API_KEY = os.getenv("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("‚ùå GEMINI_API_KEY is not set.")

genai.configure(api_key=API_KEY)
gemini_model = genai.GenerativeModel("models/gemini-2.0-flash")

# Load the embedding model, FAISS index and chunk map once, at import time.
embed_model, faiss_index, chunk_map = load_data_and_index(CHUNK_PATH, FAISS_PATH)


# ==============================
# TONE MANAGER
# ==============================
class ToneManager:
    """Holds the tone used for summaries as a single class-level setting."""

    # Tone in effect until set_tone() is called.
    tone = "professional"

    @classmethod
    def set_tone(cls, tone: str):
        """Replace the class-level tone with ``tone``."""
        setattr(cls, "tone", tone)


# ==============================
# PARSER
# ==============================
class QueryParser:
    """Wraps raw user text into the parsed-query dict used by the retrieval step."""

    def __init__(self, model):
        # The model is stored but parse() does not call it.
        self.model = model

    def parse(self, text):
        """Return ``(parsed, provenance_entry)`` for ``text``.

        Every input is labelled as a diabetes-related trial search; the text
        itself is passed through unchanged as the query.
        """
        parsed = dict(intent="trial_search", query=text, is_diabetes_related=True)
        provenance = log_provenance_step("QueryParser", text, parsed)
        return parsed, provenance


# ==============================
# RETRIEVAL AGENT
# ==============================
class RetrievalAgent:
    """Finds the stored trial chunks closest to a query using the FAISS index."""

    def __init__(self, embed_model, index, chunk_map):
        self.embed_model = embed_model
        self.index = index
        # chunk_map[i] is the metadata record for vector i of the index.
        self.chunk_map = chunk_map

    def retrieve(self, parsed, top_k=5):
        """Return the nearest trials for ``parsed["query"]``.

        Args:
            parsed: Dict with at least a "query" key.
            top_k: Maximum number of neighbours requested from the index.

        Returns:
            Tuple ``(result, provenance_entry)`` where ``result`` is
            ``{"query": ..., "trials": [...]}``. Each trial carries nct_id,
            title, text, status and a confidence derived from the distance.
            Fewer than ``top_k`` trials are returned when the index cannot
            supply that many valid neighbours.
        """
        q = parsed["query"]
        query_vec = self.embed_model.encode([q])
        distances, labels = self.index.search(query_vec.astype("float32"), top_k)

        trials = []
        for distance, label in zip(distances[0], labels[0]):
            position = int(label)
            # FAISS fills missing neighbours with label -1; indexing the list
            # with -1 would silently return the LAST record. Labels beyond the
            # chunk map are skipped as well rather than raising mid-request.
            if position < 0 or position >= len(self.chunk_map):
                continue
            item = self.chunk_map[position]
            trials.append({
                "nct_id": item["nct_id"],
                "title": item["title"],
                "text": item["text"],
                "status": item["status"],
                "confidence": calculate_confidence_score(distance),
            })

        return {"query": q, "trials": trials}, log_provenance_step("RetrievalAgent", parsed, trials)


# ==============================
# SUMMARY GENERATOR (No textwrap)
# ==============================
class TrialSummarizer:
    """Turns retrieved trial records into LLM-written summaries."""

    def __init__(self, model):
        # Object exposing generate_content(prompt) and returning a response with .text.
        self.model = model

    def summarize_trial(self, trial, tone):
        """Summarize one trial in the requested tone.

        Args:
            trial: Dict with the keys nct_id, title, status and text.
            tone: Tone word inserted into the prompt.

        Returns:
            The stripped model text, or a short notice naming the trial when
            the response carries no text.
        """

        prompt = f"""
You are summarizing a diabetes clinical trial for healthcare relevance.

NCT: {trial['nct_id']}
Title: {trial['title']}
Status: {trial['status']}
Registry Summary:
{trial['text']}

Write:
1Ô∏è‚É£ Abstract ‚Äî rewrite in {tone} tone, 2‚Äì3 sentences
2Ô∏è‚É£ Key Findings ‚Äî infer expected results based on trial goal (NO invented numbers)
3Ô∏è‚É£ Takeaway ‚Äî single clinically relevant sentence

No recommendations for medication changes.
Be accurate, concise, and structured.
"""

        response = self.model.generate_content(prompt)
        try:
            text = response.text
        except ValueError:
            # Accessing .text raises ValueError when the response contains no
            # usable text (for example a blocked candidate). Degrade to a
            # notice for this trial so the other summaries are still shown.
            return f"Summary unavailable for {trial['nct_id']}: the model returned no text for this trial."
        return text.strip()

    def summarize(self, trials):
        """Summarize every trial with the current ToneManager tone, separated by rules."""
        return "\n\n---\n\n".join(
            self.summarize_trial(t, ToneManager.tone) for t in trials
        )


# ==============================
# SAFETY FILTER
# ==============================
class SafetyFilter:
    """Replaces generated text that contains a banned phrase with a fixed notice."""

    def verify(self, text):
        """Return ``(text_to_show, verdict)``.

        The check is a case-insensitive substring search. On a hit the whole
        text is replaced and the verdict is "Revised"; otherwise the text is
        returned untouched with the verdict "Pass".
        """
        lowered = text.lower()
        for phrase in ("stop taking", "prescribe", "diagnose"):
            if phrase in lowered:
                return "‚ö†Ô∏è Safety revision: consult your clinician.", "Revised"
        return text, "Pass"


# ==============================
# BOT
# ==============================
class HealthcareBot:
    """Chains parsing, retrieval, summarization and the safety check for one query."""

    # Inputs answered with the canned greeting instead of a trial search.
    _GREETINGS = ("hi", "hello", "hey")

    def __init__(self):
        self.parser = QueryParser(gemini_model)
        self.retriever = RetrievalAgent(embed_model, faiss_index, chunk_map)
        self.summarizer = TrialSummarizer(gemini_model)
        self.safety = SafetyFilter()

    def process_query(self, user_input):
        """Answer one user message.

        Args:
            user_input: Raw text typed by the user.

        Returns:
            Dict with a single "recommendation" key holding the reply text:
            the greeting, a "no matches" notice, or the safety-checked summaries.
        """
        # Normalize first so " hello " or "Hi!" count as greetings, and so an
        # empty message gets the greeting instead of an embedding + LLM round trip.
        cleaned = (user_input or "").strip()
        greeting_key = cleaned.lower().rstrip("!.,? ")

        if not cleaned or greeting_key in self._GREETINGS:
            return {
                "recommendation": (
                    "üëã Hi! I find and summarize **real diabetes clinical trials**.\n\n"
                    "**Try asking:**\n"
                    "‚Ä¢ GLP-1 trials for type 2 diabetes\n"
                    "‚Ä¢ diet studies for weight loss in diabetes\n"
                    "‚Ä¢ insulin pump trials for adults\n"
                )
            }

        parsed, _ = self.parser.parse(cleaned)
        retrieved, _ = self.retriever.retrieve(parsed)

        if not retrieved["trials"]:
            return {"recommendation": "No matching trials found."}

        summary_text = self.summarizer.summarize(retrieved["trials"])
        final_text, _ = self.safety.verify(summary_text)

        return {"recommendation": final_text}


# Single bot instance created when this module is imported and reused by run_bot().
_bot = HealthcareBot()

def run_bot(user_input):
    """Pass ``user_input`` to the shared bot's process_query() and return its result."""
    return _bot.process_query(user_input)


Overwriting run_bot.py


UI frontend: a simple Streamlit web interface for the chatbot.

https://docs.streamlit.io/develop/tutorials/chat-and-llm-apps/build-conversational-apps

In [103]:
%%writefile app.py
import streamlit as st
from run_bot import run_bot, ToneManager

st.title("Diabetes Clinical Trial Assistant üî¨")

# Tone selector: the chosen audience, lower-cased, becomes the summary tone.
audience = st.radio("Audience:", ["Professional", "Patient"], index=0)
ToneManager.set_tone(audience.lower())

# First run of the session: start with an empty transcript.
if "messages" not in st.session_state:
    st.session_state.messages = []


def _show(role, content):
    """Render one chat bubble for ``role`` with ``content`` as markdown."""
    with st.chat_message(role):
        st.markdown(content)


# Chat history: replay every stored message on each rerun of the script.
for past in st.session_state.messages:
    _show(past["role"], past["content"])

user_input = st.chat_input("Ask about diabetes clinical trials...")
if user_input:
    st.session_state.messages.append({"role": "user", "content": user_input})
    _show("user", user_input)

    reply = run_bot(user_input)["recommendation"]
    _show("assistant", reply)

    st.session_state.messages.append({"role": "assistant", "content": reply})


Overwriting app.py


In [65]:
# Download the cloudflared Linux binary, rename it to `cloudflared`, and make it executable.
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
!mv cloudflared-linux-amd64 cloudflared
!chmod +x cloudflared

In [66]:
# Start the Streamlit app in the background with its output discarded, then open a
# Cloudflare quick tunnel to http://localhost:8501 (presumably the port Streamlit serves on).
!streamlit run app.py &>/dev/null&
!./cloudflared tunnel --url http://localhost:8501 --no-autoupdate

[90m2025-11-27T03:17:36Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-11-27T03:17:36Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-11-27T03:17:40Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-11-27T03:17:40Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025

In [104]:
# Import run_bot and reload it so the latest file written by %%writefile is used,
# then run one sample query and print the reply text.
import importlib
import run_bot
importlib.reload(run_bot)

print(run_bot.run_bot("GLP-1 trials for type 2 diabetes")["recommendation"])

‚è≥ Loading pre-built RAG index...
‚úÖ RAG Index Ready: 18063 vectors loaded.


2025-11-27 03:37:19.028 400 POST /v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 328.26ms


BadRequest: 400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: API Key not found. Please pass a valid API key.