<a href="https://colab.research.google.com/github/Tar-ive/BCRC/blob/main/bcrc_system_prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.54.0-py3-none-any.whl.metadata (25 kB)
Downloading anthropic-0.54.0-py3-none-any.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.8/288.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.54.0


In [None]:
# BCRC AI Assistant with Transparent Search Process and Markdown Export
# For use in Google Colab

import anthropic
import os
from typing import Dict, List, Any, Optional
import json
import re
from datetime import datetime
import pandas as pd
from IPython.display import display, HTML
from google.colab import userdata

# --- AUTHENTICATION SETUP ---
# In Google Colab, create a secret named 'ANTHROPIC_API_KEY' with your API key.
# from google.colab import userdata
api_key = userdata.get('ANTHROPIC_API_KEY')

# --- CLIENT INITIALIZATION (FIXED) ---
# The API key must be passed to the client during initialization.
client = anthropic.Anthropic(api_key=api_key)


# --- SYSTEM PROMPT ---
# System prompt that encourages transparent search process
SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center). Your primary goal is to provide accurate, helpful, and empathetic information using verified sources.

You have access to the web_search tool to search verified breast cancer websites.

IMPORTANT: Show your search process transparently. For each search:
1. Explain what you're searching for and why.
2. The user will see the search query you use.
3. After receiving results, explain what you found and synthesize the information.
4. Use specific quotes from the sources with proper citations.

Guidelines:
- Maintain a conversational, empathetic tone.
- Avoid excessive medical jargon unless necessary.
- Be transparent about your search process.
- If information is conflicting or uncertain, explain different perspectives.
- If you cannot find an answer, say: "I'm sorry, I don't have an answer to that question. It may be helpful to consult BCRC directly at 512-524-2560 / FAX: 512-717-7545."

Remember: Your users can see your entire search process, which helps build trust and understanding."""


def create_message_with_search(question: str):
    """
    Creates a message to the Anthropic API with the web search tool enabled.
    The tool configuration is based on the provided PDF documentation.
    """
    print(f"🔍 Processing question: {question}")

    try:
        # Create the message with the correct tool configuration as per the PDF.
        response = client.messages.create(
            model="claude-3-5-sonnet-latest", # Using the latest model available
            max_tokens=2048, # Increased for potentially longer answers
            system=SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": question
                }
            ],
            tools=[{
                "type": "web_search_20250305", # Type from the PDF
                "name": "web_search",
                "allowed_domains": [ # Using domain filtering
                    "bcrc.org",
                    "cancer.gov",
                    "nationalbreastcancer.org",
                    "breastcancer.org"
                ],
                "max_uses": 5  # Limit the number of searches per request
            }]
        )
        return response

    except Exception as e:
        print(f"❌ API Request Error: {str(e)}")
        return None

def process_response(response: anthropic.types.Message, question: str):
    """
    Processes the API response into a structured list of elements for display or saving.
    This function does not print; it only transforms data.
    """
    output_elements = []
    if not response or not response.content:
        output_elements.append({'type': 'text', 'content': "No response content to display."})
        return output_elements

    output_elements.append({'type': 'header', 'level': 1, 'content': 'BCRC AI Assistant Report'})
    output_elements.append({'type': 'text', 'content': f"**Question:** {question}"})
    output_elements.append({'type': 'header', 'level': 2, 'content': "📝 Assistant's Process"})

    citation_data = []

    for block in response.content:
        if block.type == 'text':
            output_elements.append({'type': 'text', 'content': f"**Claude:** {block.text}"})
            if hasattr(block, 'citations') and block.citations:
                for citation in block.citations:
                    citation_data.append({
                        'Cited Text': citation.cited_text,
                        'Source URL': citation.url
                    })
        elif block.type == 'server_tool_use' and block.name == 'web_search':
            query_input = block.input.get('query', str(block.input))
            output_elements.append({'type': 'header', 'level': 3, 'content': '🔍 Executing Search'})
            output_elements.append({'type': 'text', 'content': f"**Query:** \"{query_input}\""})

        elif block.type == 'web_search_tool_result':
            output_elements.append({'type': 'header', 'level': 3, 'content': '📚 Sources Found'})
            if isinstance(block.content, list) and block.content:
                results_data = []
                for result in block.content:
                    if result.type == 'web_search_result':
                        results_data.append({
                            'Title': result.title,
                            'Source': result.url,
                            'Last Updated': result.page_age
                        })
                if results_data:
                    df = pd.DataFrame(results_data)
                    output_elements.append({'type': 'dataframe', 'data': df})
            elif hasattr(block.content, 'type') and block.content.type == 'web_search_tool_result_error':
                output_elements.append({'type': 'text', 'content': f"Search Error: {block.content.error_code}"})
            else:
                output_elements.append({'type': 'text', 'content': "No parsable search results found in this block."})

    if citation_data:
        output_elements.append({'type': 'header', 'level': 2, 'content': '📑 Citations Summary'})
        citation_df = pd.DataFrame(citation_data)
        output_elements.append({'type': 'dataframe', 'data': citation_df})

    if response.usage:
        output_elements.append({'type': 'header', 'level': 2, 'content': '📊 Usage Statistics'})
        stats_text = (
            f"* Input tokens: {response.usage.input_tokens}\n"
            f"* Output tokens: {response.usage.output_tokens}"
        )
        if hasattr(response.usage, 'server_tool_use') and response.usage.server_tool_use:
            searches = getattr(response.usage.server_tool_use, 'web_search_requests', 0)
            stats_text += f"\n* Web searches performed: {searches}"
        output_elements.append({'type': 'text', 'content': stats_text})

    return output_elements

def display_on_console(output_elements: List[Dict]):
    """Renders the processed output elements to the console/Colab display."""
    print("\n" + "="*80)
    for element in output_elements:
        if element['type'] == 'header':
            print(f"\n--- {element['content']} ---\n")
        elif element['type'] == 'text':
            # Remove markdown bold for cleaner console printing
            print(element['content'].replace('**', ''))
        elif element['type'] == 'dataframe':
            # Use display(HTML) for rich table formatting in Colab
            display(HTML(element['data'].to_html(index=False, escape=False)))
    print("\n" + "="*80)

def save_as_markdown(output_elements: List[Dict], question: str):
    """Saves the processed output to a Markdown file."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Sanitize question for a safe filename
    safe_question = "".join([c for c in question if c.isalnum() or c.isspace()]).rstrip()
    safe_question = re.sub(r'\s+', '_', safe_question)[:50]
    filename = f"BCRC_Report_{safe_question}_{timestamp}.md"

    markdown_content = []
    for element in output_elements:
        if element['type'] == 'header':
            level = element.get('level', 2)
            markdown_content.append(f"\n{'#' * level} {element['content']}\n")
        elif element['type'] == 'text':
            markdown_content.append(element['content'] + "\n")
        elif element['type'] == 'dataframe':
            df_copy = element['data'].copy()
            # Convert URLs to proper markdown links if the column exists
            for col in ['Source', 'Source URL']:
                 if col in df_copy.columns:
                      df_copy[col] = df_copy[col].apply(lambda x: f'[{x}]({x})' if str(x).startswith('http') else x)
            markdown_content.append(df_copy.to_markdown(index=False) + "\n")

    full_markdown = "".join(markdown_content)
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(full_markdown)
        print(f"✅ Report successfully saved to: {filename}")
    except Exception as e:
        print(f"❌ Error saving file: {e}")

# --- MAIN FUNCTIONS ---

def ask_bcrc_assistant(question: str, save_markdown: bool = True):
    """
    Main function to ask the BCRC assistant a question.
    It orchestrates processing, displaying, and saving the output.
    """
    response = create_message_with_search(question)

    if response:
        # 1. Process the response into a structured format
        processed_output = process_response(response, question)

        # 2. Display the results on the console for immediate feedback
        display_on_console(processed_output)

        # 3. Save the results to a markdown file
        if save_markdown:
            save_as_markdown(processed_output, question)

        return response
    return None

def interactive_ask():
    """Interactive function for asking questions in a loop."""
    while True:
        print("\n" + "="*80)
        question = input("Ask a breast cancer related question (or type 'quit' to exit): ")
        if question.lower() == 'quit':
            print("Exiting interactive session.")
            break
        ask_bcrc_assistant(question, save_markdown=True)


if __name__ == "__main__":
    # Test with a sample question
    sample_question = "What are the early warning signs of breast cancer?"
    print(f"Running single test with question: \"{sample_question}\"")

    result = ask_bcrc_assistant(sample_question)

    print("\nTo start an interactive session, uncomment the line below and run the cell:")
    # interactive_ask()



--- BCRC AI Assistant Report ---


--- 📝 Assistant's Process ---


--- 🔍 Executing Search ---


--- 📚 Sources Found ---



Title,Source,Last Updated
Signs and Symptoms of Breast Cancer,https://www.breastcancer.org/signs-symptoms,"March 29, 2025"
Breast Cancer Signs and Symptoms - National Breast Cancer Foundation,https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/,"April 16, 2025"
Know the Symptoms Guide - National Breast Cancer Foundation,https://www.nationalbreastcancer.org/resources/know-the-symptoms/,"February 6, 2024"
Breast Cancer Early Detection - National Breast Cancer Foundation,https://www.nationalbreastcancer.org/early-detection-of-breast-cancer/,"March 7, 2023"
Inflammatory Breast Cancer - NCI,https://www.cancer.gov/types/breast/ibc-fact-sheet,
"Metastatic Breast Cancer Explained: Symptoms, Diagnosis & More",https://www.nationalbreastcancer.org/metastatic-breast-cancer/,"August 28, 2019"
"Breast Cancer Facts & Stats 2024 - Incidence, Age, Survival, & More",https://www.nationalbreastcancer.org/breast-cancer-facts/,"August 28, 2019"
Breast Self-Exam - National Breast Cancer Foundation,https://www.nationalbreastcancer.org/breast-self-exam/,"April 16, 2025"
"Metastatic Breast Cancer - Symptoms, Diagnosis & More",https://www.breastcancer.org/types/metastatic,"December 29, 2021"
Breast Cancer Stages,https://www.breastcancer.org/pathology-report/breast-cancer-stages,"January 8, 2022"




1. Changes in the Breast or Nearby Areas:

Claude: Common signs include:
- A lump or thickening in or near the breast or underarm area
- Changes in skin texture or enlarged pores (sometimes described as having an orange peel texture)
Claude: 

2. Visual Changes:

Claude: When examining your breasts, look for:
- Changes in the contour or shape of the breasts
- Dimpling
- Swelling
- Skin irregularities
- Changes in the nipples
Claude: 

3. Nipple Changes:

Claude: - Any nipple discharge (clear, bloody, or milky) when not breastfeeding should be checked
- The most concerning types of discharge are bloody or clear
Claude: 

4. Inflammatory Breast Cancer Signs:

Claude: Some specific signs include:
- Rapid onset of redness
- Swelling
- Ridged or pitted skin
- Abnormal breast warmth
- Rapid increase in breast size
- Sensations of heaviness, burning, or tenderness
- Inverted nipple
Claude: 

Important Points to Remember:

1. 
Claude: Most people will initially notice only one or two symptom

Cited Text,Source URL
Nipple tenderness or a lump or thickening in or near the breast or underarm area · A change in the skin texture or an enlargement of pores in the skin...,https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/
"With your arms at your sides, visually inspect your breasts, looking for any changes in the contour or shape of the breasts, any dimpling, swelling, o...",https://www.nationalbreastcancer.org/breast-self-exam/
"It is also important to note that a milky discharge that is present when a woman is not breastfeeding should be checked by her doctor, although it is ...",https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/
"Minimum criteria for a diagnosis of inflammatory breast cancer include the following: A rapid onset of erythema (redness), edema (swelling), and a pea...",https://www.cancer.gov/types/breast/ibc-fact-sheet
"Most people who have breast cancer signs and symptoms will initially notice only one or two, and the presence of these signs and symptoms do not autom...",https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/
"Most often, signs and symptoms are not due to cancer, but any breast cancer sign or symptom you notice should be investigated as soon as it is discove...",https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/
"By performing monthly breast self-exams, you will be able to more easily identify any changes in your breasts. Be sure to talk to your healthcare prof...",https://www.nationalbreastcancer.org/breast-cancer-symptoms-and-signs/
"Mammography can usually detect tumors before they can be felt, so screening is key for early detection. But when combined with regular medical care an...",https://www.nationalbreastcancer.org/breast-self-exam/
"If you find a lump, schedule an appointment with your doctor, but don’t panic — 8 out of 10 lumps are not cancerous. For additional peace of mind, cal...",https://www.nationalbreastcancer.org/breast-self-exam/
"When caught in its earliest, localized stages, the 5-year relative survival rate is 99%. Advances in early detection and treatment methods have signif...",https://www.nationalbreastcancer.org/breast-cancer-facts/



--- 📊 Usage Statistics ---

* Input tokens: 13248
* Output tokens: 804
* Web searches performed: 1


To start an interactive session, uncomment the line below and run the cell:


In [None]:
# BCRC AI Assistant - Comprehensive Version with Debug Capabilities
# For use in Google Colab
# Uses Claude Sonnet 4 (latest model) with full transparency options

import anthropic
import os
import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
from IPython.display import display, HTML, Markdown
from google.colab import userdata

# --- AUTHENTICATION SETUP ---
api_key = userdata.get('ANTHROPIC_API_KEY')
client = anthropic.Anthropic(api_key=api_key)

# --- SYSTEM PROMPTS ---
EMPATHETIC_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center). Your primary goal is to provide accurate, helpful, and deeply empathetic information using verified sources.

Your Persona and Instructions:

Empathetic Tone: Always maintain a conversational, supportive, and empathetic tone. Acknowledge the user's feelings and the difficulty of their situation.
Direct Answers: Provide a direct and cohesive answer. Do NOT show your work or internal thought process. Do not say "I will search for..." or "Based on the search results...". Use your search tool silently to gather information and then present the final, synthesized answer directly to the user.
Cite Sources: If you use information from a search, include all source URLs in a "Sources" list at the very end of your response.
Handling Conflicting Information:

If information from your search is conflicting, uncertain, or still being researched, you must explain the different perspectives clearly and compassionately. This helps users understand the current medical landscape while preserving trust.
Use simple language, avoid technical jargon, and always maintain an empathetic tone. You do not need to resolve the conflict—just present what is known.
Examples:
Topic: Managing fatigue during chemotherapy "Some sources recommend light daily exercise, like walking, to ease fatigue. Others emphasize rest, especially when treatment is more intense. A balance between movement and rest, tailored to how your loved one feels, may work best."
Topic: Diet and cancer recurrence "Some studies suggest that plant-based diets may reduce recurrence risk, while others note the evidence is still developing. It’s a good idea to speak with a dietitian who specializes in cancer care."
Topic: Lymphedema and strength training "There’s some disagreement on whether lifting weights increases lymphedema risk. Older advice recommended avoiding it, but newer studies suggest that supervised strength training may actually help. Consulting a physical therapist familiar with lymphedema is a helpful next step."
If no clear consensus exists, say: “There isn’t a single agreed-upon answer to this yet. You may want to consult a specialist at BCRC or your healthcare provider to discuss what fits best in your situation.”
Context: Top 10 Frequently Asked Questions by Caregivers
Be aware of these common caregiver concerns. If a user's question is similar in context to one of these, use this understanding to provide a particularly relevant and compassionate response.

Supporting a loved one: How to provide emotional/practical support after a diagnosis.
Post-surgery recovery: What to expect and how to help after surgery.
Caregiver's mental health: How to cope with the emotional toll (stress, fear, anxiety).
Managing treatment side effects: Helping with side effects from chemo, hormone therapy, etc.
Learning from other caregivers: Seeking solidarity and coping strategies from peers.
Relationship & intimacy changes: Navigating changes in the relationship during/after cancer.
Finding resources: Locating support groups, forums, and materials for caregivers.
Communicating the diagnosis: How to tell family, friends, and children.
Commemorating milestones: Gift ideas for the end of treatment or cancerversaries.
International treatment access: Helping a loved one from another country get treatment in the U.S."""

TRANSPARENT_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center).

Your Primary Instruction:
To answer the user's question, you MUST use the web_search tool to find information from verified sources. You cannot answer from your own knowledge.

Your Process (Show Your Work):
After you have called the web_search tool and received the results, you must explain your process to the user as follows:

State the Goal: Briefly state what information you were looking for.
Show the Search Query: Tell the user the exact search query you used.
Summarize Findings: Synthesize the information from the search results. You must use specific quotes and cite the sources for the information you provide.
Guidelines:

Maintain a conversational, empathetic tone.
If you cannot find an answer after searching, say: "I'm sorry, I was unable to find an answer to that question using the available tools. It may be helpful to consult BCRC directly at 512-524-2560 / FAX: 512-717-7545."
Be transparent about your process as outlined above.
Handling Conflicting Information:
If information from your search is conflicting, uncertain, or still being researched, you must explain the different perspectives clearly and compassionately. This helps users understand the current medical landscape while preserving trust.
Use simple language, avoid technical jargon, and always maintain an empathetic tone. You do not need to resolve the conflict—just present what is known.
Examples:
Topic: Managing fatigue during chemotherapy "Some sources recommend light daily exercise, like walking, to ease fatigue. Others emphasize rest, especially when treatment is more intense. A balance between movement and rest, tailored to how your loved one feels, may work best."
Topic: Diet and cancer recurrence "Some studies suggest that plant-based diets may reduce recurrence risk, while others note the evidence is still developing. It’s a good idea to speak with a dietitian who specializes in cancer care."
Topic: Lymphedema and strength training "There’s some disagreement on whether lifting weights increases lymphedema risk. Older advice recommended avoiding it, but newer studies suggest that supervised strength training may actually help. Consulting a physical therapist familiar with lymphedema is a helpful next step."
If no clear consensus exists, say: “There isn’t a single agreed-upon answer to this yet. You may want to consult a specialist at BCRC or your healthcare provider to discuss what fits best in your situation.”
"""

class BCRCAssistant:
    def __init__(self, debug_mode: bool = True):
        self.client = client
        self.debug_mode = debug_mode
        self.conversation_history = []

    def log_debug(self, message: str, data: Any = None):
        """Debug logging function"""
        if self.debug_mode:
            print(f"\n🔍 DEBUG: {message}")
            if data:
                if isinstance(data, dict) or isinstance(data, list):
                    print(json.dumps(data, indent=2, default=str))
                else:
                    print(str(data))
            print("-" * 50)

    def create_message_with_search(self, question: str, mode: str = "empathetic") -> Optional[anthropic.types.Message]:
        """
        Creates a message to the Anthropic API with web search enabled.

        Args:
            question: The user's question
            mode: Either "empathetic" (hides process) or "transparent" (shows process)
        """
        system_prompt = EMPATHETIC_SYSTEM_PROMPT if mode == "empathetic" else TRANSPARENT_SYSTEM_PROMPT

        # Log the request being sent
        request_data = {
            "model": "claude-sonnet-4-20250514",
            "max_tokens": 4000,
            "system": system_prompt,
            "messages": [{"role": "user", "content": question}],
            "tools": [{
                "type": "web_search_20250305",
                "name": "web_search",
                "allowed_domains": [
                    "bcrc.org",
                    "cancer.gov",
                    "nationalbreastcancer.org",
                    "breastcancer.org"
                ],
                "max_uses": 5
            }]
        }

        self.log_debug("API Request Details", {
            "model": request_data["model"],
            "system_prompt_length": len(system_prompt),
            "question": question,
            "mode": mode,
            "tools_configured": "web_search with 4 allowed domains"
        })

        try:
            print(f"🚀 Sending request to Claude Sonnet 4...")
            response = self.client.messages.create(**request_data)

            # Log the raw response structure
            self.log_debug("Raw API Response Structure", {
                "response_id": response.id,
                "model": response.model,
                "usage": response.usage.__dict__ if response.usage else None,
                "stop_reason": response.stop_reason,
                "content_blocks_count": len(response.content),
                "content_types": [block.type for block in response.content]
            })

            return response

        except Exception as e:
            print(f"❌ API Request Error: {str(e)}")
            self.log_debug("API Error Details", str(e))
            return None

    def analyze_response_content(self, response: anthropic.types.Message) -> Dict[str, Any]:
        """
        Deeply analyzes the response content to extract all information.
        """
        analysis = {
            "text_blocks": [],
            "search_queries": [],
            "search_results": [],
            "citations": [],
            "tool_uses": [],
            "errors": [],
            "sources_found": []
        }

        for i, block in enumerate(response.content):
            self.log_debug(f"Processing block {i}", {
                "type": block.type,
                "attributes": [attr for attr in dir(block) if not attr.startswith('_')]
            })

            if block.type == 'text':
                analysis["text_blocks"].append({
                    "index": i,
                    "text": block.text,
                    "length": len(block.text)
                })

                # Extract sources from text (if they're listed at the end)
                text = block.text
                if "Sources:" in text or "**Sources:**" in text:
                    # Extract URLs from the sources section
                    import re
                    urls = re.findall(r'https?://[^\s\)]+', text)
                    analysis["sources_found"].extend(urls)

            elif block.type == 'tool_use':
                tool_info = {
                    "index": i,
                    "tool_name": getattr(block, 'name', 'unknown'),
                    "tool_id": getattr(block, 'id', 'unknown'),
                    "input": getattr(block, 'input', {})
                }
                analysis["tool_uses"].append(tool_info)

                if getattr(block, 'name', '') == 'web_search':
                    query = block.input.get('query', str(block.input)) if hasattr(block, 'input') else 'unknown query'
                    analysis["search_queries"].append(query)

            elif block.type == 'tool_result':
                self.log_debug(f"Tool result block", {
                    "has_content": hasattr(block, 'content'),
                    "content_type": type(getattr(block, 'content', None)),
                    "tool_use_id": getattr(block, 'tool_use_id', 'unknown')
                })

                if hasattr(block, 'content'):
                    if isinstance(block.content, list):
                        for result in block.content:
                            if hasattr(result, 'type') and result.type == 'web_search_result':
                                search_result = {
                                    "title": getattr(result, 'title', 'No title'),
                                    "url": getattr(result, 'url', 'No URL'),
                                    "snippet": getattr(result, 'snippet', 'No snippet'),
                                    "page_age": getattr(result, 'page_age', 'Unknown age')
                                }
                                analysis["search_results"].append(search_result)
                    elif hasattr(block.content, 'type') and block.content.type == 'web_search_result':
                        search_result = {
                            "title": getattr(block.content, 'title', 'No title'),
                            "url": getattr(block.content, 'url', 'No URL'),
                            "snippet": getattr(block.content, 'snippet', 'No snippet'),
                            "page_age": getattr(block.content, 'page_age', 'Unknown age')
                        }
                        analysis["search_results"].append(search_result)

            # Handle other possible block types
            else:
                self.log_debug(f"Unknown block type: {block.type}", block)

        return analysis

    def create_search_quality_audit(self, analysis: Dict[str, Any], question: str) -> pd.DataFrame:
        """
        Creates a table to audit search quality - did we search the right sites and find relevant content?
        """
        search_audit_data = []

        # Get all sources (from search results and from final response)
        all_sources = []

        # From search results
        for result in analysis["search_results"]:
            all_sources.append({
                "url": result["url"],
                "title": result["title"],
                "snippet": result["snippet"],
                "source_type": "Search Result"
            })

        # From sources listed in response
        for url in analysis["sources_found"]:
            # Avoid duplicates
            if not any(s["url"] == url for s in all_sources):
                all_sources.append({
                    "url": url,
                    "title": "Listed in Response",
                    "snippet": "Source cited in final response",
                    "source_type": "Response Citation"
                })

        for source in all_sources:
            # Determine if it's an allowed/trusted domain
            trusted_domains = ["bcrc.org", "cancer.gov", "nationalbreastcancer.org", "breastcancer.org"]
            is_trusted = any(domain in source["url"] for domain in trusted_domains)

            # Assess relevance based on title and snippet content
            question_keywords = question.lower().split()
            content_to_check = f"{source['title']} {source['snippet']}".lower()

            keyword_matches = sum(1 for keyword in question_keywords if keyword in content_to_check)
            relevance_score = f"{keyword_matches}/{len(question_keywords)}"

            # Determine relevance level
            if keyword_matches >= len(question_keywords) * 0.7:
                relevance = "High"
            elif keyword_matches >= len(question_keywords) * 0.4:
                relevance = "Medium"
            else:
                relevance = "Low"

            search_audit_data.append({
                "Source URL": source["url"],
                "Domain": source["url"].split("/")[2] if "/" in source["url"] else "Unknown",
                "Is Trusted Domain": "✅" if is_trusted else "❌",
                "Source Type": source["source_type"],
                "Title": source["title"][:50] + "..." if len(source["title"]) > 50 else source["title"],
                "Keyword Matches": relevance_score,
                "Relevance": relevance,
                "Contains Answer": "🔍 Manual Review Needed"  # This requires human judgment
            })

        return pd.DataFrame(search_audit_data) if search_audit_data else pd.DataFrame(columns=[
            "Source URL", "Domain", "Is Trusted Domain", "Source Type", "Title", "Keyword Matches", "Relevance", "Contains Answer"
        ])

    def create_response_quality_audit(self, response: anthropic.types.Message, analysis: Dict[str, Any], question: str) -> pd.DataFrame:
        """
        Creates a table to audit response quality - did the LLM answer correctly and comprehensively?
        """
        # Extract the main response text
        response_text = ""
        for block in analysis["text_blocks"]:
            response_text += block["text"] + " "

        # Define quality criteria
        quality_criteria = [
            {
                "Criteria": "Answered the Question",
                "Description": "Response directly addresses what was asked",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check if early warning signs were provided"
            },
            {
                "Criteria": "Used Trusted Sources",
                "Description": "Information came from reliable medical sources",
                "Assessment": "✅ Yes" if analysis["sources_found"] or analysis["search_results"] else "❌ No",
                "Notes": f"Found {len(analysis['sources_found']) + len(analysis['search_results'])} sources"
            },
            {
                "Criteria": "Provided Specific Information",
                "Description": "Gave concrete, actionable details",
                "Assessment": "✅ Yes" if len(response_text) > 500 else "⚠️ Limited",
                "Notes": f"Response length: {len(response_text)} characters"
            },
            {
                "Criteria": "Included Proper Citations",
                "Description": "Sources were properly referenced",
                "Assessment": "✅ Yes" if "Sources:" in response_text or analysis["sources_found"] else "❌ No",
                "Notes": f"Found {len(analysis['sources_found'])} cited sources"
            },
            {
                "Criteria": "Maintained Empathetic Tone",
                "Description": "Response was supportive and understanding",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check for empathetic language and supportive phrasing"
            },
            {
                "Criteria": "Avoided Medical Advice",
                "Description": "Didn't provide diagnosis or treatment advice",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Should provide information but direct to healthcare providers"
            },
            {
                "Criteria": "Accuracy Check",
                "Description": "Information is medically accurate",
                "Assessment": "🔍 Expert Review Required",
                "Notes": "Requires medical professional validation"
            },
            {
                "Criteria": "Completeness",
                "Description": "Covered all major early warning signs",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check against comprehensive symptom lists"
            }
        ]

        return pd.DataFrame(quality_criteria)

    def display_detailed_analysis(self, response: anthropic.types.Message, question: str, mode: str):
        """
        Displays a comprehensive analysis of the response.
        """
        print("\n" + "="*100)
        print(f"🎯 BCRC AI ASSISTANT - DETAILED ANALYSIS")
        print(f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"🤖 Model: Claude Sonnet 4")
        print(f"🔧 Mode: {mode.title()}")
        print("="*100)

        # Question
        print(f"\n❓ **QUESTION:**")
        print(f"   {question}")

        # Usage statistics
        if response.usage:
            print(f"\n📊 **TOKEN USAGE:**")
            print(f"   Input tokens: {response.usage.input_tokens:,}")
            print(f"   Output tokens: {response.usage.output_tokens:,}")
            print(f"   Total tokens: {response.usage.input_tokens + response.usage.output_tokens:,}")

        # Detailed content analysis
        analysis = self.analyze_response_content(response)

        # Search queries performed
        if analysis["search_queries"]:
            print(f"\n🔍 **SEARCH QUERIES PERFORMED:**")
            for i, query in enumerate(analysis["search_queries"], 1):
                print(f"   {i}. \"{query}\"")

        # Search results found
        if analysis["search_results"]:
            print(f"\n📚 **SEARCH RESULTS FOUND:**")
            for i, result in enumerate(analysis["search_results"], 1):
                print(f"\n   Result {i}:")
                print(f"   📰 Title: {result['title']}")
                print(f"   🔗 URL: {result['url']}")
                print(f"   📄 Snippet: {result['snippet'][:150]}...")
                print(f"   📅 Page Age: {result['page_age']}")

        # Final assistant response
        final_text = ""
        for block in analysis["text_blocks"]:
            final_text += block["text"] + "\n"

        if final_text:
            print(f"\n🤖 **ASSISTANT'S FINAL RESPONSE:**")
            print("-" * 60)
            display(Markdown(final_text))
            print("-" * 60)

        # Quality Audit Tables
        print(f"\n📊 **SEARCH QUALITY AUDIT:**")
        print("(Evaluates: Did we search the right sites? Do they contain the answer?)")
        search_audit_df = self.create_search_quality_audit(analysis, question)
        if not search_audit_df.empty:
            display(HTML(search_audit_df.to_html(index=False, escape=False)))
        else:
            print("No search data available for audit")

        print(f"\n📋 **RESPONSE QUALITY AUDIT:**")
        print("(Evaluates: Did the LLM answer correctly and appropriately?)")
        response_audit_df = self.create_response_quality_audit(response, analysis, question)
        display(HTML(response_audit_df.to_html(index=False, escape=False)))

        # Raw response data (if debug mode)
        if self.debug_mode:
            print(f"\n🔬 **RAW RESPONSE DATA:**")
            self.log_debug("Complete Response Analysis", analysis)

    def save_comprehensive_report(self, response: anthropic.types.Message, question: str, mode: str) -> str:
        """
        Saves a comprehensive markdown report with all details.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        safe_question = re.sub(r'[^\w\s-]', '', question).strip()
        safe_question = re.sub(r'[-\s]+', '_', safe_question)[:50]
        filename = f"test_{safe_question}_{timestamp}.md"

        analysis = self.analyze_response_content(response)

        # Build markdown content
        markdown_content = [
            f"# BCRC AI Assistant - Comprehensive Report\n",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  ",
            f"**Model:** Claude Sonnet 4  ",
            f"**Mode:** {mode.title()}  \n",
            f"## Question\n{question}\n",
        ]

        # Usage statistics
        if response.usage:
            markdown_content.extend([
                f"## Token Usage\n",
                f"- **Input tokens:** {response.usage.input_tokens:,}",
                f"- **Output tokens:** {response.usage.output_tokens:,}",
                f"- **Total tokens:** {response.usage.input_tokens + response.usage.output_tokens:,}\n"
            ])

        # Search process
        if analysis["search_queries"]:
            markdown_content.append("## Search Process\n")
            for i, query in enumerate(analysis["search_queries"], 1):
                markdown_content.append(f"### Search {i}\n**Query:** `{query}`\n")

        # Search results
        if analysis["search_results"]:
            markdown_content.append("## Search Results\n")
            for i, result in enumerate(analysis["search_results"], 1):
                markdown_content.extend([
                    f"### Result {i}\n",
                    f"- **Title:** {result['title']}",
                    f"- **URL:** [{result['url']}]({result['url']})",
                    f"- **Snippet:** {result['snippet']}",
                    f"- **Page Age:** {result['page_age']}\n"
                ])

        # Quality audit tables
        search_audit_df = self.create_search_quality_audit(analysis, question)
        response_audit_df = self.create_response_quality_audit(response, analysis, question)

        if not search_audit_df.empty:
            markdown_content.extend([
                "## Search Quality Audit\n",
                "*Evaluates: Did we search the right sites? Do they contain the answer?*\n",
                search_audit_df.to_markdown(index=False) + "\n"
            ])

        markdown_content.extend([
            "## Response Quality Audit\n",
            "*Evaluates: Did the LLM answer correctly and appropriately?*\n",
            response_audit_df.to_markdown(index=False) + "\n"
        ])

        # Final response
        final_text = ""
        for block in analysis["text_blocks"]:
            final_text += block["text"] + "\n"

        if final_text:
            markdown_content.extend([
                "## Assistant's Response\n",
                final_text
            ])

        # Save file
        full_content = "\n".join(markdown_content)
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(full_content)
            print(f"✅ Comprehensive report saved: {filename}")
            return filename
        except Exception as e:
            print(f"❌ Error saving report: {e}")
            return None

    def ask_question(self, question: str, mode: str = "empathetic", save_report: bool = True) -> Optional[anthropic.types.Message]:
        """
        Main method to ask a question with full analysis.

        Args:
            question: The user's question
            mode: "empathetic" (direct answers) or "transparent" (shows process)
            save_report: Whether to save a comprehensive report
        """
        print(f"🎯 Processing question in {mode} mode...")

        # Get response from Claude
        response = self.create_message_with_search(question, mode)

        if response:
            # Display detailed analysis
            self.display_detailed_analysis(response, question, mode)

            # Save comprehensive report
            if save_report:
                self.save_comprehensive_report(response, question, mode)

            return response

        return None

# --- CONVENIENCE FUNCTIONS ---

def ask_empathetic(question: str, debug: bool = True) -> Optional[anthropic.types.Message]:
    """Ask a question in empathetic mode (hides search process)"""
    assistant = BCRCAssistant(debug_mode=debug)
    return assistant.ask_question(question, mode="empathetic")

def ask_transparent(question: str, debug: bool = True) -> Optional[anthropic.types.Message]:
    """Ask a question in transparent mode (shows search process)"""
    assistant = BCRCAssistant(debug_mode=debug)
    return assistant.ask_question(question, mode="transparent")

def interactive_session():
    """Start an interactive session with mode selection"""
    assistant = BCRCAssistant(debug_mode=True)

    print("🎯 BCRC AI Assistant - Interactive Session")
    print("Available modes:")
    print("  1. Empathetic (direct, compassionate answers)")
    print("  2. Transparent (shows search process)")

    while True:
        print("\n" + "="*80)
        question = input("Ask a breast cancer related question (or 'quit' to exit): ")
        if question.lower() == 'quit':
            print("👋 Exiting interactive session.")
            break

        mode_choice = input("Choose mode (1=empathetic, 2=transparent, default=1): ").strip()
        mode = "transparent" if mode_choice == "2" else "empathetic"

        assistant.ask_question(question, mode=mode, save_report=True)

# --- EXAMPLE USAGE ---
if __name__ == "__main__":
    print("🚀 BCRC AI Assistant initialized with Claude Sonnet 4")
    print("📋 Available functions:")
    print("  - ask_empathetic(question)")
    print("  - ask_transparent(question)")
    print("  - interactive_session()")

    # Example usage
    sample_question = "My loved one is going through chemotherapy and is dealing with extreme fatigue. I've heard some people say they should rest as much as possible, while others suggest that light exercise can help. What is the best way to manage chemo-related fatigue?"
    print(f"\n🧪 Testing with sample question: '{sample_question}'")

    # Test both modes
    print("\n" + "="*50)
    print("TESTING EMPATHETIC MODE")
    print("="*50)
    ask_empathetic(sample_question)

    print("\n" + "="*50)
    print("TESTING TRANSPARENT MODE")
    print("="*50)
    ask_transparent(sample_question)

    print("\n💡 To start interactive session, run: interactive_session()")

🚀 BCRC AI Assistant initialized with Claude Sonnet 4
📋 Available functions:
  - ask_empathetic(question)
  - ask_transparent(question)
  - interactive_session()

🧪 Testing with sample question: 'My loved one is going through chemotherapy and is dealing with extreme fatigue. I've heard some people say they should rest as much as possible, while others suggest that light exercise can help. What is the best way to manage chemo-related fatigue?'

TESTING EMPATHETIC MODE
🎯 Processing question in empathetic mode...

🔍 DEBUG: API Request Details
{
  "model": "claude-sonnet-4-20250514",
  "system_prompt_length": 3429,
  "question": "My loved one is going through chemotherapy and is dealing with extreme fatigue. I've heard some people say they should rest as much as possible, while others suggest that light exercise can help. What is the best way to manage chemo-related fatigue?",
  "mode": "empathetic",
  "tools_configured": "web_search with 4 allowed domains"
}
-------------------------------

I completely understand your concern about managing your loved one's extreme fatigue during chemotherapy. This is one of the most challenging aspects of cancer treatment for both patients and caregivers, and the conflicting advice you've heard is actually quite common.

The truth is that both approaches have merit, and research shows that 
in the past, clinicians typically advised their cancer patients to rest and avoid physical activity. However, what we learned from early exercise research in the 1990s and 2000s contradicted that advice
. The current evidence strongly supports a balanced approach that combines gentle movement with adequate rest.

Here's what the research tells us about managing chemo-related fatigue:

**Light exercise can be incredibly helpful.** 
Diagnosed women who exercised while being treated with chemotherapy had less fatigue, nausea, and pain, as well as better physical fitness than women who didn't exercise
. Even something as simple as 
a half hour of aerobic exercise three times weekly was sufficient to improve anxiety, depression, fatigue, quality of life, and physical function in cancer survivors
.

**Rest is still important.** 
Self-care practices such as yoga, exercise, eating well, and making time to rest can help increase your energy level
. The key is finding the right balance - 
if you are tired, take short naps of less than one hour during the day. Keep in mind that sleeping too much during the day can make it difficult to sleep well at night
.

**What works best is individualized.** 
Choose activities that are most important to you and do them when you have the most energy
. Some days your loved one might feel up for a gentle walk, while other days rest might be exactly what they need.

The most important thing is to work with the healthcare team to create a plan that feels manageable. 
Tell your health care team if you feel very weak and tired—especially if you are not able to do your normal activities or are still very tired even after resting or sleeping. This information, together with medical tests, can help your doctor determine the best way for you to manage or cope with fatigue
.

Remember, 
more than 80% of people with cancer experience fatigue while receiving chemotherapy
, so your loved one is not alone in this struggle. Be patient with both of you as you figure out what works best on any given day.

**Sources:**
- https://www.cancer.gov/about-cancer/treatment/side-effects/fatigue
- https://www.cancer.gov/about-cancer/treatment/side-effects/fatigue/fatigue-hp-pdq
- https://www.breastcancer.org/research-news/exercise-during-chemo-reduces-side-effects
- https://www.breastcancer.org/research-news/exercise-helps-ease-fatigue-and-chemo-brain
- https://www.cancer.gov/news-events/cancer-currents-blog/2019/cancer-survivors-exercise-guidelines-schmitz
- https://www.breastcancer.org/research-news/best-exercise-cancer-fatigue
- https://www.cancer.gov/news-events/cancer-currents-blog/2024/cancer-survivors-minority-underserved-improving-activity
- https://prevention.cancer.gov/clinical-trials/clinical-trials-search/nct00924651
- https://www.breastcancer.org/treatment-side-effects/fatigue


------------------------------------------------------------

📊 **SEARCH QUALITY AUDIT:**
(Evaluates: Did we search the right sites? Do they contain the answer?)


Source URL,Domain,Is Trusted Domain,Source Type,Title,Keyword Matches,Relevance,Contains Answer
https://www.cancer.gov/about-cancer/treatment/side-effects/fatigue,www.cancer.gov,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.cancer.gov/about-cancer/treatment/side-effects/fatigue/fatigue-hp-pdq,www.cancer.gov,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.breastcancer.org/research-news/exercise-during-chemo-reduces-side-effects,www.breastcancer.org,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.breastcancer.org/research-news/exercise-helps-ease-fatigue-and-chemo-brain,www.breastcancer.org,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.cancer.gov/news-events/cancer-currents-blog/2019/cancer-survivors-exercise-guidelines-schmitz,www.cancer.gov,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.breastcancer.org/research-news/best-exercise-cancer-fatigue,www.breastcancer.org,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.cancer.gov/news-events/cancer-currents-blog/2024/cancer-survivors-minority-underserved-improving-activity,www.cancer.gov,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://prevention.cancer.gov/clinical-trials/clinical-trials-search/nct00924651,prevention.cancer.gov,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed
https://www.breastcancer.org/treatment-side-effects/fatigue,www.breastcancer.org,✅,Response Citation,Listed in Response,3/42,Low,🔍 Manual Review Needed



📋 **RESPONSE QUALITY AUDIT:**
(Evaluates: Did the LLM answer correctly and appropriately?)


Criteria,Description,Assessment,Notes
Answered the Question,Response directly addresses what was asked,🔍 Manual Review Needed,Check if early warning signs were provided
Used Trusted Sources,Information came from reliable medical sources,✅ Yes,Found 9 sources
Provided Specific Information,"Gave concrete, actionable details",✅ Yes,Response length: 3181 characters
Included Proper Citations,Sources were properly referenced,✅ Yes,Found 9 cited sources
Maintained Empathetic Tone,Response was supportive and understanding,🔍 Manual Review Needed,Check for empathetic language and supportive phrasing
Avoided Medical Advice,Didn't provide diagnosis or treatment advice,🔍 Manual Review Needed,Should provide information but direct to healthcare providers
Accuracy Check,Information is medically accurate,🔍 Expert Review Required,Requires medical professional validation
Completeness,Covered all major early warning signs,🔍 Manual Review Needed,Check against comprehensive symptom lists



🔬 **RAW RESPONSE DATA:**

🔍 DEBUG: Complete Response Analysis
{
  "text_blocks": [
    {
      "index": 2,
      "text": "I completely understand your concern about managing your loved one's extreme fatigue during chemotherapy. This is one of the most challenging aspects of cancer treatment for both patients and caregivers, and the conflicting advice you've heard is actually quite common.\n\nThe truth is that both approaches have merit, and research shows that ",
      "length": 341
    },
    {
      "index": 3,
      "text": "in the past, clinicians typically advised their cancer patients to rest and avoid physical activity. However, what we learned from early exercise research in the 1990s and 2000s contradicted that advice",
      "length": 202
    },
    {
      "index": 4,
      "text": ". The current evidence strongly supports a balanced approach that combines gentle movement with adequate rest.\n\nHere's what the research tells us about managing chemo-related fatigue:\n\n**Lig

**Goal:** I searched for information about managing chemotherapy-related fatigue, specifically looking at the balance between rest and exercise recommendations.

**Search Query:** "chemotherapy fatigue management rest exercise"

**Findings:**

Based on the search results, there's actually good evidence to support both approaches—but they work best together rather than as an either/or choice. Here's what the research shows:

**The Case for Light Exercise:**

Diagnosed women who exercised while being treated with chemotherapy had less fatigue, nausea, and pain, as well as better physical fitness than women who didn't exercise.
 The National Cancer Institute notes that 
self-care practices such as yoga, exercise, eating well, and making time to rest can help increase your energy level.


Research from the American College of Sports Medicine found that 
an exercise program consisting of a half hour of aerobic exercise three times weekly was sufficient to improve anxiety, depression, fatigue, quality of life, and physical function in cancer survivors.
 This represents a significant shift from earlier medical advice, as 
in the past, clinicians typically advised their cancer patients to rest and avoid physical activity. However, what we learned from early exercise research in the 1990s and 2000s contradicted that advice.


**The Importance of Rest:**
At the same time, rest remains crucial. 
Plan time to rest. If you are tired, take short naps of less than one hour during the day.
 However, it's important to note that 
sleeping too much during the day can make it difficult to sleep well at night.


**The Balanced Approach:**
The evidence suggests that combining both strategies works best. The key insight is that 
cancer fatigue is not completely relieved by sleep or rest and may happen after no or minimal activity.
 This means that while rest is important, gentle movement can actually help combat the fatigue rather than worsen it.

**Practical Recommendations:**
- Start small: 
Since many cancer survivors are sedentary, the first and most important message providers can give their patients is that they don't need to become marathoners to reap the benefits of exercise. Going from no exercise to some exercise will be useful for their health.

- Listen to the body: 
Choose activities that are most important to you and do them when you have the most energy.

- Always coordinate with the healthcare team: 
Talk with your health care team about things you can try.


The best approach appears to be finding a balance that includes both adequate rest and gentle, appropriate exercise—tailored to how your loved one feels on any given day and always in consultation with their medical team.


------------------------------------------------------------

📊 **SEARCH QUALITY AUDIT:**
(Evaluates: Did we search the right sites? Do they contain the answer?)
No search data available for audit

📋 **RESPONSE QUALITY AUDIT:**
(Evaluates: Did the LLM answer correctly and appropriately?)


Criteria,Description,Assessment,Notes
Answered the Question,Response directly addresses what was asked,🔍 Manual Review Needed,Check if early warning signs were provided
Used Trusted Sources,Information came from reliable medical sources,❌ No,Found 0 sources
Provided Specific Information,"Gave concrete, actionable details",✅ Yes,Response length: 2719 characters
Included Proper Citations,Sources were properly referenced,❌ No,Found 0 cited sources
Maintained Empathetic Tone,Response was supportive and understanding,🔍 Manual Review Needed,Check for empathetic language and supportive phrasing
Avoided Medical Advice,Didn't provide diagnosis or treatment advice,🔍 Manual Review Needed,Should provide information but direct to healthcare providers
Accuracy Check,Information is medically accurate,🔍 Expert Review Required,Requires medical professional validation
Completeness,Covered all major early warning signs,🔍 Manual Review Needed,Check against comprehensive symptom lists



🔬 **RAW RESPONSE DATA:**

🔍 DEBUG: Complete Response Analysis
{
  "text_blocks": [
    {
      "index": 2,
      "text": "**Goal:** I searched for information about managing chemotherapy-related fatigue, specifically looking at the balance between rest and exercise recommendations.\n\n**Search Query:** \"chemotherapy fatigue management rest exercise\"\n\n**Findings:**\n\nBased on the search results, there's actually good evidence to support both approaches\u2014but they work best together rather than as an either/or choice. Here's what the research shows:\n\n**The Case for Light Exercise:**\n",
      "length": 461
    },
    {
      "index": 3,
      "text": "Diagnosed women who exercised while being treated with chemotherapy had less fatigue, nausea, and pain, as well as better physical fitness than women who didn't exercise.",
      "length": 170
    },
    {
      "index": 4,
      "text": " The National Cancer Institute notes that ",
      "length": 42
    },
    {
      "index": 

In [None]:
!pip install sentence-transformers faiss-cpu nltk

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.54.0-py3-none-any.whl.metadata (25 kB)
Downloading anthropic-0.54.0-py3-none-any.whl (288 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/288.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.8/288.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.54.0


In [None]:
# --- SYSTEM PROMPTS ---
EMPATHETIC_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center). Your primary goal is to provide accurate, helpful, and deeply empathetic information using verified sources.

Your Persona and Instructions:

Empathetic Tone: Always maintain a conversational, supportive, and empathetic tone. Acknowledge the user's feelings and the difficulty of their situation.
Direct Answers: Provide a direct and cohesive answer. Do NOT show your work or internal thought process. Do not say "I will search for..." or "Based on the search results...". Use your search tools silently to gather information and then present the final, synthesized answer directly to the user.
Information Gathering Process:
First, use web_search to find current, authoritative information.
Then, use search_kb to verify and supplement with BCRC's internal knowledge base.
Synthesize information from both sources for a comprehensive answer.
Cite Sources: If you use information from a search, include all source URLs in a "Sources" list at the very end of your response.
Handling Conflicting Information: If information from your search is conflicting, uncertain, or still being researched, you must explain the different perspectives clearly and compassionately. This helps users understand the current medical landscape while preserving trust.
Use simple language, avoid technical jargon, and always maintain an empathetic tone. You do not need to resolve the conflict—just present what is known.
Examples:
Topic: Managing fatigue during chemotherapy "Some sources recommend light daily exercise, like walking, to ease fatigue. Others emphasize rest, especially when treatment is more intense. A balance between movement and rest, tailored to how your loved one feels, may work best."
Topic: Diet and cancer recurrence "Some studies suggest that plant-based diets may reduce recurrence risk, while others note the evidence is still developing. It’s a good idea to speak with a dietitian who specializes in cancer care."
Topic: Lymphedema and strength training "There’s some disagreement on whether lifting weights increases lymphedema risk. Older advice recommended avoiding it, but newer studies suggest that supervised strength training may actually help. Consulting a physical therapist familiar with lymphedema is a helpful next step."
If no clear consensus exists, say: “There isn’t a single agreed-upon answer to this yet. You may want to consult a specialist at BCRC or your healthcare provider to discuss what fits best in your situation.”
Context: Top 10 Frequently Asked Questions by Caregivers
Be aware of these common caregiver concerns. If a user's question is similar in context to one of these, use this understanding to provide a particularly relevant and compassionate response.

Supporting a loved one: How to provide emotional/practical support after a diagnosis.
Post-surgery recovery: What to expect and how to help after surgery.
Caregiver's mental health: How to cope with the emotional toll (stress, fear, anxiety).
Managing treatment side effects: Helping with side effects from chemo, hormone therapy, etc.
Learning from other caregivers: Seeking solidarity and coping strategies from peers.
Relationship & intimacy changes: Navigating changes in the relationship during/after cancer.
Finding resources: Locating support groups, forums, and materials for caregivers.
Communicating the diagnosis: How to tell family, friends, and children.
Commemorating milestones: Gift ideas for the end of treatment or cancerversaries.
International treatment access: Helping a loved one from another country get treatment in the U.S."""

TRANSPARENT_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center).

Your Primary Instruction:
To answer the user's question, you MUST use your available tools to find information from verified sources. You cannot answer from your own knowledge.

Your Process (Show Your Work):
After you have called your tools and received the results, you must explain your process to the user as follows:

State the Goal: Briefly state what information you were looking for.
Show the Search Queries: Tell the user the exact search queries you used for web_search and/or search_kb.
Summarize Findings: Synthesize the information from the search results. You must use specific quotes and cite the sources for the information you provide. Clearly distinguish between information found via web search and BCRC's internal knowledge base.
Information Gathering Process:

First, use web_search to find current, authoritative information.
Then, use search_kb to verify and supplement with BCRC's internal knowledge base.
Synthesize information from both sources for a comprehensive answer.
Guidelines:

Maintain a conversational, empathetic tone.
If you cannot find an answer after searching, say: "I'm sorry, I was unable to find an answer to that question using the available tools. It may be helpful to consult BCRC directly at 512-524-2560 / FAX: 512-717-7545."
Be transparent about your process as outlined above.
Handling Conflicting Information:
If information from your searches is conflicting, uncertain, or still being researched, you must explain the different perspectives clearly and compassionately. This helps users understand the current medical landscape while preserving trust.

Use simple language, avoid technical jargon, and always maintain an empathetic tone. You do not need to resolve the conflict—just present what is known.
Examples:
Topic: Managing fatigue during chemotherapy "Some sources recommend light daily exercise, like walking, to ease fatigue. Others emphasize rest, especially when treatment is more intense. A balance between movement and rest, tailored to how your loved one feels, may work best."
Topic: Diet and cancer recurrence "Some studies suggest that plant-based diets may reduce recurrence risk, while others note the evidence is still developing. It’s a good idea to speak with a dietitian who specializes in cancer care."
Topic: Lymphedema and strength training "There’s some disagreement on whether lifting weights increases lymphedema risk. Older advice recommended avoiding it, but newer studies suggest that supervised strength training may actually help. Consulting a physical therapist familiar with lymphedema is a helpful next step."
If no clear consensus exists, say: “There isn’t a single agreed-upon answer to this yet. You may want to consult a specialist at BCRC or your healthcare provider to discuss what fits best in your situation.”"""


In [None]:
# BCRC AI Assistant - Comprehensive Version with Pre-built Knowledge Base
# For use in Google Colab
# Uses Claude Sonnet 4 (latest model) with web search + pre-built knowledge base

import anthropic
import os
import json
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
from IPython.display import display, HTML, Markdown
from google.colab import userdata
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sentence_transformers import SentenceTransformer
from typing import Dict, List

# Install required packages only for loading saved data
# !pip install faiss-cpu

try:
    import faiss
    print("✅ Required packages loaded successfully")
except ImportError as e:
    print(f"❌ Missing packages. Please run: !pip install faiss-cpu")
    print(f"Error: {e}")

# --- KNOWLEDGE BASE CLASS ---
class KnowledgeBase:
    def __init__(self, kb_store_path: str = "/content/drive/MyDrive/BCRC/knowledge_base_store", model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the knowledge base by loading from pre-saved files
        and also loading the model required for searching.
        """
        self.kb_store_path = kb_store_path
        self.chunks = []
        self.embeddings = None
        self.index = None
        self.is_ready = False

        # --- FIX 1: Initialize the sentence model ---
        # The model is required to embed incoming search queries.
        print(f"🧠 Initializing sentence transformer model: {model_name}")
        self.model = SentenceTransformer(model_name)

        # Load the data from disk
        print(f"💾 Loading Knowledge Base data from: {kb_store_path}")
        self._load_from_disk()

    def _load_from_disk(self):
        """
        (CORRECTED)
        Load the pre-built knowledge base from saved files with correct filenames.
        """
        try:
            if not os.path.isdir(self.kb_store_path):
                print(f"❌ Knowledge base directory not found: {self.kb_store_path}")
                return

            # --- FIX 2: Correct filenames and loading method ---
            # Using .json for chunks as shown in your screenshot
            chunks_file = os.path.join(self.kb_store_path, "chunks.json")
            if os.path.exists(chunks_file):
                with open(chunks_file, 'r') as f:
                    self.chunks = json.load(f) # Use json.load for .json files
                print(f"✅ Loaded {len(self.chunks)} chunks")
            else:
                print(f"❌ Chunks file not found: {chunks_file}"); return

            # Embeddings file is correct
            embeddings_file = os.path.join(self.kb_store_path, "embeddings.npy")
            if os.path.exists(embeddings_file):
                self.embeddings = np.load(embeddings_file)
                print(f"✅ Loaded embeddings: {self.embeddings.shape}")
            else:
                print(f"❌ Embeddings file not found: {embeddings_file}"); return

            # Using .index for the FAISS index as shown in your screenshot
            index_file = os.path.join(self.kb_store_path, "kb.index")
            if os.path.exists(index_file):
                self.index = faiss.read_index(index_file)
                print(f"✅ Loaded FAISS index: {self.index.ntotal} vectors")
            else:
                print(f"❌ FAISS index file not found: {index_file}"); return

            self.is_ready = True
            print(f"🎯 Knowledge Base is ready!")

        except Exception as e:
            print(f"❌ Error during knowledge base loading: {e}")
            self.is_ready = False

    def search(self, query: str, top_k: int = 5, similarity_threshold: float = 0.3) -> List[Dict]:
        """
        (CORRECTED)
        Embeds the query and performs a true similarity search using FAISS.
        """
        if not self.is_ready:
            print("⚠️ Search attempted but KB is not ready.")
            return []

        try:
            # --- FIX 3: Embed the query and use the FAISS index ---
            query_embedding = self.model.encode([query])
            faiss.normalize_L2(query_embedding) # Normalize for cosine similarity

            # Search the index
            similarities, indices = self.index.search(query_embedding.astype('float32'), top_k)

            results = []
            for i, idx in enumerate(indices[0]):
                if idx != -1 and similarities[0][i] >= similarity_threshold:
                    chunk = self.chunks[idx]
                    results.append({
                        'rank': i + 1,
                        'similarity': float(similarities[0][i]),
                        'content': chunk.get('content', ''),
                        'chunk_id': chunk.get('id', idx),
                        'source': chunk.get('source', 'Unknown')
                    })
            return results

        except Exception as e:
            print(f"❌ Error during search: {e}")
            return []

    def get_stats(self) -> Dict:
        # This method is correct and remains unchanged.
        return {
            'total_chunks': len(self.chunks),
            'is_ready': self.is_ready,
            'embedding_dimension': self.embeddings.shape[1] if self.embeddings is not None else 0,
            'total_characters': sum(len(chunk.get('content', '')) for chunk in self.chunks)
        }

    def list_sources(self) -> List[str]:
        # This method is correct and remains unchanged.
        sources = set(chunk.get('source', 'Unknown') for chunk in self.chunks)
        return sorted(list(sources))

# --- AUTHENTICATION SETUP ---
api_key = userdata.get('ANTHROPIC_API_KEY')
client = anthropic.Anthropic(api_key=api_key)

# --- SYSTEM PROMPTS ---
EMPATHETIC_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center). Your primary goal is to provide accurate, helpful, and deeply empathetic information using verified sources.

**Your Tools and Process:**
You have access to two tools:
1. **web_search**: Search current information from trusted breast cancer websites
2. **search_kb**: Search the BCRC knowledge base for verified internal information

**Recommended Process:**
1. First use web_search to find current, authoritative information
2. Then use search_kb to verify/supplement with BCRC's internal knowledge base
3. Synthesize information from both sources for a comprehensive answer

**Your Persona and Instructions:**
- **Empathetic Tone:** Always maintain a conversational, supportive, and empathetic tone. Acknowledge the user's feelings and the difficulty of their situation.
- **Direct Answers:** Provide a direct and cohesive answer. Do NOT show your work or internal thought process. Do not say "I will search for..." or "Based on the search results...". Use your tools silently to gather information and then present the final, synthesized answer directly to the user.
- **Cite Sources:** If you use information from searches, include all source URLs in a "Sources" list at the very end of your response. Distinguish between web sources and knowledge base sources.

**Context: Top 10 Frequently Asked Questions by Caregivers**
Be aware of these common caregiver concerns. If a user's question is similar in context to one of these, use this understanding to provide a particularly relevant and compassionate response.

1. **Supporting a loved one:** How to provide emotional/practical support after a diagnosis.
2. **Post-surgery recovery:** What to expect and how to help after surgery.
3. **Caregiver's mental health:** How to cope with the emotional toll (stress, fear, anxiety).
4. **Managing treatment side effects:** Helping with side effects from chemo, hormone therapy, etc.
5. **Learning from other caregivers:** Seeking solidarity and coping strategies from peers.
6. **Relationship & intimacy changes:** Navigating changes in the relationship during/after cancer.
7. **Finding resources:** Locating support groups, forums, and materials for caregivers.
8. **Communicating the diagnosis:** How to tell family, friends, and children.
9. **Commemorating milestones:** Gift ideas for the end of treatment or cancerversaries.
10. **International treatment access:** Helping a loved one from another country get treatment in the U.S."""

TRANSPARENT_SYSTEM_PROMPT = """You are an AI assistant specialized in answering questions about breast cancer, built for BCRC (Breast Cancer Resource Center). Your primary goal is to provide accurate, helpful, and empathetic information using verified sources.

**Your Tools and Process:**
You have access to two tools:
1. **web_search**: Search current information from trusted breast cancer websites
2. **search_kb**: Search the BCRC knowledge base for verified internal information

**IMPORTANT: Show your search process transparently. For each search:**
1. Explain what you're searching for and why
2. The user will see the search query you use
3. After receiving results, explain what you found and synthesize the information
4. Use specific quotes from the sources with proper citations
5. Show how web sources and knowledge base sources complement each other

**Recommended Process:**
1. Start with web_search to find current, authoritative information
2. Then use search_kb to verify/supplement with BCRC's internal knowledge
3. Explain how both sources align or differ
4. Synthesize for a comprehensive answer

Guidelines:
- Maintain a conversational, empathetic tone.
- Avoid excessive medical jargon unless necessary.
- Be transparent about your search process for both web and knowledge base.
- If information is conflicting between sources, explain different perspectives.
- If you cannot find an answer, say: "I'm sorry, I don't have an answer to that question. It may be helpful to consult BCRC directly at 512-524-2560 / FAX: 512-717-7545."

Remember: Your users can see your entire search process, which helps build trust and understanding."""

class BCRCAssistant:
    def __init__(self, debug_mode: bool = True, kb_store_path: Optional[str] = None):
        self.client = client
        self.debug_mode = debug_mode
        self.conversation_history = []

        # Track tool usage during conversation
        self.executed_kb_searches = []
        self.executed_web_searches = []

        # Initialize Knowledge Base
        self.kb = None
        self.kb_ready = False

        # Default KB path
        if kb_store_path is None:
            kb_store_path = "/content/drive/MyDrive/BCRC/knowledge_base_store"

        # Try to load knowledge base
        if os.path.exists(kb_store_path):
            self.load_knowledge_base(kb_store_path)
        else:
            print(f"⚠️  Knowledge base not found at: {kb_store_path}")
            print("   Assistant will work with web search only.")

    def load_knowledge_base(self, kb_store_path: str):
        """Load the pre-built knowledge base from the specified path."""
        print(f"🔄 Loading knowledge base from: {kb_store_path}")

        try:
            self.kb = KnowledgeBase(kb_store_path)
            self.kb_ready = self.kb.is_ready

            if self.kb_ready:
                stats = self.kb.get_stats()
                print(f"✅ Knowledge base loaded successfully!")
                print(f"📊 KB Stats: {stats['total_chunks']} chunks, {stats['total_characters']:,} characters")

                # Show available sources
                sources = self.kb.list_sources()
                print(f"📚 Sources available: {len(sources)}")
                for source in sources[:5]:  # Show first 5 sources
                    print(f"   - {source}")
                if len(sources) > 5:
                    print(f"   ... and {len(sources) - 5} more")
            else:
                print("❌ Failed to load knowledge base")
        except Exception as e:
            print(f"❌ Error loading knowledge base: {e}")
            self.kb_ready = False

    def search_kb_tool(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """
        Knowledge base search tool that the LLM can use.
        """
        if not self.kb_ready:
            return {
                "error": "Knowledge base not available",
                "results": []
            }

        try:
            results = self.kb.search(query, top_k=top_k)

            return {
                "query": query,
                "results_found": len(results),
                "results": results,
                "kb_stats": self.kb.get_stats()
            }
        except Exception as e:
            return {
                "error": f"Search failed: {str(e)}",
                "results": []
            }

    def log_debug(self, message: str, data: Any = None):
        """Debug logging function"""
        if self.debug_mode:
            print(f"\n🔍 DEBUG: {message}")
            if data:
                if isinstance(data, dict) or isinstance(data, list):
                    print(json.dumps(data, indent=2, default=str))
                else:
                    print(str(data))
            print("-" * 50)

    def create_message_with_search(self, question: str, mode: str = "empathetic") -> Optional[anthropic.types.Message]:
        """
        Creates a message to the Anthropic API with both web search and KB search enabled.

        Args:
            question: The user's question
            mode: Either "empathetic" (hides process) or "transparent" (shows process)
        """
        system_prompt = EMPATHETIC_SYSTEM_PROMPT if mode == "empathetic" else TRANSPARENT_SYSTEM_PROMPT

        # Configure tools - always include web_search, add search_kb if available
        tools = [{
            "type": "web_search_20250305",
            "name": "web_search",
            "allowed_domains": [
                "bcrc.org",
                "cancer.gov",
                "nationalbreastcancer.org",
                "breastcancer.org"
            ],
            "max_uses": 5
        }]

        # Add KB tool if available
        if self.kb_ready:
            tools.append({
                "type": "custom",
                "name": "search_kb",
                "description": "Search the BCRC knowledge base for relevant information to verify or supplement web search results",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "Search query for the knowledge base"
                        },
                        "top_k": {
                            "type": "integer",
                            "description": "Number of results to return (default: 3)",
                            "default": 3
                        }
                    },
                    "required": ["query"]
                }
            })

        # Log the request being sent
        request_data = {
            "model": "claude-sonnet-4-20250514",
            "max_tokens": 4000,
            "system": system_prompt,
            "messages": [{"role": "user", "content": question}],
            "tools": tools
        }

        self.log_debug("API Request Details", {
            "model": request_data["model"],
            "system_prompt_length": len(system_prompt),
            "question": question,
            "mode": mode,
            "tools_configured": f"web_search + {'KB search' if self.kb_ready else 'no KB'}",
            "kb_status": "ready" if self.kb_ready else "not available"
        })

        try:
            print(f"🚀 Sending request to Claude Sonnet 4...")

            # Handle tool calls manually since we have a custom KB tool
            response = self._handle_tool_calls(request_data)

            # Log the final response structure
            if response:
                self.log_debug("Final API Response Structure", {
                    "response_id": response.id,
                    "model": response.model,
                    "usage": response.usage.__dict__ if response.usage else None,
                    "stop_reason": response.stop_reason,
                    "content_blocks_count": len(response.content),
                    "content_types": [block.type for block in response.content]
                })

            return response

        except Exception as e:
            print(f"❌ API Request Error: {str(e)}")
            self.log_debug("API Error Details", str(e))
            return None

    def _handle_tool_calls(self, request_data: Dict) -> Optional[anthropic.types.Message]:
        """
        Handle tool calls using Anthropic's client tool workflow.
        """
        messages = request_data["messages"].copy()

        while True:
            # Make API call
            response = self.client.messages.create(
                model=request_data["model"],
                max_tokens=request_data["max_tokens"],
                system=request_data["system"],
                messages=messages,
                tools=request_data["tools"]
            )

            self.log_debug("API Response Stop Reason", response.stop_reason)

            # Check if Claude wants to use tools
            if response.stop_reason == "tool_use":
                # Add Claude's response to conversation
                messages.append({
                    "role": "assistant",
                    "content": response.content
                })

                # Process each tool use in the response
                tool_results = []

                for content_block in response.content:
                    if content_block.type == "tool_use":
                        tool_name = content_block.name
                        tool_input = content_block.input
                        tool_use_id = content_block.id

                        self.log_debug(f"Tool Call: {tool_name}", {
                            "tool_use_id": tool_use_id,
                            "input": tool_input
                        })

                        if tool_name == "search_kb":
                            # Execute our custom KB search
                            kb_result = self.search_kb_tool(
                                query=tool_input.get("query", ""),
                                top_k=tool_input.get("top_k", 3)
                            )

                            # Track this search for analysis
                            self.executed_kb_searches.append(kb_result)

                            # Format result for Claude
                            tool_results.append({
                                "type": "tool_result",
                                "tool_use_id": tool_use_id,
                                "content": json.dumps(kb_result, indent=2)
                            })

                        # Note: web_search is handled automatically by Anthropic

                # Add tool results to conversation if we have any
                if tool_results:
                    messages.append({
                        "role": "user",
                        "content": tool_results
                    })

                    self.log_debug("Tool Results Added", f"Added {len(tool_results)} tool results")

                    # Continue the conversation with tool results
                    continue
                else:
                    # Only web search was used (handled by Anthropic), return response
                    return response
            else:
                # No tools used or conversation complete
                return response

    def analyze_response_content(self, response: anthropic.types.Message) -> Dict[str, Any]:
        """
        Deeply analyzes the response content to extract all information including KB searches.
        """
        analysis = {
            "text_blocks": [],
            "search_queries": [],
            "search_results": [],
            "kb_queries": [],
            "kb_results": [],
            "citations": [],
            "tool_uses": [],
            "errors": [],
            "sources_found": []
        }

        for i, block in enumerate(response.content):
            self.log_debug(f"Processing block {i}", {
                "type": block.type,
                "attributes": [attr for attr in dir(block) if not attr.startswith('_')]
            })

            if block.type == 'text':
                analysis["text_blocks"].append({
                    "index": i,
                    "text": block.text,
                    "length": len(block.text)
                })

                # Extract sources from text (if they're listed at the end)
                text = block.text
                if "Sources:" in text or "**Sources:**" in text:
                    # Extract URLs from the sources section
                    import re
                    urls = re.findall(r'https?://[^\s\)]+', text)
                    analysis["sources_found"].extend(urls)

            elif block.type == 'tool_use':
                tool_info = {
                    "index": i,
                    "tool_name": getattr(block, 'name', 'unknown'),
                    "tool_id": getattr(block, 'id', 'unknown'),
                    "input": getattr(block, 'input', {})
                }
                analysis["tool_uses"].append(tool_info)

                if getattr(block, 'name', '') == 'web_search':
                    query = block.input.get('query', str(block.input)) if hasattr(block, 'input') else 'unknown query'
                    analysis["search_queries"].append(query)
                elif getattr(block, 'name', '') == 'search_kb':
                    query = block.input.get('query', str(block.input)) if hasattr(block, 'input') else 'unknown query'
                    analysis["kb_queries"].append(query)

            # Handle different types of tool results
            elif hasattr(block, 'type'):
                # Check for web search results (handled by Anthropic)
                if hasattr(block, 'type') and 'web_search' in str(block.type):
                    # This is a web search result
                    search_result = {
                        "title": getattr(block, 'title', 'No title'),
                        "url": getattr(block, 'url', 'No URL'),
                        "snippet": getattr(block, 'snippet', 'No snippet'),
                        "page_age": getattr(block, 'page_age', 'Unknown age')
                    }
                    analysis["search_results"].append(search_result)

            # Handle other possible block types
            else:
                self.log_debug(f"Unknown block type: {block.type}", {
                    "block_content": str(block)[:200],
                    "block_attrs": [attr for attr in dir(block) if not attr.startswith('_')]
                })

        # Note: KB results are captured during tool execution and stored in self.executed_kb_searches
        # Add them to the analysis
        for kb_search in self.executed_kb_searches:
            analysis["kb_results"].append(kb_search)
            if 'query' in kb_search:
                analysis["kb_queries"].append(kb_search['query'])

        return analysis

    def create_search_quality_audit(self, analysis: Dict[str, Any], question: str) -> pd.DataFrame:
        """
        Creates a table to audit search quality - did we search the right sites and find relevant content?
        """
        search_audit_data = []

        # Get all sources (from search results, KB results, and from final response)
        all_sources = []

        # From web search results
        for result in analysis["search_results"]:
            all_sources.append({
                "url": result["url"],
                "title": result["title"],
                "snippet": result["snippet"],
                "source_type": "Web Search Result"
            })

        # From KB results
        for kb_result in analysis["kb_results"]:
            if 'results' in kb_result:
                for result in kb_result['results']:
                    all_sources.append({
                        "url": f"KB Chunk {result.get('chunk_id', 'Unknown')}",
                        "title": f"KB Content (similarity: {result.get('similarity', 0):.3f})",
                        "snippet": result.get('content', '')[:200],
                        "source_type": "Knowledge Base"
                    })

        # From sources listed in response
        for url in analysis["sources_found"]:
            # Avoid duplicates
            if not any(s["url"] == url for s in all_sources):
                all_sources.append({
                    "url": url,
                    "title": "Listed in Response",
                    "snippet": "Source cited in final response",
                    "source_type": "Response Citation"
                })

        for source in all_sources:
            # Determine if it's an allowed/trusted domain
            if source["source_type"] == "Knowledge Base":
                is_trusted = "✅ (Internal KB)"
                domain = "BCRC Knowledge Base"
            else:
                trusted_domains = ["bcrc.org", "cancer.gov", "nationalbreastcancer.org", "breastcancer.org"]
                is_trusted = "✅" if any(domain in source["url"] for domain in trusted_domains) else "❌"
                domain = source["url"].split("/")[2] if "/" in source["url"] else "Unknown"

            # Assess relevance based on title and snippet content
            question_keywords = question.lower().split()
            content_to_check = f"{source['title']} {source['snippet']}".lower()

            keyword_matches = sum(1 for keyword in question_keywords if keyword in content_to_check)
            relevance_score = f"{keyword_matches}/{len(question_keywords)}"

            # Determine relevance level
            if keyword_matches >= len(question_keywords) * 0.7:
                relevance = "High"
            elif keyword_matches >= len(question_keywords) * 0.4:
                relevance = "Medium"
            else:
                relevance = "Low"

            search_audit_data.append({
                "Source": source["url"],
                "Domain": domain,
                "Is Trusted": is_trusted,
                "Source Type": source["source_type"],
                "Title": source["title"][:50] + "..." if len(source["title"]) > 50 else source["title"],
                "Keyword Matches": relevance_score,
                "Relevance": relevance,
                "Contains Answer": "🔍 Manual Review Needed"  # This requires human judgment
            })

        return pd.DataFrame(search_audit_data) if search_audit_data else pd.DataFrame(columns=[
            "Source", "Domain", "Is Trusted", "Source Type", "Title", "Keyword Matches", "Relevance", "Contains Answer"
        ])

    def create_response_quality_audit(self, response: anthropic.types.Message, analysis: Dict[str, Any], question: str) -> pd.DataFrame:
        """
        Creates a table to audit response quality - did the LLM answer correctly and comprehensively?
        """
        # Extract the main response text
        response_text = ""
        for block in analysis["text_blocks"]:
            response_text += block["text"] + " "

        # Count sources
        web_sources = len(analysis["search_results"]) + len(analysis["sources_found"])
        kb_sources = sum(len(kb_result.get('results', [])) for kb_result in analysis["kb_results"])

        # Define quality criteria
        quality_criteria = [
            {
                "Criteria": "Answered the Question",
                "Description": "Response directly addresses what was asked",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check if the question was fully addressed"
            },
            {
                "Criteria": "Used Web Sources",
                "Description": "Information came from reliable web sources",
                "Assessment": "✅ Yes" if web_sources > 0 else "❌ No",
                "Notes": f"Found {web_sources} web sources"
            },
            {
                "Criteria": "Used Knowledge Base",
                "Description": "Verified information against internal KB",
                "Assessment": "✅ Yes" if kb_sources > 0 else "❌ No" if analysis["kb_queries"] else "⚠️ KB Not Available",
                "Notes": f"Found {kb_sources} KB sources from {len(analysis['kb_queries'])} queries"
            },
            {
                "Criteria": "Cross-Verification",
                "Description": "Used both web and KB sources for verification",
                "Assessment": "✅ Yes" if (web_sources > 0 and kb_sources > 0) else "⚠️ Partial" if (web_sources > 0 or kb_sources > 0) else "❌ No",
                "Notes": f"Web: {web_sources}, KB: {kb_sources}"
            },
            {
                "Criteria": "Provided Specific Information",
                "Description": "Gave concrete, actionable details",
                "Assessment": "✅ Yes" if len(response_text) > 500 else "⚠️ Limited",
                "Notes": f"Response length: {len(response_text)} characters"
            },
            {
                "Criteria": "Included Proper Citations",
                "Description": "Sources were properly referenced",
                "Assessment": "✅ Yes" if "Sources:" in response_text or analysis["sources_found"] else "❌ No",
                "Notes": f"Found {len(analysis['sources_found'])} cited sources"
            },
            {
                "Criteria": "Maintained Empathetic Tone",
                "Description": "Response was supportive and understanding",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check for empathetic language and supportive phrasing"
            },
            {
                "Criteria": "Avoided Medical Advice",
                "Description": "Didn't provide diagnosis or treatment advice",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Should provide information but direct to healthcare providers"
            },
            {
                "Criteria": "Accuracy Check",
                "Description": "Information is medically accurate",
                "Assessment": "🔍 Expert Review Required",
                "Notes": "Requires medical professional validation"
            },
            {
                "Criteria": "Completeness",
                "Description": "Covered all major aspects of the question",
                "Assessment": "🔍 Manual Review Needed",
                "Notes": "Check against comprehensive information sources"
            }
        ]

        return pd.DataFrame(quality_criteria)

    def display_detailed_analysis(self, response: anthropic.types.Message, question: str, mode: str):
        """
        Displays a comprehensive analysis of the response.
        """
        print("\n" + "="*100)
        print(f"🎯 BCRC AI ASSISTANT - DETAILED ANALYSIS")
        print(f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"🤖 Model: Claude Sonnet 4")
        print(f"🔧 Mode: {mode.title()}")
        print("="*100)

        # Question
        print(f"\n❓ **QUESTION:**")
        print(f"   {question}")

        # Usage statistics
        if response.usage:
            print(f"\n📊 **TOKEN USAGE:**")
            print(f"   Input tokens: {response.usage.input_tokens:,}")
            print(f"   Output tokens: {response.usage.output_tokens:,}")
            print(f"   Total tokens: {response.usage.input_tokens + response.usage.output_tokens:,}")

        # Detailed content analysis
        analysis = self.analyze_response_content(response)

        # Search queries performed
        if analysis["search_queries"]:
            print(f"\n🔍 **SEARCH QUERIES PERFORMED:**")
            for i, query in enumerate(analysis["search_queries"], 1):
                print(f"   {i}. \"{query}\"")

        # Search results found
        if analysis["search_results"]:
            print(f"\n📚 **SEARCH RESULTS FOUND:**")
            for i, result in enumerate(analysis["search_results"], 1):
                print(f"\n   Result {i}:")
                print(f"   📰 Title: {result['title']}")
                print(f"   🔗 URL: {result['url']}")
                print(f"   📄 Snippet: {result['snippet'][:150]}...")
                print(f"   📅 Page Age: {result['page_age']}")

        # Final assistant response
        final_text = ""
        for block in analysis["text_blocks"]:
            final_text += block["text"] + "\n"

        if final_text:
            print(f"\n🤖 **ASSISTANT'S FINAL RESPONSE:**")
            print("-" * 60)
            display(Markdown(final_text))
            print("-" * 60)

        # Quality Audit Tables
        print(f"\n📊 **SEARCH QUALITY AUDIT:**")
        print("(Evaluates: Did we search the right sites? Do they contain the answer?)")
        search_audit_df = self.create_search_quality_audit(analysis, question)
        if not search_audit_df.empty:
            display(HTML(search_audit_df.to_html(index=False, escape=False)))
        else:
            print("No search data available for audit")

        print(f"\n📋 **RESPONSE QUALITY AUDIT:**")
        print("(Evaluates: Did the LLM answer correctly and appropriately?)")
        response_audit_df = self.create_response_quality_audit(response, analysis, question)
        display(HTML(response_audit_df.to_html(index=False, escape=False)))

        # Raw response data (if debug mode)
        if self.debug_mode:
            print(f"\n🔬 **RAW RESPONSE DATA:**")
            self.log_debug("Complete Response Analysis", analysis)

    def save_comprehensive_report(self, response: anthropic.types.Message, question: str, mode: str) -> str:
        """
        Saves a comprehensive markdown report with all details.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        safe_question = re.sub(r'[^\w\s-]', '', question).strip()
        safe_question = re.sub(r'[-\s]+', '_', safe_question)[:50]
        filename = f"BCRC_Comprehensive_Report_{safe_question}_{timestamp}.md"

        analysis = self.analyze_response_content(response)

        # Build markdown content
        markdown_content = [
            f"# BCRC AI Assistant - Comprehensive Report\n",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  ",
            f"**Model:** Claude Sonnet 4  ",
            f"**Mode:** {mode.title()}  \n",
            f"## Question\n{question}\n",
        ]

        # Usage statistics
        if response.usage:
            markdown_content.extend([
                f"## Token Usage\n",
                f"- **Input tokens:** {response.usage.input_tokens:,}",
                f"- **Output tokens:** {response.usage.output_tokens:,}",
                f"- **Total tokens:** {response.usage.input_tokens + response.usage.output_tokens:,}\n"
            ])

        # Search process
        if analysis["search_queries"] or analysis["kb_queries"]:
            markdown_content.append("## Search Process\n")

            if analysis["search_queries"]:
                markdown_content.append("### Web Searches\n")
                for i, query in enumerate(analysis["search_queries"], 1):
                    markdown_content.append(f"**Search {i}:** `{query}`\n")

            if analysis["kb_queries"]:
                markdown_content.append("### Knowledge Base Searches\n")
                for i, query in enumerate(analysis["kb_queries"], 1):
                    markdown_content.append(f"**KB Search {i}:** `{query}`\n")

        # Search results
        if analysis["search_results"]:
            markdown_content.append("## Web Search Results\n")
            for i, result in enumerate(analysis["search_results"], 1):
                markdown_content.extend([
                    f"### Web Result {i}\n",
                    f"- **Title:** {result['title']}",
                    f"- **URL:** [{result['url']}]({result['url']})",
                    f"- **Snippet:** {result['snippet']}",
                    f"- **Page Age:** {result['page_age']}\n"
                ])

        # KB results
        if analysis["kb_results"]:
            markdown_content.append("## Knowledge Base Results\n")
            for i, kb_result in enumerate(analysis["kb_results"], 1):
                markdown_content.extend([
                    f"### KB Search {i}\n",
                    f"- **Query:** `{kb_result.get('query', 'Unknown')}`",
                    f"- **Results Found:** {kb_result.get('results_found', 0)}\n"
                ])

                if 'results' in kb_result and kb_result['results']:
                    for j, result in enumerate(kb_result['results'][:3], 1):
                        similarity = result.get('similarity', 0)
                        content = result.get('content', '')[:300] + "..."
                        markdown_content.extend([
                            f"#### KB Result {j} (Similarity: {similarity:.3f})\n",
                            content + "\n"
                        ])

        # Quality audit tables
        search_audit_df = self.create_search_quality_audit(analysis, question)
        response_audit_df = self.create_response_quality_audit(response, analysis, question)

        if not search_audit_df.empty:
            markdown_content.extend([
                "## Search Quality Audit\n",
                "*Evaluates: Did we search the right sites? Do they contain the answer?*\n",
                search_audit_df.to_markdown(index=False) + "\n"
            ])

        markdown_content.extend([
            "## Response Quality Audit\n",
            "*Evaluates: Did the LLM answer correctly and appropriately?*\n",
            response_audit_df.to_markdown(index=False) + "\n"
        ])

        # Final response
        final_text = ""
        for block in analysis["text_blocks"]:
            final_text += block["text"] + "\n"

        if final_text:
            markdown_content.extend([
                "## Assistant's Response\n",
                final_text
            ])

        # Save file
        full_content = "\n".join(markdown_content)
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(full_content)
            print(f"✅ Comprehensive report saved: {filename}")
            return filename
        except Exception as e:
            print(f"❌ Error saving report: {e}")
            return None

    def ask_question(self, question: str, mode: str = "empathetic", save_report: bool = True) -> Optional[anthropic.types.Message]:
        """
        Main method to ask a question with full analysis.

        Args:
            question: The user's question
            mode: "empathetic" (direct answers) or "transparent" (shows process)
            save_report: Whether to save a comprehensive report
        """
        # Reset tracking for this question
        self.executed_kb_searches = []
        self.executed_web_searches = []

        print(f"🎯 Processing question in {mode} mode...")

        # Get response from Claude
        response = self.create_message_with_search(question, mode)

        if response:
            # Display detailed analysis
            self.display_detailed_analysis(response, question, mode)

            # Save comprehensive report
            if save_report:
                self.save_comprehensive_report(response, question, mode)

            return response

        return None

# --- CONVENIENCE FUNCTIONS ---

def check_knowledge_base(kb_store_path: str = "/content/drive/MyDrive/BCRC/knowledge_base_store") -> bool:
    """Check if knowledge base exists and show stats"""
    if not os.path.exists(kb_store_path):
        print(f"❌ Knowledge base not found at: {kb_store_path}")
        return False

    try:
        kb = KnowledgeBase(kb_store_path)
        if kb.is_ready:
            stats = kb.get_stats()
            sources = kb.list_sources()
            print(f"✅ Knowledge Base Status:")
            print(f"   📊 {stats['total_chunks']} chunks, {stats['total_characters']:,} characters")
            print(f"   📚 {len(sources)} sources available")
            print(f"   📁 Path: {kb_store_path}")
            return True
        else:
            print(f"❌ Knowledge base found but failed to load")
            return False
    except Exception as e:
        print(f"❌ Error checking knowledge base: {e}")
        return False

def ask_empathetic(question: str, debug: bool = True, kb_store_path: Optional[str] = None) -> Optional[anthropic.types.Message]:
    """Ask a question in empathetic mode (hides search process)"""
    assistant = BCRCAssistant(debug_mode=debug, kb_store_path=kb_store_path)
    return assistant.ask_question(question, mode="empathetic")

def ask_transparent(question: str, debug: bool = True, kb_store_path: Optional[str] = None) -> Optional[anthropic.types.Message]:
    """Ask a question in transparent mode (shows search process)"""
    assistant = BCRCAssistant(debug_mode=debug, kb_store_path=kb_store_path)
    return assistant.ask_question(question, mode="transparent")

def interactive_session(kb_store_path: Optional[str] = None):
    """Start an interactive session with mode selection and optional KB"""
    assistant = BCRCAssistant(debug_mode=True, kb_store_path=kb_store_path)

    print("🎯 BCRC AI Assistant - Interactive Session")
    print("Available modes:")
    print("  1. Empathetic (direct, compassionate answers)")
    print("  2. Transparent (shows search process)")

    if assistant.kb_ready:
        kb_stats = assistant.kb.get_stats()
        print(f"🧠 Knowledge Base: ✅ Ready ({kb_stats['total_chunks']} chunks)")
    else:
        print("🧠 Knowledge Base: ❌ Not available")

    while True:
        print("\n" + "="*80)
        question = input("Ask a breast cancer related question (or 'quit' to exit): ")
        if question.lower() == 'quit':
            print("👋 Exiting interactive session.")
            break

        mode_choice = input("Choose mode (1=empathetic, 2=transparent, default=1): ").strip()
        mode = "transparent" if mode_choice == "2" else "empathetic"

        assistant.ask_question(question, mode=mode, save_report=True)

# --- EXAMPLE USAGE ---
if __name__ == "__main__":
    print("🚀 BCRC AI Assistant initialized with Claude Sonnet 4")
    print("📋 Available functions:")
    print("  - check_knowledge_base()")
    print("  - ask_empathetic(question)")
    print("  - ask_transparent(question)")
    print("  - interactive_session()")

    # Check if default KB exists
    default_kb_path = "/content/drive/MyDrive/BCRC/knowledge_base_store"
    print(f"\n🔍 Checking default knowledge base...")
    kb_available = check_knowledge_base(default_kb_path)

    # Example usage
    sample_question = "My wife is starting her first cycle of chemotherapy for breast cancer next week. I've read about the common side effects like nausea and fatigue, but what are the less obvious, more urgent symptoms I should be watching for? When should I be worried enough to call her oncology team immediately or take her to the emergency room?"
    print(f"\n🧪 Testing with sample question: '{sample_question}'")

    if kb_available:
        print(f"\n💡 Knowledge base found! Testing with full capabilities...")
        print("\n" + "="*50)
        print("TESTING EMPATHETIC MODE (WITH KB)")
        print("="*50)
        ask_empathetic(sample_question)
    else:
        print(f"\n⚠️  No knowledge base found. Testing with web search only...")
        print("\n" + "="*50)
        print("TESTING EMPATHETIC MODE (WEB ONLY)")
        print("="*50)
        ask_empathetic(sample_question, kb_store_path=None)

    print(f"\n💡 To use knowledge base, ensure it exists at:")
    print(f"   {default_kb_path}")
    print(f"\n💡 To start interactive session, run: interactive_session()")

✅ Required packages loaded successfully
🚀 BCRC AI Assistant initialized with Claude Sonnet 4
📋 Available functions:
  - check_knowledge_base()
  - ask_empathetic(question)
  - ask_transparent(question)
  - interactive_session()

🔍 Checking default knowledge base...
🧠 Initializing sentence transformer model: all-MiniLM-L6-v2
💾 Loading Knowledge Base data from: /content/drive/MyDrive/BCRC/knowledge_base_store
✅ Loaded 113 chunks
✅ Loaded embeddings: (113, 384)
✅ Loaded FAISS index: 113 vectors
🎯 Knowledge Base is ready!
✅ Knowledge Base Status:
   📊 113 chunks, 51,498 characters
   📚 1 sources available
   📁 Path: /content/drive/MyDrive/BCRC/knowledge_base_store

🧪 Testing with sample question: 'My wife is starting her first cycle of chemotherapy for breast cancer next week. I've read about the common side effects like nausea and fatigue, but what are the less obvious, more urgent symptoms I should be watching for? When should I be worried enough to call her oncology team immediately or 

Todo: plug in the md file, check if embeddings is happening right or not, and if llm can access the kb or not, then run, question is already set, verify the response and then report to Dr.Lond/Yuan.


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
# STEP 1: MOUNT GOOGLE DRIVE
# -----------------------------------------------------------------------
from google.colab import drive
import os

print("\n🔗 Mounting Google Drive...")
try:
    drive.mount('/content/drive')
    print("✅ Google Drive mounted successfully.")
except Exception as e:
    print(f"❌ Error mounting Google Drive: {e}")


# STEP 2: UPDATED KNOWLEDGE BASE CLASS WITH SAVE/LOAD
# -----------------------------------------------------------------------
import re
import warnings
import json
from typing import List, Dict, Any

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from IPython.display import display, HTML

warnings.filterwarnings('ignore')

class KnowledgeBase:
    """
    An updated KnowledgeBase class with improved chunking and the
    ability to save/load the processed state from disk.
    """
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        print("🧠 Initializing Knowledge Base with model:", model_name)
        self.model = SentenceTransformer(model_name)
        self.chunks = []
        self.embeddings = None
        self.index = None
        self.is_ready = False

    def _split_markdown_into_chunks(self, content: str, chunk_size: int = 350) -> List[str]:
        """
        A more advanced chunking strategy.
        1. Splits by headers of all levels.
        2. Splits by list items.
        3. Falls back to paragraph splitting for oversized chunks.
        """
        all_chunks = []
        # Split by headers (e.g., #, ##, ###) to respect document structure
        # The regex splits the text *before* each header
        header_splits = re.split(r'\n(?=^#{1,3}\s)', content)

        for section in header_splits:
            if not section.strip():
                continue

            # Further split sections by bullet points (a common way to list distinct ideas)
            bullet_splits = re.split(r'\n\s*(?=\*|\-|\d+\.)\s', section)

            for sub_section in bullet_splits:
                if len(sub_section) > chunk_size:
                    # If a chunk is still too big, split by paragraphs
                    paragraphs = sub_section.split('\n\n')
                    for p in paragraphs:
                        if p.strip():
                            all_chunks.append(p.strip())
                elif sub_section.strip():
                    all_chunks.append(sub_section.strip())

        return all_chunks

    def load_and_process_markdown(self, file_path: str, chunk_size: int = 350):
        print(f"\n📖 Loading and processing markdown file: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            chunk_texts = self._split_markdown_into_chunks(content, chunk_size)

            self.chunks = [{
                'id': i,
                'content': text,
                'length': len(text),
                'source': file_path
            } for i, text in enumerate(chunk_texts)]

            print(f"✅ Created {len(self.chunks)} smaller, topic-focused chunks.")
            self._create_embeddings_and_index()
            return len(self.chunks)

        except FileNotFoundError:
            print(f"❌ ERROR: File not found at '{file_path}'.")
            return 0
        except Exception as e:
            print(f"❌ An error occurred during processing: {e}")
            return 0

    def _create_embeddings_and_index(self):
        if not self.chunks:
            print("❌ No chunks available to embed.")
            return

        print("🔮 Creating embeddings and FAISS index...")
        chunk_texts = [chunk['content'] for chunk in self.chunks]
        self.embeddings = self.model.encode(chunk_texts, show_progress_bar=True)

        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)

        normalized_embeddings = self.embeddings.copy()
        faiss.normalize_L2(normalized_embeddings)
        self.index.add(normalized_embeddings.astype('float32'))

        self.is_ready = True
        print("✅ Embeddings and index created successfully.")

    def save_to_disk(self, save_path: str):
        """Saves the entire knowledge base state to a directory."""
        if not self.is_ready:
            print("❌ Knowledge base is not ready. Nothing to save.")
            return

        print(f"\n💾 Saving knowledge base to: {save_path}")
        os.makedirs(save_path, exist_ok=True)

        # 1. Save chunks
        with open(os.path.join(save_path, 'chunks.json'), 'w') as f:
            json.dump(self.chunks, f)

        # 2. Save embeddings
        np.save(os.path.join(save_path, 'embeddings.npy'), self.embeddings)

        # 3. Save FAISS index
        faiss.write_index(self.index, os.path.join(save_path, 'kb.index'))

        print("✅ Knowledge base saved successfully.")

    def load_from_disk(self, load_path: str) -> bool:
        """Loads a pre-processed knowledge base from a directory."""
        print(f"\n💿 Attempting to load knowledge base from: {load_path}")
        try:
            # 1. Load chunks
            with open(os.path.join(load_path, 'chunks.json'), 'r') as f:
                self.chunks = json.load(f)

            # 2. Load embeddings
            self.embeddings = np.load(os.path.join(load_path, 'embeddings.npy'))

            # 3. Load FAISS index
            self.index = faiss.read_index(os.path.join(load_path, 'kb.index'))

            self.is_ready = True
            print(f"✅ Knowledge base loaded successfully with {len(self.chunks)} chunks.")
            return True
        except Exception as e:
            print(f"❌ Failed to load knowledge base: {e}")
            self.is_ready = False
            return False

    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        if not self.is_ready:
            print("❌ Knowledge base is not ready for search.")
            return []

        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)

        similarities, indices = self.index.search(query_embedding.astype('float32'), top_k)

        return [{
            'rank': i + 1,
            'similarity': float(similarities[0][i]),
            'content': self.chunks[idx]['content']
        } for i, idx in enumerate(indices[0]) if idx != -1]

# --- Configuration ---
KB_FILE_PATH = "/content/drive/MyDrive/BCRC/Breast Cancer Chemotherapy Knowledge Base_.md"
KB_SAVE_PATH = "/content/drive/MyDrive/BCRC/knowledge_base_store"


# =======================================================================
# PART 1: BUILD AND SAVE WORKFLOW (RUN THIS ONCE)
# =======================================================================
print("\n" + "#"*70)
print("### PART 1: BUILD, VERIFY, AND SAVE KNOWLEDGE BASE ###")
print("#"*70)

# 1. Initialize a new Knowledge Base
kb_builder = KnowledgeBase()

# 2. Load and process the source file with a smaller chunk size
# This also creates the embeddings and index in memory.
num_chunks = kb_builder.load_and_process_markdown(KB_FILE_PATH, chunk_size=350)

# 3. Verify the new, smaller chunks
if num_chunks > 0:
    print("\n📋 Verifying the new chunking strategy (showing first 15 chunks):")
    chunks_df = pd.DataFrame(kb_builder.chunks)
    chunks_df['content_preview'] = chunks_df['content'].str.slice(0, 150) + '...'
    display(HTML(chunks_df[['id', 'length', 'content_preview']].head(15).to_html(index=False)))

    # 4. Save the processed knowledge base to Google Drive
    kb_builder.save_to_disk(KB_SAVE_PATH)


# =======================================================================
# PART 2: LOAD AND USE WORKFLOW (USE THIS IN YOUR MAIN SCRIPT)
# =======================================================================
print("\n" + "#"*70)
print("### PART 2: LOAD PRE-BUILT KNOWLEDGE BASE AND SEARCH ###")
print("#"*70)

# 1. Initialize a new KB object
# Note: The model name should match the one used for building
kb_user = KnowledgeBase()

# 2. Load the entire pre-processed state from disk
is_loaded = kb_user.load_from_disk(KB_SAVE_PATH)

# 3. If loaded successfully, perform a search immediately
if is_loaded:
    print("\n⚡ Performing search with the loaded knowledge base...")
    test_query = "What is Taxol and what are its side effects?"
    search_results = kb_user.search(test_query, top_k=5)

    if search_results:
        print(f"\n🔍 Search Results for: '{test_query}'")
        results_df = pd.DataFrame(search_results)
        results_df['content'] = results_df['content'].str.wrap(100) # Wrap text for readability
        display(HTML(results_df.to_html(index=False)))


🔗 Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully.

######################################################################
### PART 1: BUILD, VERIFY, AND SAVE KNOWLEDGE BASE ###
######################################################################
🧠 Initializing Knowledge Base with model: all-MiniLM-L6-v2

📖 Loading and processing markdown file: /content/drive/MyDrive/BCRC/Breast Cancer Chemotherapy Knowledge Base_.md
✅ Created 113 smaller, topic-focused chunks.
🔮 Creating embeddings and FAISS index...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Embeddings and index created successfully.

📋 Verifying the new chunking strategy (showing first 15 chunks):


id,length,content_preview
0,70,# **A Comprehensive Knowledge Base on Chemotherapy for Breast Cancer**...
1,64,## **Section 1: Fundamentals of Chemotherapy for Breast Cancer**...
2,413,"This section establishes the foundational principles of chemotherapy as applied to breast cancer. It defines the treatment, outlines its objectives, a..."
3,73,### **1.1 What is Chemotherapy? A Systemic Approach to Cancer Treatment**...
4,733,"Chemotherapy is a form of medical treatment that employs cytotoxic drugs, meaning substances that are toxic to cells, to either destroy cancer cells o..."
5,479,A defining feature of chemotherapy is its systemic nature. The drugs are most often administered intravenously (into a vein) or orally (by mouth). Onc...
6,78,### **1.2 The Goals of Chemotherapy: From Curative Intent to Disease Control**...
7,173,"The application of chemotherapy in breast cancer is guided by two principal objectives, which are determined by the stage of the disease and the overa..."
8,731,"1. **To Treat Cancer:** This is the primary goal and can be further broken down based on the clinical context. In early-stage breast cancer, chemother..."
9,300,"The specific goal of chemotherapy is therefore highly contextual. For a patient with early-stage disease, it is part of an aggressive, finite campaign..."



💾 Saving knowledge base to: /content/drive/MyDrive/BCRC/knowledge_base_store
✅ Knowledge base saved successfully.

######################################################################
### PART 2: LOAD PRE-BUILT KNOWLEDGE BASE AND SEARCH ###
######################################################################
🧠 Initializing Knowledge Base with model: all-MiniLM-L6-v2

💿 Attempting to load knowledge base from: /content/drive/MyDrive/BCRC/knowledge_base_store
✅ Knowledge base loaded successfully with 113 chunks.

⚡ Performing search with the loaded knowledge base...

🔍 Search Results for: 'What is Taxol and what are its side effects?'


rank,similarity,content
1,0.459391,"### **5.1 Common Physical Side Effects: Hematologic, Gastrointestinal, and Dermatologic**"
2,0.44686,"* **Peripheral Neuropathy:** Certain chemotherapy drugs, most notably the taxanes (paclitaxel and\ndocetaxel), can cause damage to the peripheral nerves, which are the nerves outside of the brain and\nspinal cord.1 This results in symptoms of tingling, numbness, burning, or pain, typically in a\n""stocking-glove"" distribution affecting the hands and feet.1 While these symptoms often improve or\nresolve after treatment ends, a significant percentage of patients experience long-term, chronic\nneuropathy. One study found that two years after starting treatment, over 40% of women still had\nsymptoms, with 10% rating them as severe. Chronic neuropathy can substantially impair quality of\nlife and increase the risk of falls due to loss of sensation in the feet.14 Risk factors for\ndeveloping persistent neuropathy include older age, being overweight or obese, and having pre-\nexisting neuropathy from conditions like diabetes.14 * **Cancer-Related Cognitive Dysfunction\n(""Chemobrain""):** Many patients report experiencing cognitive changes, often referred to as\n""chemobrain"" or ""chemo fog."" This is characterized by difficulties with short-term memory,\nconcentration, word-finding, and multitasking.17 Patients may describe it as a feeling of mental\ncloudiness. Research shows that these cognitive issues are real and can be measured, with studies\nindicating changes in the function and structure of the brain's frontal lobe, the region responsible\nfor executive function and complex thought.18 While the term ""chemobrain"" points to chemotherapy,\nthe causes are likely multifactorial. The cancer itself, the psychological stress of diagnosis,\nsurgery, and other therapies can all contribute. Symptoms can appear before treatment even begins,\npersist during therapy, and for some, last for months or even years after treatment is complete.18"
3,0.430653,"* **Taxanes:** This class of drugs works by interfering with the cell's internal skeleton,\npreventing cancer cells from dividing. * Paclitaxel (brand name: Taxol) * Paclitaxel\nAlbumin-stabilized Nanoparticle Formulation (brand name: Abraxane) * Docetaxel (brand name:\nTaxotere) * **Anthracyclines:** These drugs damage the DNA of cancer cells, preventing them from\nreplicating. * Doxorubicin Hydrochloride (brand name: Adriamycin), often referred to as ""the Red\nDevil"" due to its color and potent effects. * Epirubicin Hydrochloride (brand name: Ellence) *\n**Alkylating Agents:** These agents directly damage DNA to stop cancer cells from making copies of\nthemselves. * Cyclophosphamide (brand name: Cytoxan) * Thiotepa (brand name: Tepadina) *\n**Antimetabolites:** These drugs mimic the normal building blocks of DNA and RNA, disrupting the\nprocess of cell replication. * Capecitabine (brand name: Xeloda), an oral chemotherapy. *\nFluorouracil (commonly known as 5-FU) * Gemcitabine Hydrochloride (brand name: Gemzar) *\n**Other Microtubule Inhibitors:** * Eribulin Mesylate (brand name: Halaven) * Ixabepilone\n(brand name: Ixempra) * Vinblastine Sulfate"
4,0.421396,"| Side Effect | Clinical Description | Management Strategies (Medical) | Management Strategies\n(Lifestyle/Supportive) | Source(s) | | :---- | :---- | :---- | :---- | :---- | | **Neutropenia** |\nLow white blood cell count leading to high infection risk. | Prophylactic G-CSF injections (e.g.,\nNeulasta); dose reduction or delay of chemo; antibiotics for fever. | Meticulous hygiene; avoiding\ncrowds and sick individuals; daily temperature monitoring; eating well-cooked foods. | 1 | |\n**Nausea & Vomiting** | Stimulation of the brain's vomiting center and irritation of the stomach\nlining. | Proactive anti-emetic medications (e.g., ondansetron, aprepitant) given before and after\nchemo. | Eating small, frequent, bland meals (crackers, toast); staying hydrated; avoiding strong\nodors; trying peppermint or ginger. | 17 | | **Fatigue** | Profound physical, emotional, and mental\nexhaustion. | Treatment of underlying causes like anemia; sometimes stimulant medications are\nconsidered. | Balancing gentle exercise (like walking) with periods of rest; prioritizing sleep;\nasking for help with daily tasks. | 17 | | **Peripheral Neuropathy** | Nerve damage causing\nnumbness, tingling, or pain in hands and feet. | Dose modification of the offending drug; pain\nmedications (e.g., duloxetine, gabapentin). | Protecting hands and feet from injury; wearing\nsupportive footwear; gentle exercise to improve balance; avoiding extreme temperatures. | 1 | |\n**""Chemobrain""** | Cognitive deficits in memory, focus, and multitasking. | No standard drug\ntreatment; management of contributing factors like fatigue, anxiety, or sleep problems. | Using\nplanners and calendars; reducing multitasking; brain exercises (puzzles); physical activity; asking\nfor feedback in conversations. | 17 | | **Mouth Sores** | Inflammation and ulceration of the mucosal\nlining of the mouth. | Prescription ""magic mouthwash"" containing anesthetic, antacid, and other\nagents; pain medication. | Meticulous oral hygiene with a soft brush; avoiding spicy, acidic, or\nrough foods; eating soft, cool foods; sucking on ice chips during infusion. | 1 | | **Hair Loss** |\nDamage to rapidly dividing cells in hair follicles. | Scalp cooling caps may help reduce hair loss\nfor some patients with certain chemo regimens. | Preparing emotionally; considering cutting hair\nshort or shaving head proactively; exploring wigs, scarves, and hats. | 1 |"
5,0.408625,"Chemotherapy is a form of medical treatment that employs cytotoxic drugs, meaning substances that\nare toxic to cells, to either destroy cancer cells or impede their growth and division.1 The\nfundamental principle behind chemotherapy lies in its targeting of rapidly dividing cells, a\nhallmark characteristic of cancer. However, this mechanism is not exclusively selective for\nmalignant cells. Several types of healthy cells in the body—such as those in the bone marrow\nresponsible for producing blood cells, in the hair follicles, and in the lining of the\ngastrointestinal tract—also divide rapidly. The action of chemotherapy drugs on these healthy\ntissues is what leads to the treatment's most common and challenging side effects.1"
