Project Phase 1: Stepwise API Exploration

Step 1: Import Libraries


In [None]:
!pip install requests pandas
!pip install faiss-cpu sentence-transformers numpy pandas

import requests
import pandas as pd
import json


In [None]:
# Mount Google Drive at /content/drive; the dataset path and the result files
# used by the cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

1. Load and Filter to 5K Diabetes Records

In [None]:
# ============================================================================
# COMPLETE RAG SYSTEM FOR CLINICAL TRIALS - DIABETES SUBSET (5K)
# Final Version with Visualizations
# ============================================================================

# SECTION 1: Import All Libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# SECTION 2: Load Data
# Source file and subset size are named once here so they can be changed in a
# single place.
DATA_PATH = '/content/drive/MyDrive/Sem 1/LLM/Project/data/clinical_trials_diabetes_full.csv'
N_RECORDS = 5000

print("="*80)
print("üìÅ LOADING DATA")
print("="*80)
df_diabetes = pd.read_csv(DATA_PATH)
# Only the first N_RECORDS rows are used by the rest of the notebook.
df_test = df_diabetes.head(N_RECORDS)

# The chunking loop indexes these columns directly, so fail here with a clear
# message rather than with a KeyError in the middle of that loop.
missing_columns = {'nct_id', 'brief_title', 'conditions'} - set(df_test.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

print(f"‚úÖ Loaded {len(df_test)} diabetes trial records")
print(f"Columns: {list(df_test.columns)}")


In [None]:
# Bare last expression: the notebook shows the column index as the cell output.
df_test.columns

In [None]:
# Bare last expression so the first 10 rows render as a table; print() would
# flatten the frame to wrapped plain text.
df_test.head(10)

In [None]:
1

In [None]:
# SECTION 3: Chunk Dataset
print("\n" + "="*80)
print("üî™ CHUNKING DATA")
print("="*80)

# One entry per text column that becomes a chunk:
# (column name, length the stripped text must exceed, prefix placed before the text)
CHUNK_FIELDS = (
    ('brief_summary', 50, ''),
    ('interventions', 20, 'Interventions: '),
)

chunks = []      # texts handed to the embedding model
chunk_map = []   # chunk_map[i] holds the metadata of chunks[i]

for idx, row in df_test.iterrows():
    for field_name, min_length, prefix in CHUNK_FIELDS:
        # str() also turns a missing value into text; presumably that is the
        # short string 'nan', which the length check below discards.
        field_text = str(row.get(field_name, '')).strip()
        if len(field_text) <= min_length:
            continue

        chunk_text = f"{prefix}{field_text}"
        chunks.append(chunk_text)
        chunk_map.append({
            'doc_idx': idx,
            'field': field_name,
            'chunk': chunk_text,
            'nct_id': row['nct_id'],
            'title': row['brief_title'],
            'conditions': row['conditions'],
            'status': row.get('status', 'UNKNOWN')
        })

print(f"‚úÖ Created {len(chunks)} complete chunks")

# SECTION 4: Embed and Index
print("\n" + "="*80)
print("üî¢ EMBEDDING & INDEXING")
print("="*80)

# With no chunks the embedding matrix has no second dimension and the index
# cannot be sized, so stop with a readable message instead.
assert chunks, "no chunks were produced - check the input columns and length thresholds"

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
print(f"‚úÖ Embeddings shape: {embeddings.shape}")

faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
# FAISS expects a C-contiguous float32 matrix; convert explicitly rather than
# relying on the encoder's output dtype.
faiss_index.add(np.ascontiguousarray(embeddings, dtype='float32'))
# Search results are positions in the index and are looked up in chunk_map,
# so both must have the same length.
assert faiss_index.ntotal == len(chunk_map), "index rows and chunk_map entries are out of sync"
print(f"‚úÖ FAISS index ready with {faiss_index.ntotal} chunks")

# SECTION 5: Single Query Demo
print("\n" + "="*80)
print("üîç SINGLE QUERY DEMO")
print("="*80)
query = "What are new treatments for type 2 diabetes?"
query_embedding = embedding_model.encode([query])
k = 5
D, I = faiss_index.search(query_embedding, k)

# FAISS fills unused result slots with index -1 when it holds fewer than k
# vectors. chunk_map[-1] would silently return the last chunk, so those slots
# are dropped before any lookup.
demo_hits = [
    (int(chunk_idx), float(distance))
    for chunk_idx, distance in zip(I[0], D[0])
    if chunk_idx >= 0
]

print(f"Query: {query}\n")
print("üìã RETRIEVED CLINICAL TRIAL EVIDENCE:\n")

single_query_results = []
for rank, (chunk_idx, distance) in enumerate(demo_hits, 1):
    info = chunk_map[chunk_idx]
    print(f"{rank}. **{info['title']}** (NCT: {info['nct_id']})")
    print(f"   üìÑ {info['chunk'][:300].strip()}...")
    print()

    single_query_results.append({
        'query': query,
        'nct_id': info['nct_id'],
        'title': info['title'],
        # Distance reported by the IndexFlatL2 search: smaller means closer.
        'relevance_score': distance,
        'evidence': info['chunk'][:500]
    })

print("="*80)
print("üí° SYNTHESIZED ANSWER:")
print("="*80)
# The summary lists what was actually retrieved. Fixed treatment descriptions
# would be attached to whichever NCT IDs the search happened to return.
print("Trials most similar to the query (closest first):")
for chunk_idx, _ in demo_hits[:3]:
    info = chunk_map[chunk_idx]
    print(f"‚Ä¢ {info['title']} ({info['nct_id']})")
print("\nRanking reflects embedding similarity only; review each trial before citing it.")
print("="*80)

# Save single query results
pd.DataFrame(single_query_results).to_csv('/content/drive/MyDrive/rag_demo_results.csv', index=False)

# SECTION 6: Multiple Query Tests
print("\n" + "="*80)
print("üî¨ RUNNING MULTIPLE QUERY TESTS")
print("="*80)

queries = [
    "What are the eligibility criteria for diabetes clinical trials?",
    "Which trials study insulin treatments?",
    "What are the primary outcomes measured in diabetes research?"
]

all_results = []   # one row per (query, retrieved chunk)
query_log = []     # one row per query

for query_idx, query in enumerate(queries, 1):
    print(f"\n{'='*80}")
    print(f"üîç QUERY {query_idx}: {query}")
    print("="*80)

    query_embedding = embedding_model.encode([query])
    D, I = faiss_index.search(query_embedding, k)

    # FAISS marks unused result slots with index -1; keep only real hits so
    # chunk_map is never indexed with -1 (which would return its last entry).
    valid = I[0] >= 0
    hit_ids = I[0][valid]
    hit_distances = D[0][valid]
    hit_nct_ids = [chunk_map[chunk_idx]['nct_id'] for chunk_idx in hit_ids]

    print("\nüìã RETRIEVED EVIDENCE:\n")
    for rank, (chunk_idx, distance) in enumerate(zip(hit_ids, hit_distances), 1):
        info = chunk_map[chunk_idx]
        print(f"{rank}. **{info['title']}** (NCT: {info['nct_id']})")
        print(f"   üìÑ {info['chunk'][:250].strip()}...")
        print()

        all_results.append({
            'query_num': query_idx,
            'query': query,
            'rank': rank,
            'nct_id': info['nct_id'],
            'title': info['title'],
            'field': info['field'],
            # Distance reported by the index search: smaller means closer.
            'relevance_score': float(distance),
            'evidence_snippet': info['chunk'][:500]
        })

    query_log.append({
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'query': query,
        # Number of hits actually returned, which can be below the requested k.
        'num_results': len(hit_nct_ids),
        'top_nct_id': hit_nct_ids[0] if hit_nct_ids else None,
        'avg_relevance_score': float(hit_distances.mean()) if len(hit_distances) else float('nan')
    })

    print("üí° SYNTHESIS:")
    print("-" * 80)
    top_trials = hit_nct_ids[:3]
    print(f"Retrieved {len(hit_nct_ids)} relevant trials. Top: {', '.join(top_trials)}")
    print()

# SECTION 7: Save Results
results_df = pd.DataFrame(all_results)
log_df = pd.DataFrame(query_log)
for frame, csv_path in (
    (results_df, '/content/drive/MyDrive/rag_multi_query_results.csv'),
    (log_df, '/content/drive/MyDrive/rag_query_log.csv'),
):
    frame.to_csv(csv_path, index=False)

# SECTION 8: Statistics
print("\n" + "="*80)
print("üìä SUMMARY STATISTICS")
print("="*80)
# relevance_score holds the distances returned by the index search.
score_column = results_df['relevance_score']
summary_lines = [
    f"Total queries tested: {len(queries)}",
    f"Total results retrieved: {len(all_results)}",
    f"Unique trials found: {results_df['nct_id'].nunique()}",
    f"Average relevance score: {score_column.mean():.4f}",
    f"Fields retrieved from: {results_df['field'].value_counts().to_dict()}",
]
print("\n".join(summary_lines))

print("\nüìà QUERY PERFORMANCE:")
print("-" * 80)
# log_df keeps the default 0-based index, so numbering the rows from 1 gives
# the same labels as index + 1.
for query_number, entry in enumerate(log_df.to_dict('records'), 1):
    print(f"Query {query_number}: {entry['query'][:50]}...")
    print(f"  Top Result: {entry['top_nct_id']}")
    print(f"  Avg Score: {entry['avg_relevance_score']:.4f}")
    print()

# SECTION 9: Visualizations
print("\n" + "="*80)
print("üìä GENERATING VISUALIZATIONS")
print("="*80)

# Two panels side by side, each drawn through its own Axes object.
fig, (ax_scores, ax_fields) = plt.subplots(1, 2, figsize=(12, 5))

# Chart 1: Relevance Score by Query
# One line per query, in order of first appearance; x is the 1-based rank.
for q_num, query_rows in results_df.groupby('query_num', sort=False):
    data = query_rows['relevance_score']
    ax_scores.plot(range(1, len(data)+1), data, marker='o', label=f'Query {q_num}')
ax_scores.set_xlabel('Rank')
ax_scores.set_ylabel('Relevance Score')
ax_scores.set_title('Retrieval Relevance by Query and Rank')
ax_scores.legend()
ax_scores.grid(alpha=0.3)

# Chart 2: Field Distribution
field_counts = results_df['field'].value_counts()
ax_fields.bar(field_counts.index, field_counts.values, color=['#1f77b4', '#ff7f0e'])
ax_fields.set_xlabel('Field')
ax_fields.set_ylabel('Count')
ax_fields.set_title('Retrieved Results by Field Type')

fig.tight_layout()
# Write the file before showing the figure.
fig.savefig('/content/drive/MyDrive/rag_performance_charts.png', dpi=300, bbox_inches='tight')
print("‚úÖ Charts saved to Drive!")
plt.show()

# SECTION 10: Summary Table
print("\nüìä PRESENTATION SUMMARY TABLE:")
print("="*80)
summary_table = log_df[['query', 'top_nct_id', 'avg_relevance_score']].copy()
summary_table.columns = ['Query', 'Top Result (NCT)', 'Avg Relevance']
summary_table['Avg Relevance'] = summary_table['Avg Relevance'].round(3)
print(summary_table.to_string(index=False))

# SECTION 11: Final Summary
# Dataset size, embedding dimension and query counts are read from the live
# objects so the report cannot drift from what was actually run.
print("\n" + "="*80)
print("üéâ RAG PIPELINE COMPLETE")
print("="*80)
print(f"""
FINAL SYSTEM SUMMARY:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
üìÅ Dataset: {len(df_test):,} diabetes clinical trials
üî™ Chunks: {len(chunks)} semantic segments
üî¢ Embedding: all-MiniLM-L6-v2 ({faiss_index.d}-dimensional)
üóÇÔ∏è Index: FAISS L2 similarity search
üîç Queries tested: {len(queries) + 1} (1 demo + {len(queries)} evaluation)
üìä Avg relevance: {results_df['relevance_score'].mean():.4f}
üéØ Unique trials: {results_df['nct_id'].nunique()}
üíæ Files saved: 4 (results, logs, charts, demo)

‚úÖ All components validated and working
‚úÖ Results saved to Google Drive
‚úÖ Visualizations generated
‚úÖ Ready for presentation and scaling
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
""")

print("üìÅ FILES CREATED:")
print("  1. rag_demo_results.csv - Single query demo results")
print("  2. rag_multi_query_results.csv - Multi-query detailed results")
print("  3. rag_query_log.csv - Query performance log")
print("  4. rag_performance_charts.png - Visualization charts")
print("="*80)


In [None]:
1

Next Step: Build the 4 Core Agents

Agent #1: SymptomParser

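Gemini API key: do not paste the key into this notebook. Load it at runtime from an environment variable or from Colab Secrets.
A key was previously written here in plain text; treat it as compromised and revoke it in Google AI Studio.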

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install requests pandas
# !pip install faiss-cpu sentence-transformers numpy pandas

# import requests
# import pandas as pd
# import json


In [None]:
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# import faiss
# import numpy as np

# # Load your saved diabetes data
# df = pd.read_csv("/content/drive/MyDrive/Sem 1/LLM/Project/data/clinical_trials_diabetes_full.csv")  # Replace with your file
# df_small = df.head(5000)  # 5K subset

# print(f"Loaded {len(df_small)} records")
# print(df_small.columns.tolist())


In [None]:
# Chunking
# NOTE: this cell rebinds `chunks`, `chunk_map` and `embeddings`. The entries
# stored here use the key 'chunk_text' and carry no 'title'/'chunk' keys, so
# the earlier single/multi-query cells cannot be re-run after this one without
# re-running their own chunking first.
chunk_size = 300
chunk_overlap = 50


def split_into_chunks(text, size, overlap, min_length=50):
    """Cut ``text`` into overlapping character windows.

    Args:
        text: String to split.
        size: Maximum number of characters per window.
        overlap: Number of characters shared by consecutive windows.
        min_length: Windows with this many characters or fewer are dropped.

    Returns:
        List of window strings, in order of appearance.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            ``[0, size)``; the window start would otherwise never advance.
    """
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError("size must be positive and overlap must satisfy 0 <= overlap < size")
    step = size - overlap
    windows = (text[start:start + size] for start in range(0, len(text), step))
    return [window for window in windows if len(window) > min_length]


chunks = []
chunk_map = []

for idx, row in df_test.iterrows():
    # Missing values are skipped so the literal text "nan" is never embedded.
    parts = [str(value) for value in (row['brief_title'], row['brief_summary']) if pd.notna(value)]
    text = ". ".join(parts)

    # Split into chunks
    for chunk in split_into_chunks(text, chunk_size, chunk_overlap):
        chunks.append(chunk)
        chunk_map.append({
            'nct_id': row['nct_id'],
            'chunk_text': chunk,
            'original_idx': idx
        })

print(f"Created {len(chunks)} chunks")

# Embed chunks
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embed_model.encode(chunks, show_progress_bar=True)

# Build FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# FAISS expects float32 input.
index.add(embeddings.astype('float32'))

print(f"FAISS index ready with {index.ntotal} vectors")


In [None]:
# Install latest version
!pip install -q google-generativeai

In [None]:
# Install and configure
!pip install -q google-generativeai

import google.generativeai as genai

# Configure with your API key
genai.configure(api_key="AIzaSyBzkX3f3eIrdyCxzBplY2SWFMpiHEAp_Fo")

# List ALL available models
print("Available models for generateContent:\n")
for model in genai.list_models():
    if 'generateContent' in model.supported_generation_methods:
        print(f"‚úÖ {model.name}")


In [None]:
import json
import os
import re
from datetime import datetime
from getpass import getpass

import google.generativeai as genai

# The API key must never be written into the notebook. It is read from the
# GEMINI_API_KEY environment variable; if that is unset the user is prompted
# once (input hidden) and the value is kept in the process environment for the
# rest of the session.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

class SymptomParser:
    """Turn a free-text symptom description into a structured dict.

    The text is sent to a Gemini model with a prompt asking for a JSON object
    holding ``symptoms``, ``duration``, ``severity`` and ``context``. When the
    call fails or the reply contains no usable JSON object, the whole input is
    returned as a single symptom so downstream agents still receive the
    expected keys.
    """

    def __init__(self, model_name='models/gemini-2.0-flash'):
        """Create the generative model used for parsing.

        Args:
            model_name: Model name passed to ``genai.GenerativeModel``.
        """
        self.model = genai.GenerativeModel(model_name)
        print("SymptomParser initialized")

    @staticmethod
    def _extract_json_object(raw_text):
        """Return the first JSON object (as a dict) embedded in ``raw_text``.

        Every ``{`` is tried as a possible start, so text or code fences
        around the object and arbitrarily nested objects are both handled.

        Raises:
            ValueError: If no JSON object can be decoded from the text.
        """
        decoder = json.JSONDecoder()
        start = raw_text.find('{')
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(raw_text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = raw_text.find('{', start + 1)
        raise ValueError("no JSON object found in model response")

    def parse(self, user_input):
        """Extract symptoms and related details from ``user_input``.

        Args:
            user_input: Free-text description written by the user.

        Returns:
            dict with ``symptoms`` (list of str), ``duration``, ``severity``
            and ``context`` (each a value from the model or None), plus
            ``_agent`` and ``_timestamp`` metadata.
        """
        prompt = f"""Extract medical information from this text and respond with ONLY a valid JSON object:

Input: "{user_input}"

Format: {{"symptoms": ["list"], "duration": "text or null", "severity": "text or null", "context": "text or null"}}"""

        try:
            response = self.model.generate_content(prompt)
            parsed = self._extract_json_object(response.text.strip())

        except Exception as e:
            # Best effort: any API or decoding failure falls back to treating
            # the raw input as one symptom.
            print(f"Error: {e}")
            parsed = {
                "symptoms": [user_input],
                "duration": None,
                "severity": None,
                "context": None
            }

        # The model's reply is not guaranteed to follow the requested schema;
        # normalise it so callers can always join the symptoms as strings.
        symptoms = parsed.get("symptoms")
        if isinstance(symptoms, str):
            symptoms = [symptoms]
        elif not isinstance(symptoms, list):
            symptoms = [user_input]
        parsed["symptoms"] = [str(item) for item in symptoms if item is not None]
        for key in ("duration", "severity", "context"):
            parsed.setdefault(key, None)

        parsed["_agent"] = "SymptomParser"
        parsed["_timestamp"] = datetime.now().isoformat()

        return parsed

# Test
# Smoke test: build one parser and run two sample inputs through it, printing
# each structured result as indented JSON right after it is produced.
# `parser` is reused by later cells.
parser = SymptomParser()

test1 = parser.parse("I have diabetes and high blood sugar for 2 weeks")
print(json.dumps(test1, indent=2))

test2 = parser.parse("severe headache, fever for 3 days")
print(json.dumps(test2, indent=2))


In [None]:
1

Agent #2: RetrievalAgent - Connect to Your 5K Dataset

In [None]:
import numpy as np

class RetrievalAgent:
    """Retrieve relevant clinical trials based on symptoms"""

    def __init__(self, embed_model, faiss_index, chunk_map):
        """Store the components used for retrieval.

        Args:
            embed_model: Object whose ``encode(list_of_str)`` returns an array
                of query vectors.
            faiss_index: Object whose ``search(vectors, k)`` returns
                ``(distances, indices)``.
            chunk_map: Sequence where entry ``i`` describes the vector at
                index position ``i``; each entry needs ``'nct_id'`` and
                ``'chunk_text'``.
        """
        self.embed_model = embed_model
        self.index = faiss_index
        self.chunk_map = chunk_map
        print("RetrievalAgent initialized")

    def retrieve(self, parsed_symptoms, top_k=5, fetch_k=None):
        """
        Retrieve clinical trials relevant to symptoms

        Args:
            parsed_symptoms: Output from SymptomParser
            top_k: Maximum number of distinct trials to return
            fetch_k: Number of chunks requested from the index before
                de-duplicating by trial. Defaults to ``top_k``; pass a larger
                value to still reach ``top_k`` trials when several of the
                closest chunks belong to the same trial.

        Returns:
            dict with retrieved trials and metadata
        """

        # Build query from symptoms. The parser can store None for either
        # field, so both are normalised before joining.
        symptoms = parsed_symptoms.get("symptoms") or []
        if isinstance(symptoms, str):
            symptoms = [symptoms]
        context = parsed_symptoms.get("context") or ""

        query_text = " ".join(str(symptom) for symptom in symptoms)
        if context:
            query_text += f" {context}"

        print(f"Query: {query_text}")

        # Embed the query
        query_embedding = self.embed_model.encode([query_text])

        # Search FAISS index
        num_candidates = max(top_k, fetch_k or top_k)
        distances, indices = self.index.search(
            query_embedding.astype('float32'),
            num_candidates
        )

        # Gather retrieved trials
        retrieved = []
        seen_nct_ids = set()

        # Distances and indices are walked together so each trial keeps the
        # distance of its own chunk even after duplicates are skipped.
        for distance, idx in zip(distances[0], indices[0]):
            # FAISS marks unused result slots with -1.
            if idx < 0:
                continue

            chunk_info = self.chunk_map[idx]
            nct_id = chunk_info['nct_id']

            # Avoid duplicate trials
            if nct_id in seen_nct_ids:
                continue
            seen_nct_ids.add(nct_id)

            retrieved.append({
                'nct_id': nct_id,
                'text': chunk_info['chunk_text'],
                # Distance returned by the index for this chunk.
                'relevance_score': float(distance)
            })

            if len(retrieved) >= top_k:
                break

        return {
            '_agent': 'RetrievalAgent',
            '_timestamp': datetime.now().isoformat(),
            'query': query_text,
            'num_retrieved': len(retrieved),
            'trials': retrieved
        }


# Wire the agent to the embedding model, index and chunk metadata created in
# the chunking cell (embed_model, index, chunk_map must already exist).
retrieval_agent = RetrievalAgent(embed_model, index, chunk_map)

# Run one sample sentence through the parser, then retrieve with its output.
parsed = parser.parse("I have type 2 diabetes and high blood sugar")
retrieved = retrieval_agent.retrieve(parsed, top_k=3)

banner = "="*60
print("\n" + banner)
print("RETRIEVAL RESULTS")
print(banner)
print(f"Query: {retrieved['query']}")
print(f"Retrieved {retrieved['num_retrieved']} trials:\n")

# One three-line entry per trial, followed by a blank line.
for position, trial in enumerate(retrieved['trials'], start=1):
    entry_lines = [
        f"{position}. NCT ID: {trial['nct_id']}",
        f"   Text: {trial['text'][:150]}...",
        f"   Score: {trial['relevance_score']:.4f}\n",
    ]
    print("\n".join(entry_lines))


In [None]:
1

Agent #3: DiagnosisAdvisor

In [None]:
class DiagnosisAdvisor:
    """Generate health recommendations based on retrieved clinical evidence"""

    def __init__(self, gemini_model):
        """Store the model used to write recommendations.

        Args:
            gemini_model: Object exposing ``generate_content(prompt)`` whose
                result has a ``text`` attribute.
        """
        self.model = gemini_model
        print("DiagnosisAdvisor initialized")

    def advise(self, parsed_symptoms, retrieved_trials):
        """
        Generate evidence-based recommendations

        Args:
            parsed_symptoms: Output from SymptomParser
            retrieved_trials: Output from RetrievalAgent

        Returns:
            dict with recommendations and evidence
        """

        # Build context from retrieved trials
        context = ""
        for i, trial in enumerate(retrieved_trials['trials'], 1):
            context += f"\nTrial {i} (NCT ID: {trial['nct_id']}):\n{trial['text']}\n"
        if not context:
            # Say so explicitly rather than sending an empty evidence section.
            context = "\n(no clinical trial evidence was retrieved)\n"

        # Build prompt. The parser stores None for fields it could not fill,
        # and .get(key, default) does not apply the default in that case, so
        # `or` is used to cover both a missing key and a None value.
        symptoms = parsed_symptoms.get("symptoms") or []
        if isinstance(symptoms, str):
            symptoms = [symptoms]
        symptoms = [str(symptom) for symptom in symptoms]
        symptoms_str = ", ".join(symptoms)
        duration = parsed_symptoms.get("duration") or "unknown duration"

        prompt = f"""You are a medical advisor AI. Based on clinical trial evidence, provide recommendations.

Patient symptoms: {symptoms_str}
Duration: {duration}

Clinical trial evidence:
{context}

Provide a response with:
1. Summary of relevant findings from the trials
2. Recommended actions (consult doctor, lifestyle changes, etc.)
3. Important considerations

Keep response professional, evidence-based, and helpful. Do NOT diagnose."""

        try:
            response = self.model.generate_content(prompt)
            advice_text = response.text.strip()

        except Exception as e:
            # Best effort: a failed model call yields a safe generic message.
            print(f"Error: {e}")
            advice_text = "Unable to generate recommendations. Please consult a healthcare provider."

        return {
            '_agent': 'DiagnosisAdvisor',
            '_timestamp': datetime.now().isoformat(),
            # Always a list of strings so callers can join it directly.
            'symptoms': symptoms,
            'num_trials_cited': len(retrieved_trials['trials']),
            'recommendation': advice_text,
            'cited_trials': [t['nct_id'] for t in retrieved_trials['trials']]
        }


# The advisor shares the generative model already held by the parser.
diagnosis_agent = DiagnosisAdvisor(parser.model)

# Combine the parsed symptoms with the retrieved trials into advice.
advice = diagnosis_agent.advise(parsed, retrieved)

banner = "="*60
symptom_list = ', '.join(advice['symptoms'])
citation_list = ', '.join(advice['cited_trials'])

print("\n" + banner)
print("DIAGNOSIS & RECOMMENDATIONS")
print(banner)
print(f"Symptoms: {symptom_list}")
print(f"Evidence from {advice['num_trials_cited']} clinical trials")
print(f"Cited: {citation_list}\n")
print("Recommendation:")
print(advice['recommendation'])


Agent #4: SafetyFilter

In [None]:
class SafetyFilter:
    """Add medical disclaimers and safety checks"""

    # Text appended to every recommendation.
    DISCLAIMER = """

‚ö†Ô∏è IMPORTANT MEDICAL DISCLAIMER:
This information is for educational purposes only and is based on clinical trial data.
It is NOT a substitute for professional medical advice, diagnosis, or treatment.
Always consult a qualified healthcare provider for medical concerns.
"""

    def __init__(self):
        print("SafetyFilter initialized")

    def filter(self, advice):
        """Add safety disclaimer to recommendations

        Args:
            advice: dict produced by DiagnosisAdvisor. It is updated in place.

        Returns:
            The same dict, with the disclaimer appended to ``recommendation``
            and ``_safety_filtered`` / ``_filter_timestamp`` set. A dict that
            is already marked as filtered is returned unchanged, so running
            the filter again does not stack disclaimers.
        """

        # Re-running the cell passes the already-filtered dict back in.
        if advice.get('_safety_filtered'):
            return advice

        # A missing or None recommendation still receives the disclaimer.
        recommendation = advice.get('recommendation') or ""

        # Add disclaimer to recommendation
        advice['recommendation'] = recommendation + self.DISCLAIMER
        advice['_safety_filtered'] = True
        advice['_filter_timestamp'] = datetime.now().isoformat()

        return advice


# Last pipeline stage: create the filter and pass the advice through it.
safety_filter = SafetyFilter()
final_output = safety_filter.filter(advice)

banner = "="*60
print("\n" + banner)
print("FINAL OUTPUT (WITH SAFETY FILTER)")
print(banner)
print(final_output['recommendation'])
