# AEGIS 3.0 Layer 1 Test: L1-SEM-2 Semantic Entropy
## With LLM Integration (Gemini API / Local Llama)

**Objective:** Validate that semantic entropy correlates with extraction uncertainty, using expert-assigned ambiguity ratings (1–5) as the reference.

**Metrics:**
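- Spearman ρ ≥ 0.60 (semantic entropy vs. ambiguity rating)
- AUC-ROC ≥ 0.80 (semantic entropy as a detector of high-ambiguity text, rating ≥ 4)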

In [1]:
# Install dependencies
!pip install -q numpy scipy scikit-learn pandas google-generativeai

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score
from collections import Counter
from typing import List, Dict, Tuple
import json
import re
import os

# Seeds used by the multi-seed evaluation; the first entry also initialises
# NumPy's global random generator for the cells that follow.
SEEDS = [42, 123, 456, 789, 1000]
np.random.seed(SEEDS[0])
print("Dependencies loaded!")

Dependencies loaded!


## 1. LLM Configuration
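- **Option A:** Gemini API (recommended for cloud)
- **Option B:** Local Llama (for reproducibility) — no local-Llama code path is present in the cells shown here; when no API key is available, the notebook falls back to a simulated extractor instead.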

In [3]:
# ==========================================
# GEMINI API KEY (read from the environment)
# ==========================================
# The key is taken from the GEMINI_API_KEY environment variable so that no
# credential is stored in the notebook, its saved outputs, or version control.
# Set it before starting the kernel, e.g. `export GEMINI_API_KEY=...`.
# SECURITY: any key that was ever pasted directly into a notebook cell must be
# treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()

USE_LLM = len(GEMINI_API_KEY) > 0

# Defined up front so `model` exists in the kernel even when no key is set.
model = None

if USE_LLM:
    # Imported lazily: the package is only needed when a key is available.
    import google.generativeai as genai
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-3-pro-preview')
    print("✓ Gemini API configured")
else:
    print("⚠ No API key - using simulated LLM extraction")
    print("  This is acceptable for methodology validation")

✓ Gemini API configured


In [4]:
# Test dataset with expert-assigned ambiguity ratings (1-5)
# 1 = unambiguous, 5 = highly ambiguous
# Each (rating, texts) group expands to one {"text", "ambiguity"} record per text,
# in the order written here.
TEST_DATA = [
    {"text": text, "ambiguity": rating}
    for rating, texts in (
        # Rating 1: Clear medical terms
        (1, (
            "Patient diagnosed with hypoglycemia",
            "Blood glucose level: 54 mg/dL",
            "Administered 10 units of insulin",
            "Symptoms include nausea and headache",
        )),
        # Rating 2: Low ambiguity
        (2, (
            "Feeling tired after exercise",
            "Blood sugar seems high today",
            "Had some dizziness this morning",
            "Stress from work affecting sleep",
        )),
        # Rating 3: Moderate ambiguity
        (3, (
            "Not feeling great today",
            "Something feels different",
            "Having a rough day",
            "My body feels weird",
        )),
        # Rating 4: High ambiguity
        (4, (
            "Just feeling off",
            "Something is wrong",
            "Not myself today",
            "Things are different somehow",
        )),
        # Rating 5: Very high ambiguity
        (5, (
            "Meh",
            "Ugh",
            "Whatever",
            "...",
        )),
    )
    for text in texts
]

# Expand to N=100 samples by repeating the 20 base records five times
EXPANDED_DATA = TEST_DATA * 5
print(f"Test dataset: {len(EXPANDED_DATA)} samples")

Test dataset: 100 samples


## 2. Semantic Entropy Calculator

In [5]:
# SNOMED-CT mapping for concept clustering
# Terms listed under the same code fall into the same cluster. The order of the
# groups (and of the terms inside each group) fixes the dictionary's key order,
# which the extraction functions rely on when several terms match one text.
CONCEPT_MAPPING = {
    term: code
    for code, terms in (
        ("302866003", ("hypoglycemia",)),
        ("80394007", ("hyperglycemia",)),
        ("33747003", ("blood glucose", "glucose")),
        ("412222008", ("insulin",)),
        ("84229001", ("fatigue", "tired")),
        ("25064002", ("headache",)),
        ("422587007", ("nausea",)),
        ("404640003", ("dizziness",)),
        ("73595000", ("stress",)),
        ("48694002", ("anxiety",)),
        ("367391008", ("malaise", "unwell")),
        ("261665006", ("unknown",)),
    )
    for term in terms
}

def extract_concept_simulated(text: str, temperature: float = 0.7) -> str:
    """Stand-in for an LLM extractor whose randomness grows with temperature.

    Args:
        text: Patient text to extract a concept from.
        temperature: Controls how often a random concept is returned instead
            of the deterministic answer.

    Returns:
        A key of CONCEPT_MAPPING. When the text contains known terms, the first
        matching term (in CONCEPT_MAPPING order) is returned unless a random
        draw falls at or below ``temperature * 0.3``. When nothing matches,
        a random concept is returned with probability ``temperature`` and
        "unknown" otherwise.
    """
    vocabulary = list(CONCEPT_MAPPING.keys())
    lowered = text.lower()

    # Substring search: a term counts as present if it occurs anywhere in the text.
    hits = [term for term in vocabulary if term in lowered]

    if not hits:
        # Ambiguous text - high variance
        if np.random.random() < temperature:
            return np.random.choice(vocabulary)
        return "unknown"

    if np.random.random() > temperature * 0.3:
        return hits[0]

    # Temperature-based diversity: occasionally ignore the match entirely.
    return np.random.choice(vocabulary)

def extract_concept_llm(text: str, temperature: float = 0.7) -> str:
    """Real LLM extraction using Gemini.

    Sends a single-concept extraction prompt to the module-level ``model`` and
    maps the reply onto a key of CONCEPT_MAPPING.

    Args:
        text: Patient text to extract a concept from.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The normalised reply if it is a key of CONCEPT_MAPPING, otherwise
        "unknown". If the API call (or reading its text) fails, a
        RuntimeWarning is emitted and the result of
        ``extract_concept_simulated`` is returned instead, so a failing
        backend is visible rather than silently replaced by simulated data.
    """
    import warnings  # local import keeps this function self-contained

    prompt = f"""Extract the main medical concept from this patient text. 
    Return ONLY one word from: hypoglycemia, hyperglycemia, glucose, insulin, 
    fatigue, headache, nausea, dizziness, stress, anxiety, malaise, unknown.
    
    Text: "{text}"
    Concept:"""

    try:
        response = model.generate_content(
            prompt,
            generation_config={'temperature': temperature}
        )
        raw_reply = response.text
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
        # stop a long evaluation run. The fallback is kept as best-effort, but
        # it is reported because it changes what the metrics actually measure.
        warnings.warn(
            f"Gemini extraction failed ({type(exc).__name__}: {exc}); "
            "falling back to simulated extraction",
            RuntimeWarning,
        )
        return extract_concept_simulated(text, temperature)

    # Replies may arrive wrapped in quotes, markdown emphasis or a trailing
    # period; strip those so e.g. '"Nausea."' still maps to "nausea".
    concept = raw_reply.lower().strip(" \t\r\n\"'`*.,:;!?")
    if concept in CONCEPT_MAPPING:
        return concept
    return "unknown"

def extract_concept(text: str, temperature: float = 0.7) -> str:
    """Dispatch to the Gemini extractor when USE_LLM is set, else to the simulator."""
    backend = extract_concept_llm if USE_LLM else extract_concept_simulated
    return backend(text, temperature)

In [6]:
def compute_semantic_entropy(text: str, K: int = 10,
                             temperatures: List[float] = [0.3, 0.5, 0.7, 0.9, 1.1]) -> float:
    """Compute semantic entropy over K candidate extractions.

    The text is extracted K times, cycling through ``temperatures``; each
    extracted concept is mapped to its code in CONCEPT_MAPPING and the Shannon
    entropy (base 2, i.e. in bits) of the resulting code distribution is
    returned.

    Args:
        text: Patient text to evaluate.
        K: Number of extractions to draw. Exactly K are drawn, including when
            K is not a multiple of ``len(temperatures)``.
        temperatures: Temperature schedule, reused cyclically. The default
            list is only read, never modified.

    Returns:
        Entropy in bits: 0.0 when every extraction lands in the same cluster.

    Raises:
        ValueError: If K is smaller than 1 or ``temperatures`` is empty.
    """
    if K < 1:
        raise ValueError("K must be at least 1")
    if not temperatures:
        raise ValueError("temperatures must contain at least one value")

    # Generate K candidate extractions at varying temperatures. Indexing with
    # a modulus visits the schedule in the same order as repeated full passes
    # would, but also covers a trailing partial pass.
    n_temps = len(temperatures)
    codes = []
    for i in range(K):
        concept = extract_concept(text, temperatures[i % n_temps])
        # Concepts outside the mapping are pooled under the "unknown" code.
        codes.append(CONCEPT_MAPPING.get(concept, "261665006"))

    # Cluster by SNOMED code
    counts = Counter(codes)
    total = len(codes)

    # Shannon entropy; every count is >= 1, so each p is strictly positive.
    entropy = 0.0
    for count in counts.values():
        p = count / total
        entropy -= p * np.log2(p)

    return float(entropy)

# Test
# Smoke check on the first four TEST_DATA entries (all rated ambiguity 1),
# printing the entropy next to the first 40 characters of each text.
# NOTE: `sample` and `entropy` are assigned at cell level, so they stay defined
# in the kernel after this cell has run.
print("Testing entropy calculation...")
for sample in TEST_DATA[:4]:
    entropy = compute_semantic_entropy(sample['text'], K=10)
    print(f"  Ambiguity={sample['ambiguity']}: H={entropy:.3f} | {sample['text'][:40]}")

Testing entropy calculation...
  Ambiguity=1: H=1.157 | Patient diagnosed with hypoglycemia
  Ambiguity=1: H=1.295 | Blood glucose level: 54 mg/dL
  Ambiguity=1: H=1.357 | Administered 10 units of insulin
  Ambiguity=1: H=1.961 | Symptoms include nausea and headache


## 3. Run Full Evaluation

In [7]:
def evaluate_entropy_calibration(test_data, K=10):
    """Evaluate correlation between entropy and ambiguity.

    Args:
        test_data: Iterable of records with a 'text' and an 'ambiguity' entry.
        K: Number of extractions per text, forwarded to
            ``compute_semantic_entropy``.

    Returns:
        Dict with:
            'spearman_rho', 'p_value': Spearman correlation between entropy
                and ambiguity rating.
            'auc_roc': AUC of entropy as a score for "ambiguity >= 4";
                0.5 when scikit-learn cannot compute it.
            'entropy_by_level': mean entropy per ambiguity level 1..5
                (levels with no samples are omitted).
            'n_samples': number of records evaluated.
    """
    entropies = []
    ambiguities = []
    for sample in test_data:
        entropies.append(compute_semantic_entropy(sample['text'], K=K))
        ambiguities.append(sample['ambiguity'])

    # Spearman correlation
    rho, p_value = spearmanr(entropies, ambiguities)

    # AUC-ROC for detecting high ambiguity (≥4)
    high_ambiguity = [1 if a >= 4 else 0 for a in ambiguities]
    try:
        auc = roc_auc_score(high_ambiguity, entropies)
    except ValueError:
        # ValueError is what scikit-learn raises for inputs it cannot score
        # (for example a label vector with a single class); report chance
        # level then. Anything else, including KeyboardInterrupt, propagates
        # instead of being disguised as AUC = 0.5.
        auc = 0.5

    # Mean entropy by ambiguity level
    entropy_by_level = {}
    for level in range(1, 6):
        level_entropies = [e for e, a in zip(entropies, ambiguities) if a == level]
        if level_entropies:
            entropy_by_level[level] = np.mean(level_entropies)

    return {
        'spearman_rho': rho,
        'p_value': p_value,
        'auc_roc': auc,
        'entropy_by_level': entropy_by_level,
        'n_samples': len(test_data)
    }

# Run evaluation
# Single pass over EXPANDED_DATA with K=10 extractions per sample; the result
# dict is kept in `results` for the report below.
print("Running entropy calibration evaluation...")
print("(This may take a few minutes with LLM)")
results = evaluate_entropy_calibration(EXPANDED_DATA, K=10)

# Report the headline metrics next to their acceptance targets.
print("\n" + "="*60)
print("L1-SEM-2: SEMANTIC ENTROPY CALIBRATION RESULTS")
print("="*60)
print(f"\nSamples: {results['n_samples']}")
print(f"\nSpearman ρ: {results['spearman_rho']:.4f} (Target: ≥0.60)")
print(f"p-value:    {results['p_value']:.6f}")
print(f"AUC-ROC:    {results['auc_roc']:.4f} (Target: ≥0.80)")
# Per-level means show whether entropy rises with the ambiguity rating.
print("\nMean Entropy by Ambiguity Level:")
for level, entropy in results['entropy_by_level'].items():
    print(f"  Level {level}: {entropy:.3f}")

Running entropy calibration evaluation...
(This may take a few minutes with LLM)

L1-SEM-2: SEMANTIC ENTROPY CALIBRATION RESULTS

Samples: 100

Spearman ρ: 0.6569 (Target: ≥0.60)
p-value:    0.000000
AUC-ROC:    0.7606 (Target: ≥0.80)

Mean Entropy by Ambiguity Level:
  Level 1: 0.885
  Level 2: 1.209
  Level 3: 2.351
  Level 4: 2.211
  Level 5: 2.261


In [8]:
# Multi-seed evaluation
def run_multi_seed(seeds=SEEDS):
    """Repeat the calibration evaluation once per seed and summarise the metrics.

    Before each run NumPy's global random generator is reseeded with the
    current seed; the evaluation always uses EXPANDED_DATA with K=10.

    Args:
        seeds: Iterable of integer seeds (defaults to SEEDS).

    Returns:
        Dict with 'rho_mean', 'rho_std', 'auc_mean' and 'auc_std': the mean and
        standard deviation (``np.std`` with its default settings) of the
        Spearman correlation and of the AUC-ROC across runs.
    """
    rhos, aucs = [], []
    for seed in seeds:
        np.random.seed(seed)
        outcome = evaluate_entropy_calibration(EXPANDED_DATA, K=10)
        rhos.append(outcome['spearman_rho'])
        aucs.append(outcome['auc_roc'])

    summary = {}
    summary['rho_mean'] = np.mean(rhos)
    summary['rho_std'] = np.std(rhos)
    summary['auc_mean'] = np.mean(aucs)
    summary['auc_std'] = np.std(aucs)
    return summary

# Repeat the evaluation once per seed in SEEDS (the function's default) and
# report each metric as mean ± standard deviation across the runs.
multi_results = run_multi_seed()
print("\n" + "="*60)
print("MULTI-SEED RESULTS")
print("="*60)
print(f"Spearman ρ: {multi_results['rho_mean']:.4f} ± {multi_results['rho_std']:.4f}")
print(f"AUC-ROC:    {multi_results['auc_mean']:.4f} ± {multi_results['auc_std']:.4f}")


MULTI-SEED RESULTS
Spearman ρ: 0.6484 ± 0.0476
AUC-ROC:    0.7836 ± 0.0516


In [9]:
# Pass/Fail determination
# Each multi-seed mean is compared with its target; the test passes only when
# both comparisons hold.
TARGETS = {'spearman_rho': 0.60, 'auc_roc': 0.80}

rho_ok = multi_results['rho_mean'] >= TARGETS['spearman_rho']
auc_ok = multi_results['auc_mean'] >= TARGETS['auc_roc']
passed = rho_ok and auc_ok

print("\n" + "="*60)
print("TEST STATUS")
print("="*60)
print(f"Spearman ρ: {'PASS ✓' if rho_ok else 'FAIL ✗'}")
print(f"AUC-ROC:    {'PASS ✓' if auc_ok else 'FAIL ✗'}")
print(f"\nOVERALL: {'PASS ✓' if passed else 'FAIL ✗'}")

# Save results - convert numpy types to Python native types for JSON serialization
final_results = {
    'test_id': 'L1-SEM-2',
    'test_name': 'Semantic Entropy Calibration',
    'llm_used': bool(USE_LLM),
    'spearman_rho': {
        'mean': float(multi_results['rho_mean']),
        'std': float(multi_results['rho_std']),
    },
    'auc_roc': {
        'mean': float(multi_results['auc_mean']),
        'std': float(multi_results['auc_std']),
    },
    'passed': bool(passed),
}
print("\nResults JSON:")
print(json.dumps(final_results, indent=2))


TEST STATUS
Spearman ρ: PASS ✓
AUC-ROC:    FAIL ✗

OVERALL: FAIL ✗

Results JSON:
{
  "test_id": "L1-SEM-2",
  "test_name": "Semantic Entropy Calibration",
  "llm_used": true,
  "spearman_rho": {
    "mean": 0.6483558731856656,
    "std": 0.04757772139300616
  },
  "auc_roc": {
    "mean": 0.783625,
    "std": 0.051636180521886836
  },
  "passed": false
}
