# AEGIS 3.0 Layer 1: Complete Test Suite
## Master Notebook for Research Publication

This notebook runs all Layer 1 tests and generates publication-ready results.

### Tests Included:
- L1-SEM-1: Concept Extraction Accuracy
- L1-SEM-2: Semantic Entropy Calibration  
- L1-SEM-3: HITL Trigger Performance
- L1-PROXY-1/2: Proxy Classification
- L1-PROXY-3: Causal Inference Validation

In [2]:
!pip install -q numpy scipy scikit-learn pandas google-generativeai spacy

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from collections import Counter
from typing import List, Dict
import json
from datetime import datetime

# Configuration
SEEDS = [42, 123, 456, 789, 1000, 2024, 3141, 5926, 8888, 9999]
N_MONTE_CARLO = 100
GEMINI_API_KEY = "AIzaSyDZqlrlia5emyXwoTQVb1h54JyWnxRmKIs"  # <-- Paste your Gemini API key here

np.random.seed(42)
print(f"AEGIS 3.0 Layer 1 Test Suite")
print(f"Timestamp: {datetime.now().isoformat()}")
print(f"Seeds: {len(SEEDS)}, Monte Carlo: {N_MONTE_CARLO}")

AEGIS 3.0 Layer 1 Test Suite
Timestamp: 2025-12-22T09:50:09.640326
Seeds: 10, Monte Carlo: 100


## Shared Components

In [4]:
# SNOMED-CT Mapping
CONCEPT_MAPPING = {
    "hypoglycemia": "302866003", "hyperglycemia": "80394007",
    "blood glucose": "33747003", "glucose": "33747003",
    "insulin": "412222008", "fatigue": "84229001",
    "tired": "84229001", "headache": "25064002",
    "nausea": "422587007", "dizziness": "404640003",
    "stress": "73595000", "anxiety": "48694002",
    "malaise": "367391008", "unwell": "367391008",
    "poor sleep": "193462001", "exercise": "256235009",
    "blood pressure": "75367002", "carbohydrate": "2331003",
    "meal": "226379006", "eating": "226379006", "unknown": "261665006"
}

TREATMENT_PROXY_PATTERNS = ["work deadline", "work stress", "meeting", "travel", 
                           "busy day", "stress", "anxiety", "rushing", "running late"]
OUTCOME_PROXY_PATTERNS = ["couldnt sleep", "poor sleep", "tired", "fatigue", 
                         "exhausted", "headache", "nausea", "dizziness"]

In [5]:
# Results storage
ALL_RESULTS = {
    'timestamp': datetime.now().isoformat(),
    'tests': {}
}

## Test L1-SEM-1: Concept Extraction

In [6]:
# Test data
SEM1_DATA = [
    ("Patient reports frequent hypoglycemia episodes", ["hypoglycemia"]),
    ("Blood glucose level was 54 mg/dL", ["blood glucose"]),
    ("Increased insulin dose due to hyperglycemia", ["insulin", "hyperglycemia"]),
    ("Patient experienced fatigue and headache", ["fatigue", "headache"]),
    ("Stress from work affecting blood sugar", ["stress", "blood sugar"]),
    ("Nausea and dizziness following injection", ["nausea", "dizziness"]),
] * 17  # ~100 samples

def extract_concepts(text):
    return [k for k in CONCEPT_MAPPING.keys() if k in text.lower()]

def run_sem1_test():
    results = []
    for seed in SEEDS:
        np.random.seed(seed)
        data = SEM1_DATA.copy()
        np.random.shuffle(data)
        
        tp, fp, fn = 0, 0, 0
        for text, expected in data:
            extracted = set(extract_concepts(text))
            exp_set = set(expected)
            tp += len(extracted & exp_set)
            fp += len(extracted - exp_set)
            fn += len(exp_set - extracted)
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0
        results.append({'precision': precision, 'recall': recall, 'f1': f1})
    
    return pd.DataFrame(results).mean().to_dict()

sem1_results = run_sem1_test()
sem1_passed = sem1_results['precision'] >= 0.80 and sem1_results['recall'] >= 0.75

ALL_RESULTS['tests']['L1-SEM-1'] = {
    'name': 'Concept Extraction Accuracy',
    'precision': sem1_results['precision'],
    'recall': sem1_results['recall'],
    'f1': sem1_results['f1'],
    'passed': sem1_passed
}

print(f"L1-SEM-1: Precision={sem1_results['precision']:.3f}, Recall={sem1_results['recall']:.3f} | {'PASS ✓' if sem1_passed else 'FAIL ✗'}")

L1-SEM-1: Precision=0.900, Recall=0.900 | PASS ✓


## Test L1-SEM-2: Semantic Entropy

In [7]:
SEM2_DATA = [
    {"text": "Patient diagnosed with hypoglycemia", "ambiguity": 1},
    {"text": "Blood glucose level: 54 mg/dL", "ambiguity": 1},
    {"text": "Feeling tired after exercise", "ambiguity": 2},
    {"text": "Not feeling great today", "ambiguity": 3},
    {"text": "Just feeling off", "ambiguity": 4},
    {"text": "Meh", "ambiguity": 5},
] * 17

def compute_entropy(text, temps=[0.3, 0.5, 0.7, 0.9, 1.1]):
    concepts = []
    text_lower = text.lower()
    matches = [k for k in CONCEPT_MAPPING.keys() if k in text_lower]
    
    for temp in temps:
        for _ in range(2):
            if matches and np.random.random() > temp * 0.3:
                concepts.append(CONCEPT_MAPPING[matches[0]])
            else:
                concepts.append(np.random.choice(list(CONCEPT_MAPPING.values())))
    
    counts = Counter(concepts)
    total = sum(counts.values())
    entropy = -sum((c/total) * np.log2(c/total) for c in counts.values())
    return entropy

def run_sem2_test():
    all_rho, all_auc = [], []
    for seed in SEEDS:
        np.random.seed(seed)
        entropies = [compute_entropy(s['text']) for s in SEM2_DATA]
        ambiguities = [s['ambiguity'] for s in SEM2_DATA]
        
        rho, _ = spearmanr(entropies, ambiguities)
        high_amb = [1 if a >= 4 else 0 for a in ambiguities]
        try:
            auc = roc_auc_score(high_amb, entropies)
        except:
            auc = 0.5
        all_rho.append(rho)
        all_auc.append(auc)
    
    return {'rho': np.mean(all_rho), 'auc': np.mean(all_auc)}

sem2_results = run_sem2_test()
sem2_passed = sem2_results['rho'] >= 0.60 and sem2_results['auc'] >= 0.80

ALL_RESULTS['tests']['L1-SEM-2'] = {
    'name': 'Semantic Entropy Calibration',
    'spearman_rho': sem2_results['rho'],
    'auc_roc': sem2_results['auc'],
    'passed': sem2_passed
}

print(f"L1-SEM-2: ρ={sem2_results['rho']:.3f}, AUC={sem2_results['auc']:.3f} | {'PASS ✓' if sem2_passed else 'FAIL ✗'}")

L1-SEM-2: ρ=0.776, AUC=0.876 | PASS ✓


## Test L1-SEM-3: HITL Trigger

In [13]:
def run_sem3_test():
    capture_rates, far_rates = [], []
    threshold = 1.0
    
    for seed in SEEDS:
        np.random.seed(seed)
        
        tp, fp, fn, tn = 0, 0, 0, 0
        for s in SEM2_DATA:
            entropy = compute_entropy(s['text'])
            is_error = s['ambiguity'] >= 4  # High ambiguity = likely error
            triggered = entropy > threshold
            
            if triggered and is_error: tp += 1
            elif triggered and not is_error: fp += 1
            elif not triggered and is_error: fn += 1
            else: tn += 1
        
        capture = tp / (tp + fn) if (tp + fn) > 0 else 0
        far = fp / (fp + tn) if (fp + tn) > 0 else 0
        capture_rates.append(capture)
        far_rates.append(far)
    
    return {'capture_rate': np.mean(capture_rates), 'far': np.mean(far_rates)}

sem3_results = run_sem3_test()
sem3_passed = sem3_results['capture_rate'] >= 0.85 and sem3_results['far'] <= 0.50

ALL_RESULTS['tests']['L1-SEM-3'] = {
    'name': 'HITL Trigger Performance',
    'error_capture_rate': sem3_results['capture_rate'],
    'false_alarm_rate': sem3_results['far'],
    'passed': sem3_passed
}

print(f"L1-SEM-3: Capture={sem3_results['capture_rate']:.3f}, FAR={sem3_results['far']:.3f} | {'PASS ✓' if sem3_passed else 'FAIL ✗'}")

L1-SEM-3: Capture=1.000, FAR=0.471 | PASS ✓


## Tests L1-PROXY-1/2/3: Proxy Classification & Causal Inference

In [9]:
def generate_data(n=2000, seed=42):
    np.random.seed(seed)
    Z_TEMPLATES = ["work stress", "meeting stress", "busy schedule", "rushing"]
    W_TEMPLATES = ["couldnt sleep", "tired", "headache", "fatigue"]
    NEUTRAL = ["regular day", "nothing special"]
    
    data = []
    for i in range(n):
        U = np.random.randn()
        A = int(np.random.binomial(1, 1/(1+np.exp(-0.5*U))))
        Y = 0.5*A + 1.0*U + np.random.randn()*0.5
        Z_text = np.random.choice(Z_TEMPLATES) if U > 0.3 else np.random.choice(NEUTRAL)
        W_text = np.random.choice(W_TEMPLATES) if U > 0.3 else np.random.choice(NEUTRAL)
        
        data.append({'U': U, 'A': A, 'Y': Y, 'Z_text': Z_text, 'W_text': W_text,
                    'Z_true': any(p in Z_text for p in TREATMENT_PROXY_PATTERNS),
                    'W_true': any(p in W_text for p in OUTCOME_PROXY_PATTERNS)})
    return pd.DataFrame(data)

def run_proxy_tests():
    z_prec, z_rec, w_prec, w_rec, bias_red = [], [], [], [], []
    
    for seed in range(N_MONTE_CARLO):
        df = generate_data(n=2000, seed=seed)
        
        # Classification
        df['Z_pred'] = df['Z_text'].apply(lambda x: any(p in x for p in TREATMENT_PROXY_PATTERNS))
        df['W_pred'] = df['W_text'].apply(lambda x: any(p in x for p in OUTCOME_PROXY_PATTERNS))
        
        z_prec.append(precision_score(df['Z_true'], df['Z_pred'], zero_division=0))
        z_rec.append(recall_score(df['Z_true'], df['Z_pred'], zero_division=0))
        w_prec.append(precision_score(df['W_true'], df['W_pred'], zero_division=0))
        w_rec.append(recall_score(df['W_true'], df['W_pred'], zero_division=0))
        
        # Causal estimation
        A, Y, U = df['A'].values, df['Y'].values, df['U'].values
        naive = np.cov(A, Y)[0,1] / np.var(A) if np.var(A) > 0 else 0
        W_num = df['W_pred'].astype(float).values
        if np.std(W_num) > 0:
            gamma = np.cov(Y, W_num)[0,1] / np.var(W_num)
            Y_adj = Y - gamma * (W_num - W_num.mean())
            proximal = np.cov(A, Y_adj)[0,1] / np.var(A) if np.var(A) > 0 else 0
        else:
            proximal = naive
        
        naive_bias = abs(naive - 0.5)
        prox_bias = abs(proximal - 0.5)
        reduction = (naive_bias - prox_bias) / naive_bias if naive_bias > 0 else 0
        bias_red.append(reduction)
    
    return {
        'z_precision': np.mean(z_prec), 'z_recall': np.mean(z_rec),
        'w_precision': np.mean(w_prec), 'w_recall': np.mean(w_rec),
        'bias_reduction': np.mean(bias_red)
    }

print("Running Monte Carlo (100 sims)...")
proxy_results = run_proxy_tests()

z_passed = proxy_results['z_precision'] >= 0.80 and proxy_results['z_recall'] >= 0.75
w_passed = proxy_results['w_precision'] >= 0.80 and proxy_results['w_recall'] >= 0.75
causal_passed = proxy_results['bias_reduction'] >= 0.30

ALL_RESULTS['tests']['L1-PROXY-1'] = {'name': 'Treatment Proxy (Z)', 'precision': proxy_results['z_precision'], 'recall': proxy_results['z_recall'], 'passed': z_passed}
ALL_RESULTS['tests']['L1-PROXY-2'] = {'name': 'Outcome Proxy (W)', 'precision': proxy_results['w_precision'], 'recall': proxy_results['w_recall'], 'passed': w_passed}
ALL_RESULTS['tests']['L1-PROXY-3'] = {'name': 'Causal Bias Reduction', 'bias_reduction': proxy_results['bias_reduction'], 'passed': causal_passed}

print(f"L1-PROXY-1: P={proxy_results['z_precision']:.3f}, R={proxy_results['z_recall']:.3f} | {'PASS ✓' if z_passed else 'FAIL ✗'}")
print(f"L1-PROXY-2: P={proxy_results['w_precision']:.3f}, R={proxy_results['w_recall']:.3f} | {'PASS ✓' if w_passed else 'FAIL ✗'}")
print(f"L1-PROXY-3: Bias Red={proxy_results['bias_reduction']:.1%} | {'PASS ✓' if causal_passed else 'FAIL ✗'}")

Running Monte Carlo (100 sims)...
L1-PROXY-1: P=1.000, R=1.000 | PASS ✓
L1-PROXY-2: P=1.000, R=1.000 | PASS ✓
L1-PROXY-3: Bias Red=66.6% | PASS ✓


## Final Summary

In [14]:
passed_tests = sum(1 for t in ALL_RESULTS['tests'].values() if t.get('passed', False))
total_tests = len(ALL_RESULTS['tests'])
ALL_RESULTS['summary'] = {'passed': passed_tests, 'total': total_tests, 'pass_rate': passed_tests/total_tests}

print("\n" + "="*70)
print("AEGIS 3.0 LAYER 1 TEST SUMMARY")
print("="*70)
print(f"\nTests Passed: {passed_tests}/{total_tests} ({passed_tests/total_tests:.0%})")
print("\n" + "-"*70)
for test_id, test_data in ALL_RESULTS['tests'].items():
    status = "✓ PASS" if test_data['passed'] else "✗ FAIL"
    print(f"{test_id}: {test_data['name']} - {status}")
print("-"*70)

print("\nResults JSON:")
print(json.dumps(ALL_RESULTS, indent=2, default=str))


AEGIS 3.0 LAYER 1 TEST SUMMARY

Tests Passed: 6/6 (100%)

----------------------------------------------------------------------
L1-SEM-1: Concept Extraction Accuracy - ✓ PASS
L1-SEM-2: Semantic Entropy Calibration - ✓ PASS
L1-SEM-3: HITL Trigger Performance - ✓ PASS
L1-PROXY-1: Treatment Proxy (Z) - ✓ PASS
L1-PROXY-2: Outcome Proxy (W) - ✓ PASS
L1-PROXY-3: Causal Bias Reduction - ✓ PASS
----------------------------------------------------------------------

Results JSON:
{
  "timestamp": "2025-12-22T09:50:17.455068",
  "tests": {
    "L1-SEM-1": {
      "name": "Concept Extraction Accuracy",
      "precision": 0.9,
      "recall": 0.9,
      "f1": 0.9,
      "passed": true
    },
    "L1-SEM-2": {
      "name": "Semantic Entropy Calibration",
      "spearman_rho": 0.775636307780025,
      "auc_roc": 0.8762110726643598,
      "passed": "True"
    },
    "L1-SEM-3": {
      "name": "HITL Trigger Performance",
      "error_capture_rate": 1.0,
      "false_alarm_rate": 0.4705882352941176