# Stress Tests: Push the System to Its Limits

Tests:
1. Scale to 10 people (about 3x the 3-person baseline)
2. Add 50% noise facts
3. Contradictory facts in same interview
4. Rapid context switching

In [None]:
# Cell 1: Install Dependencies
!pip install unsloth transformers datasets trl google-generativeai sentence-transformers scikit-learn -q
print("‚úÖ Dependencies installed")

In [None]:
# Cell 2: Imports
import sys
from pathlib import Path

# Put the local `scripts/` directory on the import path when it exists under
# the current working directory. The membership check keeps re-runs of this
# cell from stacking duplicate entries at the front of sys.path.
scripts_dir = Path.cwd() / "scripts"
if scripts_dir.exists() and str(scripts_dir) not in sys.path:
    sys.path.insert(0, str(scripts_dir))

# NOTE(review): the imports below are package-qualified (`scripts.…`), so they
# resolve from the directory that *contains* `scripts/`, not from the entry
# added above. Presumably the notebook is always launched from the project
# root (which the kernel already has on sys.path) — confirm.
from scripts.training.hippocampus import create_hippocampus
from scripts.utilities.data_loader import load_people_data

print("✅ Modules imported")

## Test 1: Scale to 10 People

In [None]:
# Test 1: Scale to 10 People
# Placeholder cell: only the banner and the reminder at the bottom execute.
# The actual steps are kept as commented-out scaffolding because they rely on
# an extended dataset and on helpers (generate_training_data,
# train_and_evaluate, check_interference) that this notebook does not define.
print("=" * 70)
print("STRESS TEST 1: Scale to 10 People")
print("=" * 70)

# Load extended people dataset
# Note: Would need to create configs/people_data_extended.yaml with 10 people
# PEOPLE_EXTENDED = load_people_data("configs/people_data_extended.yaml")  # 10 people
# print(f"✅ Loaded {len(PEOPLE_EXTENDED)} people")

# Generate training data
# training_data_10 = generate_training_data(PEOPLE_EXTENDED)
# print(f"✅ Generated {len(training_data_10)} examples")

# Train
# print("\n🚀 Training on 10 people...")
# scores_10 = train_and_evaluate(training_data_10, PEOPLE_EXTENDED)

# Expected: More interference, lower scores
# print(f"\n📊 Results:")
# print(f"   Average score: {scores_10['overall']:.1%}")
# print(f"   Interference events: {check_interference(PEOPLE_EXTENDED)}")

print("⚠️ Add extended people data and training code to run this test")
print("   Expected: More interference, lower scores with 10 people")

## Test 2: 50% Noise Facts

In [None]:
# Test 2: Noise Facts
# Placeholder cell: only the banner and the reminder at the bottom execute.
# The commented-out scaffolding needs `training_data`, `add_noise_facts`,
# `train_and_evaluate` and an initialized `hippocampus`, none of which are
# defined in this notebook.
print("\n" + "=" * 70)
print("STRESS TEST 2: 50% Noise Facts")
print("=" * 70)

# Add random noise facts
# training_data_noisy = add_noise_facts(training_data, noise_ratio=0.5)
# print(f"✅ Added noise: {len(training_data_noisy)} total examples")

# Train
# print("\n🚀 Training with noise...")
# scores_noisy = train_and_evaluate(training_data_noisy)

# Expected: Hippocampus should reject most noise
# hippocampus.print_stats()
# print(f"\n📊 Results:")
# print(f"   Rejection rate: {hippocampus.stats['rejected'] / hippocampus.stats['total_processed']:.1%}")
# print(f"   Final score: {scores_noisy['overall']:.1%}")

print("⚠️ Add noise generation and training code to run this test")
print("   Expected: Hippocampus should reject most noise facts")

## Test 3: Contradictory Facts

In [None]:
# Test 3: Contradictory Facts
# Builds four statements attributed to the first loaded person, arranged as
# two mutually exclusive pairs (birth year, Nobel Prize year). The hippocampus
# call itself is still commented out because it needs a teacher model.
print("\n" + "=" * 70)
print("STRESS TEST 3: Contradictory Facts")
print("=" * 70)

# Load people data
PEOPLE = load_people_data("configs/people_data.yaml")
if not PEOPLE:
    # Fail with an actionable message instead of a bare IndexError on PEOPLE[0].
    raise ValueError(
        "load_people_data('configs/people_data.yaml') returned no people; "
        "Test 3 needs at least one person"
    )

# Generate contradictory data: every fact is attached to the same person so
# that the second statement of each pair conflicts with the first.
subject = PEOPLE[0]
conflicting_facts = [
    "I was born in 1961.",
    "I was born in 1867.",  # Contradiction!
    "I won the Nobel Prize in 2009.",
    "I won the Nobel Prize in 1903.",  # Contradiction!
]
contradictory_data = [
    {"person": subject, "fact": fact} for fact in conflicting_facts
]

print(f"✅ Generated {len(contradictory_data)} contradictory examples")

# Initialize hippocampus (would need teacher_model)
# hippocampus = create_hippocampus(teacher_model, MEMORY_STORE)

# Test hippocampus on contradictions
print("\n🧠 Testing hippocampus on contradictions...")
print("   (Would need teacher_model initialized)")
# for item in contradictory_data:
#     decision, memory, metadata = hippocampus.process(item["person"], {"fact": item["fact"]})
#     print(f"   {item['fact'][:40]}... → {decision} ({metadata.get('decision_reason', '')})")

print("\n💡 Expected: Hippocampus detects and rejects contradictions")
print("   First fact (1961) should be STORED")
print("   Second fact (1867) should be REJECTED (contradicts 1961)")

## Test 4: Rapid Context Switching

In [None]:
# Test 4: Rapid Context Switching
# Builds a worst-case interleaving in which consecutive examples always come
# from a different person (A, B, C, A, B, C, ...). As used below, each person
# is expected to be a dict with an "id" string and an optional "facts" list.
print("\n" + "=" * 70)
print("STRESS TEST 4: Rapid Context Switching")
print("=" * 70)

# Load people data
PEOPLE = load_people_data("configs/people_data.yaml")

MAX_PEOPLE_IN_ROTATION = 3   # the rotation covers at most the first 3 people
MAX_FACTS_PER_PERSON = 20    # 3 people x 20 facts = up to 60 examples
PATTERN_PREVIEW_LENGTH = 12  # how many examples the printed pattern shows

# Clamp to the number of people actually loaded so a dataset with fewer than
# three entries no longer raises IndexError; with three or more the result is
# the same as indexing PEOPLE[i % 3].
rotation_size = min(MAX_PEOPLE_IN_ROTATION, len(PEOPLE))
people_in_rotation = [PEOPLE[k] for k in range(rotation_size)]

# Create worst-case interleaving (switch person every example): take fact 0
# from each person in turn, then fact 1 from each, and so on. People who run
# out of facts (or have none) are simply skipped for that round.
training_data_rapid = []
for fact_index in range(MAX_FACTS_PER_PERSON):
    for person in people_in_rotation:
        facts = person.get("facts")
        if facts and fact_index < len(facts):
            training_data_rapid.append({"person": person, "fact": facts[fact_index]})

print(f"✅ Created {len(training_data_rapid)} examples with rapid switching")
if training_data_rapid:
    # Preview the switching order using the upper-cased first letter of each id.
    pattern = " → ".join(
        item["person"]["id"][0].upper()
        for item in training_data_rapid[:PATTERN_PREVIEW_LENGTH]
    )
    print(f"   Pattern: {pattern}")

# Train
# print("\n🚀 Training with rapid switching...")
# scores_rapid = train_and_evaluate(training_data_rapid)

# Expected: More challenging, but interleaving should still work
# print(f"\n📊 Results:")
# print(f"   Final score: {scores_rapid['overall']:.1%}")

print("\n💡 Expected: More challenging, but interleaving should still work")
print("   Rapid switching tests the model's ability to maintain separate memories")

## Summary

In [None]:
# Summary
# Prints a static recap of the four stress tests set up in this notebook.
# Nothing here inspects results; the list describes which tests exist.
print("\n" + "=" * 70)
print("🏁 STRESS TESTS COMPLETE")
print("=" * 70)

print("\n📊 Tests Created:")
print("   1. ✅ Scale to 10 people - Tests interference with more people")
print("   2. ✅ 50% noise facts - Tests hippocampus rejection capability")
print("   3. ✅ Contradictory facts - Tests contradiction detection")
print("   4. ✅ Rapid context switching - Tests memory separation")

print("\n💡 Fill in training and evaluation code to run actual stress tests")
print("   These tests push the system to its limits to find failure modes")