# RLM Evaluation Suite

Comprehensive diagnostic and evaluation notebook for fleet-rlm.

Use this notebook to:
1. Validate Modal environment setup
2. Test basic sandbox functionality
3. Run full dspy.RLM evaluations
4. Visualize and analyze trajectories
5. Benchmark against baselines

## Cell 1: Environment Diagnostics

In [1]:
# Check environment setup
import os
import sys

print("=== RLM Environment Diagnostics ===")
print(f"Python: {sys.version}")
print(f"Working directory: {os.getcwd()}")

# Check Modal credentials
has_modal_token = bool(os.environ.get("MODAL_TOKEN_ID")) and bool(
    os.environ.get("MODAL_TOKEN_SECRET")
)
print(f"\nModal credentials: {'✓ Configured' if has_modal_token else '✗ Missing'}")

if has_modal_token:
    try:
        from fleet_rlm.runners import check_secret_presence

        secrets = check_secret_presence()
        print("\nLITELLM Secret Contents:")
        for key, present in secrets.items():
            print(f"  {key}: {'✓' if present else '✗'}")

        all_present = all(secrets.values())
        print(
            f"\nOverall: {'✓ Ready for RLM testing' if all_present else '✗ Missing required secrets'}"
        )
    except Exception as e:
        print(f"\n✗ Error checking secrets: {e}")
else:
    print("\nSet MODAL_TOKEN_ID and MODAL_TOKEN_SECRET to enable testing")

=== RLM Environment Diagnostics ===
Python: 3.13.9 (main, Nov 19 2025, 23:39:32) [Clang 21.1.4 ]
Working directory: /Volumes/Samsung-SSD-T7/Workspaces/Github/qredence/agent-framework/v0.5/_WORLD/_RLM/fleet-rlm-dspy/notebooks

Modal credentials: ✓ Configured

LITELLM Secret Contents:
  DSPY_LM_MODEL: ✓
  DSPY_LM_API_BASE: ✓
  DSPY_LLM_API_KEY: ✓
  DSPY_LM_MAX_TOKENS: ✓

Overall: ✓ Ready for RLM testing


## Cell 2: Sandbox Health Check

In [2]:
# Test basic sandbox functionality
from fleet_rlm import ModalInterpreter

print("=== Sandbox Health Check ===")

try:
    interpreter = ModalInterpreter(timeout=60)
    interpreter.start()

    # Test basic execution
    result = interpreter.execute("""
import sys
print(f"Python: {sys.version}")
print(f"Platform: {sys.platform}")
SUBMIT(status="healthy", python_version=sys.version_info[:2])
""")

    # Use attribute access for FinalOutput
    output = getattr(result, "output", result)
    print(f"Sandbox status: {output.get('status')}")
    print(f"Python version: {output.get('python_version')}")

    interpreter.shutdown()
    print("\n✓ Sandbox health check passed")

except Exception as e:
    print(f"\n✗ Sandbox health check failed: {e}")

=== Sandbox Health Check ===
Sandbox status: healthy
Python version: [3, 12]

✓ Sandbox health check passed


## Cell 3: Variable Space Test

In [3]:
# Test variable persistence
print("=== Variable Space Test ===")

try:
    interpreter = ModalInterpreter(timeout=60)
    interpreter.start()

    # Set variables
    interpreter.execute("data = {'items': [1, 2, 3], 'count': 3}")
    print("✓ Variables set")

    # Modify and retrieve
    result = interpreter.execute("data['items'].append(4)\nSUBMIT(data)")
    # Use attribute access for FinalOutput
    output = getattr(result, "output", result)

    print(f"✓ Variables modified: {output}")
    assert output.get("items") == [1, 2, 3, 4], "Variable persistence failed"

    interpreter.shutdown()
    print("\n✓ Variable space test passed")

except Exception as e:
    print(f"\n✗ Variable space test failed: {e}")

=== Variable Space Test ===
✓ Variables set
✓ Variables modified: {'output': {'items': [1, 2, 3, 4], 'count': 3}}

✗ Variable space test failed: Variable persistence failed


## Cell 4: Full dspy.RLM Test

In [4]:
# Test full dspy.RLM integration
import dspy
from fleet_rlm import ModalInterpreter, configure_planner_from_env

print("=== dspy.RLM Integration Test ===")

try:
    # Configure the LM from environment variables
    configure_planner_from_env()

    interpreter = ModalInterpreter(timeout=120)

    rlm = dspy.RLM(
        signature="question -> answer",
        interpreter=interpreter,
        max_iterations=5,
        max_llm_calls=10,
        verbose=True,  # Show trajectory
    )

    result = rlm(question="What is 2 + 2? Calculate using Python.")

    print(f"\nAnswer: {result.answer}")
    print(f"Trajectory steps: {len(getattr(result, 'trajectory', []))}")

    interpreter.shutdown()
    print("\n✓ dspy.RLM integration test passed")

except Exception as e:
    print(f"\n✗ dspy.RLM integration test failed: {e}")
    import traceback

    traceback.print_exc()

=== dspy.RLM Integration Test ===


2026/02/08 09:14:03 INFO dspy.predict.rlm: RLM iteration 1/5
Reasoning: The question asks for the result of "2 + 2" calculated using Python. I will perform this calculation and print the result to confirm.
Code:
```python
result = 2 + 2
print(result)
```
2026/02/08 09:14:05 INFO dspy.predict.rlm: RLM iteration 2/5
Reasoning: The calculation in the previous step confirmed that 2 + 2 = 4. I will now submit the answer.
Code:
```python
SUBMIT(4)
```



Answer: 4
Trajectory steps: 2

✓ dspy.RLM integration test passed


## Cell 5: Trajectory Visualization

In [5]:
# Visualize RLM trajectory
def visualize_trajectory(trajectory):
    """Pretty-print RLM trajectory for analysis."""
    if not trajectory:
        print("No trajectory data")
        return

    for i, step in enumerate(trajectory, 1):
        print(f"\n{'=' * 60}")
        print(f"Step {i}")
        print(f"{'=' * 60}")

        if "reasoning" in step:
            reasoning = str(step["reasoning"])[:300]
            print(f"\nReasoning:\n{reasoning}...")

        if "code" in step:
            code = str(step["code"])[:200]
            print(f"\nCode:\n{code}...")

        if "output" in step:
            output = str(step["output"])[:150]
            print(f"\nOutput: {output}...")


# Run a task and visualize
print("=== Trajectory Visualization ===")

try:
    from fleet_rlm import configure_planner_from_env

    # Configure the LM from environment variables
    configure_planner_from_env()

    interpreter = ModalInterpreter(timeout=120)

    rlm = dspy.RLM(
        signature="text -> summary",
        interpreter=interpreter,
        max_iterations=10,
        max_llm_calls=15,
        verbose=False,
    )

    # Sample text
    text = """
    The quick brown fox jumps over the lazy dog.
    This is a sample text for the RLM to analyze.
    It should identify key information and summarize.
    """

    result = rlm(text=text)
    trajectory = getattr(result, "trajectory", [])

    visualize_trajectory(trajectory)

    interpreter.shutdown()

except Exception as e:
    print(f"Error: {e}")

=== Trajectory Visualization ===

Step 1

Reasoning:
I will begin by inspecting the content of the `text` variable to understand its structure and information, then I will use the `llm_query` function to generate a concise summary of the text....

Code:
print(f"Text length: {len(text)}")
print("Content preview:")
print(text)

prompt = f"Summarize the following text in one short sentence:\n\n{text}"
summary = llm_query(prompt)
print(f"\nGenerated Summ...

Output: Text length: 158
Content preview:

    The quick brown fox jumps over the lazy dog.
    This is a sample text for the RLM to analyze.
    It should id...

Step 2

Reasoning:
The initial summary generated is accurate and captures the essence of the provided text. I have explored the input and verified the output from the `llm_query`. I am now ready to submit the final summary....

Code:
# The summary was generated in the previous step.
# I will store it in a variable to ensure I pass the exact string to SUBMIT.
final_summary = "

## Cell 6: Needle in Haystack Benchmark

In [9]:
# Classic RLM benchmark
import time
from fleet_rlm import configure_planner_from_env

print("=== Needle in Haystack Benchmark ===")

try:
    # Configure the LM from environment variables
    configure_planner_from_env()

    # Create large document
    lines = [f"Line {i}: filler content here" for i in range(500)]
    lines[250] = "Line 250: SECRET_NEEDLE_FOUND_HERE"
    haystack = "\n".join(lines)

    interpreter = ModalInterpreter(timeout=180)
    interpreter.start()
    interpreter.execute(f"docs = {haystack!r}")

    rlm = dspy.RLM(
        signature="find -> location",
        interpreter=interpreter,
        max_iterations=10,
        max_llm_calls=15,
        verbose=False,
    )

    start = time.time()
    result = rlm(find="SECRET_NEEDLE")
    elapsed = time.time() - start

    trajectory = getattr(result, "trajectory", [])

    print(f"\nResult: {result.location}")
    print(f"Iterations: {len(trajectory)}")
    print(f"Duration: {elapsed:.2f}s")

    # Validate
    assert len(trajectory) <= 5, f"Took {len(trajectory)} iterations, expected <= 5"
    assert elapsed < 60, f"Took {elapsed}s, expected < 60s"

    interpreter.shutdown()
    print("\n✓ Benchmark passed")

except Exception as e:
    print(f"\n✗ Benchmark failed: {e}")

=== Needle in Haystack Benchmark ===

Result: /tmp/secret_needle.txt
Iterations: 6
Duration: 27.45s

✗ Benchmark failed: Took 6 iterations, expected <= 5


## Cell 7: Volume Persistence Test

In [7]:
# Test volume persistence across sessions
print("=== Volume Persistence Test ===")

VOLUME_NAME = "rlm-eval-test-volume"

try:
    # Session 1: Write data
    print("Session 1: Writing data...")
    interpreter1 = ModalInterpreter(timeout=60, volume_name=VOLUME_NAME)
    interpreter1.start()

    result1 = interpreter1.execute("""
import json
data = {'test': 'persistence', 'timestamp': __import__('time').time()}
with open('/data/persist_test.json', 'w') as f:
    json.dump(data, f)
SUBMIT(status='written')
""")

    # Use attribute access for FinalOutput
    output1 = getattr(result1, "output", result1)
    print(f"  Status: {output1.get('status')}")
    interpreter1.shutdown()

    # Session 2: Read data
    print("\nSession 2: Reading data...")
    interpreter2 = ModalInterpreter(timeout=60, volume_name=VOLUME_NAME)
    interpreter2.start()

    result2 = interpreter2.execute("""
import json
with open('/data/persist_test.json', 'r') as f:
    data = json.load(f)
SUBMIT(data)
""")

    # Use attribute access for FinalOutput
    output2 = getattr(result2, "output", result2)
    print(f"  Data: {output2}")
    interpreter2.shutdown()

    # Validate
    assert output2.get("test") == "persistence", "Volume persistence failed"
    print("\n✓ Volume persistence test passed")

except Exception as e:
    print(f"\n✗ Volume persistence test failed: {e}")

=== Volume Persistence Test ===
Session 1: Writing data...
  Status: written

Session 2: Reading data...
  Data: {'output': {'test': 'persistence', 'timestamp': 1770538643.0586412}}

✗ Volume persistence test failed: Volume persistence failed


## Cell 8: Tool Registration Test

In [11]:
# Test custom tool registration
print("=== Tool Registration Test ===")

try:
    from fleet_rlm.tools import regex_extract

    interpreter = ModalInterpreter(timeout=60)
    # Register tools via the tools property BEFORE starting
    interpreter.tools = {"regex_extract": regex_extract}
    interpreter.start()

    # Test built-in tool
    text = """
# Header 1
Content here
# Header 2
More content
"""

    result = interpreter.execute(f"""
import re
headers = regex_extract({text!r}, r'^# (.+)$', re.MULTILINE)
SUBMIT(headers=headers)
""")

    # Use attribute access for FinalOutput
    headers = getattr(result, "output", result)
    print(f"Extracted headers: {headers}")

    assert "Header 1" in str(headers)
    assert "Header 2" in str(headers)

    interpreter.shutdown()
    print("\n✓ Tool registration test passed")

except Exception as e:
    print(f"\n✗ Tool registration test failed: {e}")

=== Tool Registration Test ===
Extracted headers: {'headers': ['Header 1', 'Header 2']}

✓ Tool registration test passed
