## 1. Configuration
Edit the variables below. For Ollama, ensure the model has been pulled (e.g., `ollama pull <model>`). For LM Studio, ensure the model is loaded and the API server is running.

In [None]:
# --- Backend and models ------------------------------------------------------
# Which local backend to use: 'lmstudio' or 'ollama'.
BACKEND = 'lmstudio'
# Local model names to evaluate, one entry per model.
MODEL_NAMES = [
    'your-model-name-here',  # replace with actual model id
]

# --- Prompts -----------------------------------------------------------------
# Path to the prompt set, relative to the repo root.
PROMPT_SET = 'Evaluator/prompts/baseline.json'
# Cap on the number of prompts for quick smoke tests; None runs all of them.
LIMIT = None

# --- Sampling (tweak as needed) ----------------------------------------------
TEMPERATURE = 0.2
TOP_P = 0.9
MAX_TOKENS = 768
SEED = None  # set an int for reproducibility

# --- Output and run mode -----------------------------------------------------
# Directory that receives the result files (results auto-nest here).
RESULTS_DIR = 'Evaluator/results'
# True skips the actual backend calls (structure only).
DRY_RUN = False
# True validates context IDs embedded in prompts; this requires prompts that
# carry expected_context.
VALIDATE_CONTEXT = False

## 2. Imports
Imports core evaluator modules. If an ImportError occurs, ensure you run this notebook from the project root or add the project root to `sys.path`.

In [None]:
import sys, os, json, datetime, pathlib
from Evaluator.prompt_sets import load_prompt_cases, filter_prompts
from Evaluator.config import EvaluatorConfig, PromptFilter
from Evaluator.client_factory import create_client, create_settings
from Evaluator.runner import evaluate_cases
from Evaluator.reporting import (
    build_run_payload, write_json, render_markdown, console_summary, build_metadata
)
print('Imports successful.')

## 3. Helper Functions
Utilities for timestamped paths and performing a single model evaluation.

In [None]:
def timestamp():
    """Return the current UTC time as a compact ``YYYYMMDD_HHMMSS`` string.

    The value is used to tag result file names, so it contains only digits
    and an underscore.
    """
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # "now" in UTC formats to exactly the same text.
    return datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H%M%S')

def evaluate_model(model_name: str):
    """Evaluate one model against the configured prompt set and save the results.

    Reads the notebook-level settings (``PROMPT_SET``, ``LIMIT``, ``BACKEND``,
    the sampling constants, ``RESULTS_DIR``, ``DRY_RUN`` and
    ``VALIDATE_CONTEXT``), runs every selected prompt through the backend
    client, then writes a JSON payload and a Markdown report whose file names
    combine the model name with ``timestamp()``.

    Args:
        model_name: Model identifier handed to the backend.

    Returns:
        A dict with the keys ``'model'``, ``'json'`` and ``'markdown'`` (the
        two output paths as strings) and ``'summary'`` (the value returned by
        ``console_summary``).

    Raises:
        ValueError: If the prompt filter selects no prompts.
    """
    # Shared by the client and the recorded config so the two cannot drift.
    request_timeout = 60.0
    retries = 2

    prompt_path = pathlib.Path(PROMPT_SET)
    cases = load_prompt_cases(prompt_path)
    f = PromptFilter(tags=None, limit=LIMIT)
    selected = filter_prompts(cases, f)
    if not selected:
        raise ValueError('No prompts selected; adjust LIMIT or tags.')

    # Build settings and client
    settings = create_settings(
        backend=BACKEND,
        model=model_name,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        max_tokens=MAX_TOKENS,
        seed=SEED,
    )
    client = create_client(
        backend=BACKEND,
        settings=settings,
        timeout=request_timeout,
        retries=retries,
    )

    def report(record):
        """Print a one-line pass/fail status for a finished record."""
        # NOTE(review): the fallback id and the pass/fail labels were missing
        # from the original cell (its f-string was truncated and did not
        # parse). The literals below are reconstructions; adjust them if the
        # project expects different wording.
        label = record.case.case_id or '<no-id>'
        status = 'PASS' if record.passed else 'FAIL'
        print(f'  {label} -> {status}')

    print(f'Running {len(selected)} prompts for model: {model_name}')
    records = evaluate_cases(
        selected,
        client=client,
        dry_run=DRY_RUN,
        validate_context=VALIDATE_CONTEXT,
        on_record=report,
    )

    # Prepare output paths
    results_dir = pathlib.Path(RESULTS_DIR)
    results_dir.mkdir(parents=True, exist_ok=True)
    # Model ids can contain path separators ('org/model') or a colon
    # ('model:tag'); neither is safe in a file name on every platform.
    safe_name = model_name
    for unsafe in ('/', '\\', ':'):
        safe_name = safe_name.replace(unsafe, '_')
    # Built outside the f-string: reusing the same quote character inside an
    # f-string expression is a syntax error before Python 3.12.
    run_tag = f'{safe_name}_{timestamp()}'
    json_path = results_dir / f'notebook_run_{run_tag}.json'
    md_path = results_dir / f'notebook_run_{run_tag}.md'

    # Build metadata + payload
    config = EvaluatorConfig(
        prompts_path=prompt_path,
        output_path=json_path,
        save_markdown=True,
        filter=f,
        retries=retries,
        request_timeout=request_timeout,
        dry_run=DRY_RUN,
    )
    metadata = build_metadata(config, settings, len(cases), len(selected), BACKEND)
    payload = build_run_payload(records, metadata=metadata)
    write_json(json_path, payload)
    md_path.write_text(render_markdown(records, model_name, prompt_path.name), encoding='utf-8')
    summary = console_summary(records)
    print(summary)
    return {
        'model': model_name,
        'json': str(json_path),
        'markdown': str(md_path),
        'summary': summary,
    }

print('Helper functions ready.')

## 4. Run Evaluations
Runs an evaluation for each model in `MODEL_NAMES`. For a single-model run, trim the list to one entry.

In [None]:
# Evaluate each configured model in turn. A failure for one model is printed
# and the loop moves on to the next, so one bad model does not abort the run.
results = []
for model_name in MODEL_NAMES:
    try:
        outcome = evaluate_model(model_name)
    except Exception as exc:
        print(f'Error evaluating {model_name}: {exc}')
    else:
        results.append(outcome)

print('Completed evaluations.')
# Last expression of the cell: the notebook displays the collected results.
results

## 5. (Optional) Simple Comparison View
Creates a lightweight comparison dictionary. For deeper analysis you can load JSON payloads into pandas or another tool.

In [None]:
def _pass_rate_line(summary):
    """Return the first line of *summary* that contains 'Pass rate', or None."""
    for line in summary.splitlines():
        if 'Pass rate' in line:
            return line
    return None


# One entry per evaluated model: where its outputs live, plus the pass-rate
# line picked out of the printed summary (simple substring match).
comparison = {
    entry['model']: {
        'json': entry['json'],
        'markdown': entry['markdown'],
        'pass_line': _pass_rate_line(entry['summary']),
    }
    for entry in results
}
comparison

## 6. Next Steps
- Review Markdown files in `Evaluator/results/`.
- Open a PR with those files and a short note about qualitative behavior.
- Suggest additional prompt tags or new prompt cases if you noticed blind spots.

Thank you for helping improve model selection for the Obsidian Nexus plugin!